]>
Commit | Line | Data |
---|---|---|
9f95a23c TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #pragma once | |
5 | ||
#include <cstdint>
#include <functional>
#include <mutex>
#include <string_view>

#include <boost/intrusive/avl_set.hpp>

#include "Allocator.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"
12 | ||
13 | struct range_seg_t { | |
14 | MEMPOOL_CLASS_HELPERS(); ///< memory monitoring | |
15 | uint64_t start; ///< starting offset of this segment | |
16 | uint64_t end; ///< ending offset (non-inclusive) | |
17 | ||
18 | range_seg_t(uint64_t start, uint64_t end) | |
19 | : start{start}, | |
20 | end{end} | |
21 | {} | |
22 | // Tree is sorted by offset, greater offsets at the end of the tree. | |
23 | struct before_t { | |
24 | template<typename KeyLeft, typename KeyRight> | |
25 | bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const { | |
26 | return lhs.end <= rhs.start; | |
27 | } | |
28 | }; | |
29 | boost::intrusive::avl_set_member_hook<> offset_hook; | |
30 | ||
31 | // Tree is sorted by size, larger sizes at the end of the tree. | |
32 | struct shorter_t { | |
33 | template<typename KeyType> | |
34 | bool operator()(const range_seg_t& lhs, const KeyType& rhs) const { | |
35 | auto lhs_size = lhs.end - lhs.start; | |
36 | auto rhs_size = rhs.end - rhs.start; | |
37 | if (lhs_size < rhs_size) { | |
38 | return true; | |
39 | } else if (lhs_size > rhs_size) { | |
40 | return false; | |
41 | } else { | |
42 | return lhs.start < rhs.start; | |
43 | } | |
44 | } | |
45 | }; | |
e306af50 TL |
46 | inline uint64_t length() const { |
47 | return end - start; | |
48 | } | |
9f95a23c TL |
49 | boost::intrusive::avl_set_member_hook<> size_hook; |
50 | }; | |
51 | ||
e306af50 TL |
52 | class AvlAllocator : public Allocator { |
53 | struct dispose_rs { | |
54 | void operator()(range_seg_t* p) | |
55 | { | |
56 | delete p; | |
57 | } | |
58 | }; | |
59 | ||
60 | protected: | |
61 | /* | |
62 | * ctor intended for the usage from descendant class(es) which | |
63 | * provides handling for spilled over entries | |
64 | * (when entry count >= max_entries) | |
65 | */ | |
66 | AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size, | |
67 | uint64_t max_mem, | |
20effc67 | 68 | std::string_view name); |
e306af50 | 69 | |
9f95a23c TL |
70 | public: |
71 | AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size, | |
20effc67 | 72 | std::string_view name); |
e306af50 | 73 | ~AvlAllocator(); |
f67539c2 TL |
74 | const char* get_type() const override |
75 | { | |
76 | return "avl"; | |
77 | } | |
9f95a23c TL |
78 | int64_t allocate( |
79 | uint64_t want, | |
80 | uint64_t unit, | |
81 | uint64_t max_alloc_size, | |
82 | int64_t hint, | |
e306af50 TL |
83 | PExtentVector *extents) override; |
84 | void release(const interval_set<uint64_t>& release_set) override; | |
e306af50 TL |
85 | uint64_t get_free() override; |
86 | double get_fragmentation() override; | |
87 | ||
88 | void dump() override; | |
1e59de90 TL |
89 | void foreach( |
90 | std::function<void(uint64_t offset, uint64_t length)> notify) override; | |
e306af50 TL |
91 | void init_add_free(uint64_t offset, uint64_t length) override; |
92 | void init_rm_free(uint64_t offset, uint64_t length) override; | |
93 | void shutdown() override; | |
9f95a23c TL |
94 | |
95 | private: | |
20effc67 TL |
96 | // pick a range by search from cursor forward |
97 | uint64_t _pick_block_after( | |
98 | uint64_t *cursor, | |
99 | uint64_t size, | |
100 | uint64_t align); | |
101 | // pick a range with exactly the same size or larger | |
102 | uint64_t _pick_block_fits( | |
103 | uint64_t size, | |
9f95a23c | 104 | uint64_t align); |
9f95a23c TL |
105 | int _allocate( |
106 | uint64_t size, | |
107 | uint64_t unit, | |
108 | uint64_t *offset, | |
109 | uint64_t *length); | |
110 | ||
111 | using range_tree_t = | |
112 | boost::intrusive::avl_set< | |
113 | range_seg_t, | |
114 | boost::intrusive::compare<range_seg_t::before_t>, | |
115 | boost::intrusive::member_hook< | |
116 | range_seg_t, | |
117 | boost::intrusive::avl_set_member_hook<>, | |
118 | &range_seg_t::offset_hook>>; | |
119 | range_tree_t range_tree; ///< main range tree | |
120 | /* | |
121 | * The range_size_tree should always contain the | |
122 | * same number of segments as the range_tree. | |
123 | * The only difference is that the range_size_tree | |
124 | * is ordered by segment sizes. | |
125 | */ | |
126 | using range_size_tree_t = | |
127 | boost::intrusive::avl_multiset< | |
128 | range_seg_t, | |
129 | boost::intrusive::compare<range_seg_t::shorter_t>, | |
130 | boost::intrusive::member_hook< | |
131 | range_seg_t, | |
132 | boost::intrusive::avl_set_member_hook<>, | |
e306af50 TL |
133 | &range_seg_t::size_hook>, |
134 | boost::intrusive::constant_time_size<true>>; | |
9f95a23c TL |
135 | range_size_tree_t range_size_tree; |
136 | ||
9f95a23c TL |
137 | uint64_t num_free = 0; ///< total bytes in freelist |
138 | ||
139 | /* | |
140 | * This value defines the number of elements in the ms_lbas array. | |
141 | * The value of 64 was chosen as it covers all power of 2 buckets | |
142 | * up to UINT64_MAX. | |
143 | * This is the equivalent of highest-bit of UINT64_MAX. | |
144 | */ | |
145 | static constexpr unsigned MAX_LBAS = 64; | |
146 | uint64_t lbas[MAX_LBAS] = {0}; | |
147 | ||
148 | /* | |
149 | * Minimum size which forces the dynamic allocator to change | |
150 | * it's allocation strategy. Once the allocator cannot satisfy | |
151 | * an allocation of this size then it switches to using more | |
152 | * aggressive strategy (i.e search by size rather than offset). | |
153 | */ | |
154 | uint64_t range_size_alloc_threshold = 0; | |
155 | /* | |
156 | * The minimum free space, in percent, which must be available | |
157 | * in allocator to continue allocations in a first-fit fashion. | |
158 | * Once the allocator's free space drops below this level we dynamically | |
159 | * switch to using best-fit allocations. | |
160 | */ | |
161 | int range_size_alloc_free_pct = 0; | |
20effc67 TL |
162 | /* |
163 | * Maximum number of segments to check in the first-fit mode, without this | |
164 | * limit, fragmented device can see lots of iterations and _block_picker() | |
165 | * becomes the performance limiting factor on high-performance storage. | |
166 | */ | |
167 | const uint32_t max_search_count; | |
168 | /* | |
169 | * Maximum distance to search forward from the last offset, without this | |
170 | * limit, fragmented device can see lots of iterations and _block_picker() | |
171 | * becomes the performance limiting factor on high-performance storage. | |
172 | */ | |
173 | const uint32_t max_search_bytes; | |
e306af50 TL |
174 | /* |
175 | * Max amount of range entries allowed. 0 - unlimited | |
176 | */ | |
177 | uint64_t range_count_cap = 0; | |
178 | ||
179 | void _range_size_tree_rm(range_seg_t& r) { | |
180 | ceph_assert(num_free >= r.length()); | |
181 | num_free -= r.length(); | |
182 | range_size_tree.erase(r); | |
183 | ||
184 | } | |
185 | void _range_size_tree_try_insert(range_seg_t& r) { | |
186 | if (_try_insert_range(r.start, r.end)) { | |
187 | range_size_tree.insert(r); | |
188 | num_free += r.length(); | |
189 | } else { | |
190 | range_tree.erase_and_dispose(r, dispose_rs{}); | |
191 | } | |
192 | } | |
193 | bool _try_insert_range(uint64_t start, | |
194 | uint64_t end, | |
195 | range_tree_t::iterator* insert_pos = nullptr) { | |
196 | bool res = !range_count_cap || range_size_tree.size() < range_count_cap; | |
197 | bool remove_lowest = false; | |
198 | if (!res) { | |
199 | if (end - start > _lowest_size_available()) { | |
200 | remove_lowest = true; | |
201 | res = true; | |
202 | } | |
203 | } | |
204 | if (!res) { | |
205 | _spillover_range(start, end); | |
206 | } else { | |
207 | // NB: we should do insertion before the following removal | |
208 | // to avoid potential iterator disposal insertion might depend on. | |
209 | if (insert_pos) { | |
210 | auto new_rs = new range_seg_t{ start, end }; | |
211 | range_tree.insert_before(*insert_pos, *new_rs); | |
212 | range_size_tree.insert(*new_rs); | |
213 | num_free += new_rs->length(); | |
214 | } | |
215 | if (remove_lowest) { | |
216 | auto r = range_size_tree.begin(); | |
217 | _range_size_tree_rm(*r); | |
218 | _spillover_range(r->start, r->end); | |
219 | range_tree.erase_and_dispose(*r, dispose_rs{}); | |
220 | } | |
221 | } | |
222 | return res; | |
223 | } | |
224 | virtual void _spillover_range(uint64_t start, uint64_t end) { | |
225 | // this should be overriden when range count cap is present, | |
226 | // i.e. (range_count_cap > 0) | |
227 | ceph_assert(false); | |
228 | } | |
229 | protected: | |
230 | // called when extent to be released/marked free | |
231 | virtual void _add_to_tree(uint64_t start, uint64_t size); | |
232 | ||
233 | protected: | |
9f95a23c TL |
234 | CephContext* cct; |
235 | std::mutex lock; | |
e306af50 TL |
236 | |
237 | double _get_fragmentation() const { | |
20effc67 | 238 | auto free_blocks = p2align(num_free, (uint64_t)block_size) / block_size; |
e306af50 TL |
239 | if (free_blocks <= 1) { |
240 | return .0; | |
241 | } | |
242 | return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1)); | |
243 | } | |
244 | void _dump() const; | |
1e59de90 | 245 | void _foreach(std::function<void(uint64_t offset, uint64_t length)>) const; |
e306af50 TL |
246 | |
247 | uint64_t _lowest_size_available() { | |
248 | auto rs = range_size_tree.begin(); | |
249 | return rs != range_size_tree.end() ? rs->length() : 0; | |
250 | } | |
251 | ||
252 | int64_t _allocate( | |
253 | uint64_t want, | |
254 | uint64_t unit, | |
255 | uint64_t max_alloc_size, | |
256 | int64_t hint, | |
257 | PExtentVector *extents); | |
258 | ||
259 | void _release(const interval_set<uint64_t>& release_set); | |
260 | void _release(const PExtentVector& release_set); | |
261 | void _shutdown(); | |
262 | ||
263 | void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs); | |
264 | void _remove_from_tree(uint64_t start, uint64_t size); | |
265 | void _try_remove_from_tree(uint64_t start, uint64_t size, | |
266 | std::function<void(uint64_t offset, uint64_t length, bool found)> cb); | |
267 | ||
268 | uint64_t _get_free() const { | |
269 | return num_free; | |
270 | } | |
9f95a23c | 271 | }; |