// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#pragma once

#include <mutex>
#include <boost/intrusive/avl_set.hpp>

#include "Allocator.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"

struct range_seg_t {
  MEMPOOL_CLASS_HELPERS();  ///< memory monitoring
  uint64_t start;           ///< starting offset of this segment
  uint64_t end;             ///< ending offset (non-inclusive)

  range_seg_t(uint64_t start, uint64_t end)
    : start{start},
      end{end}
  {}
  // Tree is sorted by offset, greater offsets at the end of the tree.
  struct before_t {
    template<typename KeyLeft, typename KeyRight>
    bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
      return lhs.end <= rhs.start;
    }
  };
  boost::intrusive::avl_set_member_hook<> offset_hook;

  // Tree is sorted by size, larger sizes at the end of the tree.
  struct shorter_t {
    template<typename KeyType>
    bool operator()(const range_seg_t& lhs, const KeyType& rhs) const {
      auto lhs_size = lhs.end - lhs.start;
      auto rhs_size = rhs.end - rhs.start;
      if (lhs_size < rhs_size) {
        return true;
      } else if (lhs_size > rhs_size) {
        return false;
      } else {
        return lhs.start < rhs.start;
      }
    }
  };
  inline uint64_t length() const {
    return end - start;
  }
  boost::intrusive::avl_set_member_hook<> size_hook;
};

class AvlAllocator : public Allocator {
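  // disposer for erase_and_dispose(): frees the heap-allocated
  // range_seg_t once it has been unlinked from the intrusive trees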
  struct dispose_rs {
    void operator()(range_seg_t* p)
    {
      delete p;
    }
  };

protected:
  /*
   * ctor intended for use by descendant class(es) that provide
   * handling for spilled-over entries
   * (when entry count >= max_entries)
   */
  AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
               uint64_t max_mem,
               const std::string& name);

public:
  AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
               const std::string& name);
  ~AvlAllocator();
  const char* get_type() const override
  {
    return "avl";
  }
  int64_t allocate(
    uint64_t want,
    uint64_t unit,
    uint64_t max_alloc_size,
    int64_t hint,
    PExtentVector *extents) override;
  void release(const interval_set<uint64_t>& release_set) override;
  int64_t get_capacity() const {
    return num_total;
  }

  uint64_t get_block_size() const {
    return block_size;
  }
  uint64_t get_free() override;
  double get_fragmentation() override;

  void dump() override;
  void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override;
  void init_add_free(uint64_t offset, uint64_t length) override;
  void init_rm_free(uint64_t offset, uint64_t length) override;
  void shutdown() override;

private:
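  // first-fit search helper: scan the given tree starting at *cursor for
  // the first segment that can satisfy an aligned allocation of `size`
  // bytes, updating *cursor so subsequent searches resume past it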
  template<class Tree>
  uint64_t _block_picker(const Tree& t, uint64_t *cursor, uint64_t size,
                         uint64_t align);
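  // attempt to carve a single contiguous extent of `size` bytes (aligned
  // to `unit`) out of the free trees; on success the extent is returned
  // via *offset / *length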
  int _allocate(
    uint64_t size,
    uint64_t unit,
    uint64_t *offset,
    uint64_t *length);

  using range_tree_t =
    boost::intrusive::avl_set<
      range_seg_t,
      boost::intrusive::compare<range_seg_t::before_t>,
      boost::intrusive::member_hook<
        range_seg_t,
        boost::intrusive::avl_set_member_hook<>,
        &range_seg_t::offset_hook>>;
  range_tree_t range_tree;    ///< main range tree
  /*
   * The range_size_tree should always contain the
   * same number of segments as the range_tree.
   * The only difference is that the range_size_tree
   * is ordered by segment sizes.
   */
  using range_size_tree_t =
    boost::intrusive::avl_multiset<
      range_seg_t,
      boost::intrusive::compare<range_seg_t::shorter_t>,
      boost::intrusive::member_hook<
        range_seg_t,
        boost::intrusive::avl_set_member_hook<>,
        &range_seg_t::size_hook>,
      boost::intrusive::constant_time_size<true>>;
  range_size_tree_t range_size_tree;

  const int64_t num_total;      ///< device size
  const uint64_t block_size;    ///< block size
  uint64_t num_free = 0;        ///< total bytes in freelist

  /*
   * This value defines the number of elements in the lbas array.
   * The value of 64 was chosen as it covers all power-of-2 buckets
   * up to UINT64_MAX, i.e. it is the position of the highest bit
   * of UINT64_MAX.
   */
  static constexpr unsigned MAX_LBAS = 64;
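  // allocation cursors, one per power-of-2 size bucket, consulted by the
  // first-fit search so repeated allocations keep moving forward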
  uint64_t lbas[MAX_LBAS] = {0};

  /*
   * Minimum size which forces the dynamic allocator to change
   * its allocation strategy. Once the allocator cannot satisfy
   * an allocation of this size then it switches to using a more
   * aggressive strategy (i.e. search by size rather than offset).
   */
  uint64_t range_size_alloc_threshold = 0;
  /*
   * The minimum free space, in percent, which must be available
   * in the allocator to continue allocations in a first-fit fashion.
   * Once the allocator's free space drops below this level we dynamically
   * switch to using best-fit allocations.
   */
  int range_size_alloc_free_pct = 0;

  /*
   * Maximum number of range entries allowed; 0 means unlimited.
   */
  uint64_t range_count_cap = 0;

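  // drop a segment from the size-ordered tree and the free-byte
  // accounting; the offset-ordered tree is handled by the caller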
  void _range_size_tree_rm(range_seg_t& r) {
    ceph_assert(num_free >= r.length());
    num_free -= r.length();
    range_size_tree.erase(r);
  }
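  // insert a segment that is already linked into range_tree: if
  // _try_insert_range() accepts it, add it to the size tree and account
  // its bytes; otherwise it has already been spilled over, so unlink it
  // from range_tree and free it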
  void _range_size_tree_try_insert(range_seg_t& r) {
    if (_try_insert_range(r.start, r.end)) {
      range_size_tree.insert(r);
      num_free += r.length();
    } else {
      range_tree.erase_and_dispose(r, dispose_rs{});
    }
  }
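  // decide whether a [start, end) range may be kept in the in-memory
  // trees: below the entry cap it always may; at the cap it may only if
  // it is larger than the smallest tracked segment, which is then evicted
  // and spilled over; otherwise the new range itself is spilled over.
  // If insert_pos is provided, the accepted range is also created and
  // linked here. Returns true if the range is kept in the trees.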
  bool _try_insert_range(uint64_t start,
                         uint64_t end,
                         range_tree_t::iterator* insert_pos = nullptr) {
    bool res = !range_count_cap || range_size_tree.size() < range_count_cap;
    bool remove_lowest = false;
    if (!res) {
      if (end - start > _lowest_size_available()) {
        remove_lowest = true;
        res = true;
      }
    }
    if (!res) {
      _spillover_range(start, end);
    } else {
      // NB: do the insertion before the removal below, so we don't
      // dispose of an element the insertion position might refer to.
      if (insert_pos) {
        auto new_rs = new range_seg_t{ start, end };
        range_tree.insert_before(*insert_pos, *new_rs);
        range_size_tree.insert(*new_rs);
        num_free += new_rs->length();
      }
      if (remove_lowest) {
        auto r = range_size_tree.begin();
        _range_size_tree_rm(*r);
        _spillover_range(r->start, r->end);
        range_tree.erase_and_dispose(*r, dispose_rs{});
      }
    }
    return res;
  }
  virtual void _spillover_range(uint64_t start, uint64_t end) {
    // this should be overridden when a range count cap is present,
    // i.e. (range_count_cap > 0)
    ceph_assert(false);
  }
protected:
  // called when an extent is to be released/marked free
  virtual void _add_to_tree(uint64_t start, uint64_t size);

protected:
  CephContext* cct;
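  // guards both trees and the counters; the underscore-prefixed helpers
  // below assume the caller already holds it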
  std::mutex lock;

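  // fragmentation score in [0, 1]: the number of free segments relative
  // to the number of free blocks; 0 when the free space is one contiguous
  // segment, 1 when every free block sits in its own segment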
  double _get_fragmentation() const {
    auto free_blocks = p2align(num_free, block_size) / block_size;
    if (free_blocks <= 1) {
      return .0;
    }
    return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
  }
  void _dump() const;

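  // size of the smallest segment currently tracked, 0 if there is none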
  uint64_t _lowest_size_available() {
    auto rs = range_size_tree.begin();
    return rs != range_size_tree.end() ? rs->length() : 0;
  }

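  // unlocked counterpart of allocate(): may return the requested space as
  // several extents, each no longer than max_alloc_size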
  int64_t _allocate(
    uint64_t want,
    uint64_t unit,
    uint64_t max_alloc_size,
    int64_t hint,
    PExtentVector *extents);

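  // unlocked counterparts of release()/shutdown(), for callers and derived
  // classes that already hold `lock`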
  void _release(const interval_set<uint64_t>& release_set);
  void _release(const PExtentVector& release_set);
  void _shutdown();

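  // helpers that carve [start, start + size) out of the free trees;
  // _try_remove_from_tree() reports via cb which portions of the requested
  // range were actually found free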
  void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs);
  void _remove_from_tree(uint64_t start, uint64_t size);
  void _try_remove_from_tree(uint64_t start, uint64_t size,
    std::function<void(uint64_t offset, uint64_t length, bool found)> cb);

  uint64_t _get_free() const {
    return num_free;
  }
};