// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
// vim: ts=8 sw=2 smarttab

#pragma once

#include <mutex>
#include "include/cpp-btree/btree_map.h"
#include "include/cpp-btree/btree_set.h"
#include "Allocator.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"

class BtreeAllocator : public Allocator {
  struct range_seg_t {
    uint64_t start; ///< starting offset of this segment
    uint64_t end;   ///< ending offset (non-inclusive)

    range_seg_t(uint64_t start, uint64_t end)
      : start{start},
        end{end}
    {}
    inline uint64_t length() const {
      return end - start;
    }
  };
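  // Segments are half-open intervals [start, end): e.g. (illustrative
  // values) a free extent at offset 0x1000 spanning 0x2000 bytes is
  // range_seg_t{0x1000, 0x3000}, whose length() is 0x2000.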

  struct range_value_t {
    uint64_t size;
    uint64_t start;
    range_value_t(uint64_t start, uint64_t end)
      : size{end - start},
        start{start}
    {}
    range_value_t(const range_seg_t& rs)
      : size{rs.length()},
        start{rs.start}
    {}
  };
  // three-way comparator: order segments by size first and break ties
  // by start offset, giving the size-ordered tree a total order
  struct compare_range_value_t {
    int operator()(const range_value_t& lhs,
                   const range_value_t& rhs) const noexcept {
      if (lhs.size < rhs.size) {
        return -1;
      } else if (lhs.size > rhs.size) {
        return 1;
      }
      if (lhs.start < rhs.start) {
        return -1;
      } else if (lhs.start > rhs.start) {
        return 1;
      } else {
        return 0;
      }
    }
  };
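  // Example (illustrative values): {size=0x1000, start=0x8000} and
  // {size=0x1000, start=0x2000} tie on size, so ordering falls back to
  // start and the latter sorts first; the -1/0/1 result is a three-way
  // comparison rather than a boolean less-than.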
protected:
  /*
   * ctor intended for use by descendant class(es) which
   * provide handling for spilled-over entries
   * (when entry count >= max_entries)
   */
  BtreeAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
                 uint64_t max_mem,
                 std::string_view name);

public:
  BtreeAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
                 std::string_view name);
  ~BtreeAllocator();
  const char* get_type() const override
  {
    return "btree";
  }
  int64_t allocate(
    uint64_t want,
    uint64_t unit,
    uint64_t max_alloc_size,
    int64_t hint,
    PExtentVector *extents) override;
  void release(const interval_set<uint64_t>& release_set) override;
  uint64_t get_free() override;
  double get_fragmentation() override;

  void dump() override;
  void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override;
  void init_add_free(uint64_t offset, uint64_t length) override;
  void init_rm_free(uint64_t offset, uint64_t length) override;
  void shutdown() override;
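
  // Illustrative usage sketch (assumed caller flow; cct, device_size and
  // block_size stand for caller-provided values):
  //
  //   BtreeAllocator alloc(cct, device_size, block_size, "block");
  //   alloc.init_add_free(0, device_size);  // seed the freelist
  //   PExtentVector extents;
  //   int64_t got = alloc.allocate(
  //     0x10000,     // want: 64 KiB
  //     block_size,  // unit: minimum allocation granularity
  //     0,           // max_alloc_size: 0 = no per-extent cap
  //     0,           // hint: none
  //     &extents);
  //   interval_set<uint64_t> released;
  //   for (auto& e : extents) {
  //     released.insert(e.offset, e.length);
  //   }
  //   alloc.release(released);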

private:
  // pick a range by searching forward from the given cursor
  uint64_t _pick_block_after(
    uint64_t *cursor,
    uint64_t size,
    uint64_t align);
  // pick a range of exactly the requested size, or failing that a larger one
  uint64_t _pick_block_fits(
    uint64_t size,
    uint64_t align);
  int _allocate(
    uint64_t size,
    uint64_t unit,
    uint64_t *offset,
    uint64_t *length);

  template<class T>
  using pool_allocator = mempool::bluestore_alloc::pool_allocator<T>;
  using range_tree_t =
    btree::btree_map<
      uint64_t /* start */,
      uint64_t /* end */,
      std::less<uint64_t>,
      pool_allocator<std::pair<uint64_t, uint64_t>>>;
  range_tree_t range_tree; ///< main range tree
  /*
   * The range_size_tree should always contain the
   * same number of segments as the range_tree.
   * The only difference is that the range_size_tree
   * is ordered by segment sizes.
   */
  using range_size_tree_t =
    btree::btree_set<
      range_value_t /* size, start */,
      compare_range_value_t,
      pool_allocator<range_value_t>>;
  range_size_tree_t range_size_tree;
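  // Dual-tree invariant by example (illustrative offsets): free segments
  // [0x0000, 0x1000) and [0x4000, 0x6000) are stored in range_tree as
  //   {0x0000 -> 0x1000}, {0x4000 -> 0x6000}   // keyed by start offset
  // and in range_size_tree as
  //   {size=0x1000, start=0x0000}, {size=0x2000, start=0x4000}
  // so both by-offset and by-size lookups stay O(log n).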

  uint64_t num_free = 0; ///< total bytes in freelist

  /*
   * This value defines the number of elements in the lbas array.
   * The value of 64 was chosen as it covers all power of 2 buckets
   * up to UINT64_MAX.
   * This is the equivalent of the highest bit of UINT64_MAX.
   */
  static constexpr unsigned MAX_LBAS = 64;
  uint64_t lbas[MAX_LBAS] = {0};
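  // Assumed semantics (by analogy with the ZFS metaslab allocator, where
  // this array is called ms_lbas): lbas[i] is a first-fit cursor for
  // allocations whose size falls in the 2^i bucket, so e.g. a 0x1000-byte
  // (2^12) request would resume its forward scan from lbas[12] rather
  // than rescanning from offset 0.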

  /*
   * Minimum size which forces the dynamic allocator to change
   * its allocation strategy.  Once the allocator cannot satisfy
   * an allocation of this size, it switches to a more aggressive
   * strategy (i.e. search by size rather than by offset).
   */
  uint64_t range_size_alloc_threshold = 0;
  /*
   * The minimum free space, in percent, which must be available
   * in the allocator to continue allocations in a first-fit fashion.
   * Once the allocator's free space drops below this level we dynamically
   * switch to using best-fit allocations.
   */
  int range_size_alloc_free_pct = 0;
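  // Hedged sketch of how these two knobs are assumed to interact in
  // _allocate() (the actual logic lives in the .cc file;
  // largest_free_segment is a hypothetical name for the biggest entry
  // in range_size_tree):
  //
  //   const int free_pct = num_free * 100 / device_size;
  //   const bool force_best_fit =
  //     largest_free_segment < range_size_alloc_threshold ||
  //     free_pct < range_size_alloc_free_pct;
  //   // best-fit  -> _pick_block_fits() searches range_size_tree by size
  //   // first-fit -> _pick_block_after() scans range_tree from a cursor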

  /*
   * Maximum number of range entries allowed; 0 means unlimited.
   */
  int64_t range_count_cap = 0;

private:
  CephContext* cct;
  std::mutex lock;

  double _get_fragmentation() const {
    auto free_blocks = p2align(num_free, (uint64_t)block_size) / block_size;
    if (free_blocks <= 1) {
      return .0;
    }
    return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
  }
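  // Worked example (illustrative numbers): with block_size = 0x1000 and
  // num_free = 0x10000 (16 free blocks) spread across 4 segments, this
  // yields (4 - 1) / (16 - 1) = 0.2; one contiguous segment gives 0.0 and
  // 16 isolated blocks give 1.0, so the value measures how far the
  // freelist is from fully coalesced.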
  void _dump() const;

  uint64_t _lowest_size_available() const {
    auto rs = range_size_tree.begin();
    return rs != range_size_tree.end() ? rs->size : 0;
  }

  int64_t _allocate(
    uint64_t want,
    uint64_t unit,
    uint64_t max_alloc_size,
    int64_t hint,
    PExtentVector *extents);

  void _release(const interval_set<uint64_t>& release_set);
  void _release(const PExtentVector& release_set);
  void _shutdown();

  // called when an extent is to be released / marked free
  void _add_to_tree(uint64_t start, uint64_t size);
  void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs);
  void _remove_from_tree(uint64_t start, uint64_t size);
  void _try_remove_from_tree(uint64_t start, uint64_t size,
    std::function<void(uint64_t offset, uint64_t length, bool found)> cb);
  uint64_t _get_free() const {
    return num_free;
  }
};