1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
2 // vim: ts=8 sw=2 smarttab
#pragma once

#include <optional>
#include <variant>

#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h"
#include "key_layout.h"
#include "stage_types.h"
12 namespace crimson::os::seastore::onode
{
14 class NodeExtentMutable
;
16 struct internal_sub_item_t
{
17 const snap_gen_t
& get_key() const { return key
; }
18 const laddr_packed_t
* get_p_value() const { return &value
; }
22 } __attribute__((packed
));
/**
 * internal_sub_items_t
 *
 * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged
 * contract as an indexable container to index snap-gen to child node
 * addresses.
 *
 * The layout of the container storing n sub-items:
 *
 * # <--------- container range -----------> #
 * #<~># sub-items [2, n)                    #
 * #   # <- sub-item 1 -> # <- sub-item 0 -> #
 * #...# snap-gen | laddr # snap-gen | laddr #
 */
41 class internal_sub_items_t
{
43 using num_keys_t
= index_t
;
45 internal_sub_items_t(const container_range_t
& _range
)
46 : node_size
{_range
.node_size
} {
47 assert(is_valid_node_size(node_size
));
48 auto& range
= _range
.range
;
49 assert(range
.p_start
< range
.p_end
);
50 assert((range
.p_end
- range
.p_start
) % sizeof(internal_sub_item_t
) == 0);
51 num_items
= (range
.p_end
- range
.p_start
) / sizeof(internal_sub_item_t
);
52 assert(num_items
> 0);
53 auto _p_first_item
= range
.p_end
- sizeof(internal_sub_item_t
);
54 p_first_item
= reinterpret_cast<const internal_sub_item_t
*>(_p_first_item
);
57 // container type system
58 using key_get_type
= const snap_gen_t
&;
59 static constexpr auto CONTAINER_TYPE
= ContainerType::INDEXABLE
;
60 num_keys_t
keys() const { return num_items
; }
61 key_get_type
operator[](index_t index
) const {
62 assert(index
< num_items
);
63 return (p_first_item
- index
)->get_key();
65 node_offset_t
size_before(index_t index
) const {
66 size_t ret
= index
* sizeof(internal_sub_item_t
);
67 assert(ret
< node_size
);
70 const laddr_packed_t
* get_p_value(index_t index
) const {
71 assert(index
< num_items
);
72 return (p_first_item
- index
)->get_p_value();
74 node_offset_t
size_overhead_at(index_t index
) const { return 0u; }
75 void encode(const char* p_node_start
, ceph::bufferlist
& encoded
) const {
76 auto p_end
= reinterpret_cast<const char*>(p_first_item
) +
77 sizeof(internal_sub_item_t
);
78 auto p_start
= p_end
- num_items
* sizeof(internal_sub_item_t
);
79 int start_offset
= p_start
- p_node_start
;
80 int stage_size
= p_end
- p_start
;
81 assert(start_offset
> 0);
82 assert(stage_size
> 0);
83 assert(start_offset
+ stage_size
< (int)node_size
);
84 ceph::encode(static_cast<node_offset_t
>(start_offset
), encoded
);
85 ceph::encode(static_cast<node_offset_t
>(stage_size
), encoded
);
88 static internal_sub_items_t
decode(
89 const char* p_node_start
,
90 extent_len_t node_size
,
91 ceph::bufferlist::const_iterator
& delta
) {
92 node_offset_t start_offset
;
93 ceph::decode(start_offset
, delta
);
94 node_offset_t stage_size
;
95 ceph::decode(stage_size
, delta
);
96 assert(start_offset
> 0);
97 assert(stage_size
> 0);
98 assert((unsigned)start_offset
+ stage_size
< node_size
);
99 return internal_sub_items_t({{p_node_start
+ start_offset
,
100 p_node_start
+ start_offset
+ stage_size
},
104 static node_offset_t
header_size() { return 0u; }
106 template <IsFullKey Key
>
107 static node_offset_t
estimate_insert(
108 const Key
&, const laddr_t
&) {
109 return sizeof(internal_sub_item_t
);
112 template <IsFullKey Key
>
113 static const laddr_packed_t
* insert_at(
114 NodeExtentMutable
&, const internal_sub_items_t
&,
115 const Key
&, const laddr_t
&,
116 index_t index
, node_offset_t size
, const char* p_left_bound
);
118 static node_offset_t
trim_until(NodeExtentMutable
&, internal_sub_items_t
&, index_t
);
120 static node_offset_t
erase_at(
121 NodeExtentMutable
&, const internal_sub_items_t
&, index_t
, const char*);
127 extent_len_t node_size
;
129 const internal_sub_item_t
* p_first_item
;
133 class internal_sub_items_t::Appender
{
135 Appender(NodeExtentMutable
* p_mut
, char* p_append
)
136 : p_mut
{p_mut
}, p_append
{p_append
} {}
137 Appender(NodeExtentMutable
* p_mut
, const internal_sub_items_t
& sub_items
)
139 p_append
{(char*)(sub_items
.p_first_item
+ 1 - sub_items
.keys())} {
140 assert(sub_items
.keys());
142 void append(const internal_sub_items_t
& src
, index_t from
, index_t items
);
143 void append(const full_key_t
<KT
>&, const laddr_t
&, const laddr_packed_t
*&);
144 char* wrap() { return p_append
; }
146 NodeExtentMutable
* p_mut
;
/**
 * leaf_sub_items_t
 *
 * The STAGE_RIGHT implementation for leaf node N0/N1/N2, implements staged
 * contract as an indexable container to index snap-gen to value_header_t.
 *
 * The layout of the container storing n sub-items:
 *
 * # <------------------------ container range -------------------------------> #
 * # <---------- sub-items ----------------> # <--- offsets ---------#          #
 * #<~># sub-items [2, n) #<~>| offsets [2, n)                       #          #
 * #   # <- sub-item 1 -> # <- sub-item 0 -> #  |                    #          #
 * #...# snap-gen | value # snap-gen | value #...| offset1 | offset0 # num_keys #
 *     ^                                        ^
 *     |                                        |
 *     p_items_end                              p_offsets
 */
168 class leaf_sub_items_t
{
170 // should be enough to index all keys under 64 KiB node
171 using num_keys_t
= uint16_t;
173 // TODO: remove if num_keys_t is aligned
174 struct num_keys_packed_t
{
176 } __attribute__((packed
));
178 leaf_sub_items_t(const container_range_t
& _range
)
179 : node_size
{_range
.node_size
} {
180 assert(is_valid_node_size(node_size
));
181 auto& range
= _range
.range
;
182 assert(range
.p_start
< range
.p_end
);
183 auto _p_num_keys
= range
.p_end
- sizeof(num_keys_t
);
184 assert(range
.p_start
< _p_num_keys
);
185 p_num_keys
= reinterpret_cast<const num_keys_packed_t
*>(_p_num_keys
);
187 auto _p_offsets
= _p_num_keys
- sizeof(node_offset_t
);
188 assert(range
.p_start
< _p_offsets
);
189 p_offsets
= reinterpret_cast<const node_offset_packed_t
*>(_p_offsets
);
190 p_items_end
= reinterpret_cast<const char*>(&get_offset(keys() - 1));
191 assert(range
.p_start
< p_items_end
);
192 assert(range
.p_start
== p_start());
195 bool operator==(const leaf_sub_items_t
& x
) {
196 return (p_num_keys
== x
.p_num_keys
&&
197 p_offsets
== x
.p_offsets
&&
198 p_items_end
== x
.p_items_end
);
201 const char* p_start() const { return get_item_end(keys()); }
203 const node_offset_packed_t
& get_offset(index_t index
) const {
204 assert(index
< keys());
205 return *(p_offsets
- index
);
208 const node_offset_t
get_offset_to_end(index_t index
) const {
209 assert(index
<= keys());
210 return index
== 0 ? 0 : get_offset(index
- 1).value
;
213 const char* get_item_start(index_t index
) const {
214 return p_items_end
- get_offset(index
).value
;
217 const char* get_item_end(index_t index
) const {
218 return p_items_end
- get_offset_to_end(index
);
221 // container type system
222 using key_get_type
= const snap_gen_t
&;
223 static constexpr auto CONTAINER_TYPE
= ContainerType::INDEXABLE
;
224 num_keys_t
keys() const { return p_num_keys
->value
; }
225 key_get_type
operator[](index_t index
) const {
226 assert(index
< keys());
227 auto pointer
= get_item_end(index
);
228 assert(get_item_start(index
) < pointer
);
229 pointer
-= sizeof(snap_gen_t
);
230 assert(get_item_start(index
) < pointer
);
231 return *reinterpret_cast<const snap_gen_t
*>(pointer
);
233 node_offset_t
size_before(index_t index
) const {
234 assert(index
<= keys());
237 ret
= sizeof(num_keys_t
);
240 ret
= sizeof(num_keys_t
) +
241 (index
+ 1) * sizeof(node_offset_t
) +
242 get_offset(index
).value
;
244 assert(ret
< node_size
);
247 node_offset_t
size_overhead_at(index_t index
) const { return sizeof(node_offset_t
); }
248 const value_header_t
* get_p_value(index_t index
) const {
249 assert(index
< keys());
250 auto pointer
= get_item_start(index
);
251 auto value
= reinterpret_cast<const value_header_t
*>(pointer
);
252 assert(pointer
+ value
->allocation_size() + sizeof(snap_gen_t
) ==
253 get_item_end(index
));
256 void encode(const char* p_node_start
, ceph::bufferlist
& encoded
) const {
257 auto p_end
= reinterpret_cast<const char*>(p_num_keys
) +
259 int start_offset
= p_start() - p_node_start
;
260 int stage_size
= p_end
- p_start();
261 assert(start_offset
> 0);
262 assert(stage_size
> 0);
263 assert(start_offset
+ stage_size
< (int)node_size
);
264 ceph::encode(static_cast<node_offset_t
>(start_offset
), encoded
);
265 ceph::encode(static_cast<node_offset_t
>(stage_size
), encoded
);
268 static leaf_sub_items_t
decode(
269 const char* p_node_start
,
270 extent_len_t node_size
,
271 ceph::bufferlist::const_iterator
& delta
) {
272 node_offset_t start_offset
;
273 ceph::decode(start_offset
, delta
);
274 node_offset_t stage_size
;
275 ceph::decode(stage_size
, delta
);
276 assert(start_offset
> 0);
277 assert(stage_size
> 0);
278 assert((unsigned)start_offset
+ stage_size
< node_size
);
279 return leaf_sub_items_t({{p_node_start
+ start_offset
,
280 p_node_start
+ start_offset
+ stage_size
},
284 static node_offset_t
header_size() { return sizeof(num_keys_t
); }
286 template <IsFullKey Key
>
287 static node_offset_t
estimate_insert(
288 const Key
&, const value_config_t
& value
) {
289 return value
.allocation_size() + sizeof(snap_gen_t
) + sizeof(node_offset_t
);
292 template <IsFullKey Key
>
293 static const value_header_t
* insert_at(
294 NodeExtentMutable
&, const leaf_sub_items_t
&,
295 const Key
&, const value_config_t
&,
296 index_t index
, node_offset_t size
, const char* p_left_bound
);
298 static node_offset_t
trim_until(NodeExtentMutable
&, leaf_sub_items_t
&, index_t index
);
300 static node_offset_t
erase_at(
301 NodeExtentMutable
&, const leaf_sub_items_t
&, index_t
, const char*);
307 extent_len_t node_size
;
308 const num_keys_packed_t
* p_num_keys
;
309 const node_offset_packed_t
* p_offsets
;
310 const char* p_items_end
;
313 constexpr index_t APPENDER_LIMIT
= 3u;
316 class leaf_sub_items_t::Appender
{
317 struct range_items_t
{
322 const full_key_t
<KT
>* p_key
;
323 value_config_t value_config
;
325 using var_t
= std::variant
<range_items_t
, kv_item_t
>;
328 Appender(NodeExtentMutable
* p_mut
, char* p_append
)
329 : p_mut
{p_mut
}, p_append
{p_append
} {
331 Appender(NodeExtentMutable
* p_mut
, const leaf_sub_items_t
& sub_items
)
332 : p_mut
{p_mut
} , op_dst(sub_items
) {
333 assert(sub_items
.keys());
336 void append(const leaf_sub_items_t
& src
, index_t from
, index_t items
);
337 void append(const full_key_t
<KT
>& key
,
338 const value_config_t
& value
, const value_header_t
*& p_value
) {
341 assert(pp_value
== nullptr);
342 assert(cnt
<= APPENDER_LIMIT
);
343 appends
[cnt
] = kv_item_t
{&key
, value
};
350 NodeExtentMutable
* p_mut
;
352 std::optional
<leaf_sub_items_t
> op_src
;
353 const value_header_t
** pp_value
= nullptr;
354 char* p_append
= nullptr;
355 var_t appends
[APPENDER_LIMIT
];
357 // append from existing
358 std::optional
<leaf_sub_items_t
> op_dst
;
359 char* p_appended
= nullptr;
362 template <node_type_t
> struct _sub_items_t
;
363 template<> struct _sub_items_t
<node_type_t::INTERNAL
> { using type
= internal_sub_items_t
; };
364 template<> struct _sub_items_t
<node_type_t::LEAF
> { using type
= leaf_sub_items_t
; };
365 template <node_type_t NODE_TYPE
>
366 using sub_items_t
= typename _sub_items_t
<NODE_TYPE
>::type
;