1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
2 // vim: ts=8 sw=2 smarttab
9 #include <boost/smart_ptr/intrusive_ref_counter.hpp>
11 #include "crimson/common/type_helpers.h"
13 #include "node_extent_mutable.h"
14 #include "stages/key_layout.h"
15 #include "stages/stage_types.h"
17 #include "tree_types.h"
20 * Tree example (2 levels):
22 * Root node keys: [ 3 7 ]
28 * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12]
29 * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9]
31 * Tree structure properties:
32 * - As illustrated above, the parent key is strictly equal to its left child's
34 * - If a tree is indexing multiple seastore transactions, each transaction
35 * will be mapped to a Super which points to a distinct root node. So the
36 * transactions are isolated at tree level. However, tree nodes from
37 * different transactions can reference the same seastore CachedExtent before
39 * - The resources of the transactional tree are tracked by tree_cursor_ts held
40 * by users. As long as any cursor is alive, the according tree hierarchy is
41 * alive and keeps tracked. See the reversed resource management sections
45 namespace crimson::os::seastore::onode
{
53 * A cursor points to a position (LeafNode and search_position_t) of the tree
54 * where it can find the according key and value pair. The position is updated
55 * by LeafNode insert/split/delete/merge internally and is kept valid. It also
56 * caches the key-value information for a specific node layout version.
58 * Exposes public interfaces for Btree::Cursor.
60 using layout_version_t
= uint32_t;
61 class tree_cursor_t final
62 : public boost::intrusive_ref_counter
<
63 tree_cursor_t
, boost::thread_unsafe_counter
> {
67 tree_cursor_t(const tree_cursor_t
&) = delete;
68 tree_cursor_t(tree_cursor_t
&&) = delete;
69 tree_cursor_t
& operator=(const tree_cursor_t
&) = delete;
70 tree_cursor_t
& operator=(tree_cursor_t
&&) = delete;
75 * Represents one-past-the-last of all the sorted key-value
76 * pairs in the tree. An end cursor won't contain valid key-value
79 bool is_end() const { return position
.is_end(); }
81 /// Returns the key view in tree if it is not an end cursor.
82 const key_view_t
& get_key_view() const;
84 /// Returns the value pointer in tree if it is not an end cursor.
85 const onode_t
* get_p_value() const;
88 tree_cursor_t(Ref
<LeafNode
>, const search_position_t
&);
89 tree_cursor_t(Ref
<LeafNode
>, const search_position_t
&,
90 const key_view_t
& key
, const onode_t
*, layout_version_t
);
91 // lookup reaches the end, contain leaf node for further insert
92 tree_cursor_t(Ref
<LeafNode
>);
93 const search_position_t
& get_position() const { return position
; }
94 Ref
<LeafNode
> get_leaf_node() { return leaf_node
; }
95 template <bool VALIDATE
>
96 void update_track(Ref
<LeafNode
>, const search_position_t
&);
97 void update_kv(const key_view_t
&, const onode_t
*, layout_version_t
) const;
98 void ensure_kv() const;
102 * Reversed resource management (tree_cursor_t)
104 * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be
105 * alive as long as any of it's cursors is still referenced by user.
107 Ref
<LeafNode
> leaf_node
;
108 search_position_t position
;
110 // cached information
111 mutable std::optional
<key_view_t
> key_view
;
112 mutable const onode_t
* p_value
;
113 mutable layout_version_t node_version
;
115 friend class LeafNode
;
116 friend class Node
; // get_position(), get_leaf_node()
122 * An abstracted class for both InternalNode and LeafNode.
124 * Exposes public interfaces for Btree.
127 : public boost::intrusive_ref_counter
<
128 Node
, boost::thread_unsafe_counter
> {
131 using node_ertr
= crimson::errorator
<
132 crimson::ct_error::input_output_error
,
133 crimson::ct_error::invarg
,
134 crimson::ct_error::enoent
,
135 crimson::ct_error::erange
>;
136 template <class ValueT
=void>
137 using node_future
= node_ertr::future
<ValueT
>;
139 struct search_result_t
{
140 bool is_end() const { return p_cursor
->is_end(); }
141 Ref
<tree_cursor_t
> p_cursor
;
144 MatchKindBS
match() const {
145 assert(mstat
>= MSTAT_MIN
&& mstat
<= MSTAT_MAX
);
146 return (mstat
== MSTAT_EQ
? MatchKindBS::EQ
: MatchKindBS::NE
);
151 Node(const Node
&) = delete;
152 Node(Node
&&) = delete;
153 Node
& operator=(const Node
&) = delete;
154 Node
& operator=(Node
&&) = delete;
159 * A positive value denotes the level (or height) of this node in tree.
160 * 0 means LeafNode, positive means InternalNode.
162 level_t
level() const;
167 * Returns a cursor pointing to the smallest key in the sub-tree formed by
170 * Returns an end cursor if it is an empty root node.
172 virtual node_future
<Ref
<tree_cursor_t
>> lookup_smallest(context_t
) = 0;
177 * Returns a cursor pointing to the largest key in the sub-tree formed by
180 * Returns an end cursor if it is an empty root node.
182 virtual node_future
<Ref
<tree_cursor_t
>> lookup_largest(context_t
) = 0;
187 * Returns a cursor pointing to the first element in the range [first, last)
188 * of the sub-tree which does not compare less than the input key. The
189 * result also denotes whether the pointed key is equal to the input key.
191 * Returns an end cursor with MatchKindBS::NE if:
192 * - It is an empty root node;
193 * - Or the input key is larger than all the keys in the sub-tree;
195 node_future
<search_result_t
> lower_bound(context_t c
, const key_hobj_t
& key
);
200 * Try to insert a key-value pair into the sub-tree formed by this node.
202 * Returns a boolean denoting whether the insertion is successful:
203 * - If true, the returned cursor points to the inserted element in tree;
204 * - If false, the returned cursor points to the conflicting element in tree;
206 node_future
<std::pair
<Ref
<tree_cursor_t
>, bool>> insert(
207 context_t
, const key_hobj_t
&, const onode_t
&);
209 /// Recursively collects the statistics of the sub-tree formed by this node
210 node_future
<tree_stats_t
> get_tree_stats(context_t
);
212 /// Returns an ostream containing a dump of all the elements in the node.
213 std::ostream
& dump(std::ostream
&) const;
215 /// Returns an ostream containing an one-line summary of this node.
216 std::ostream
& dump_brief(std::ostream
&) const;
218 /// Initializes the tree by allocating an empty root node.
219 static node_future
<> mkfs(context_t
, RootNodeTracker
&);
221 /// Loads the tree root. The tree must be initialized.
222 static node_future
<Ref
<Node
>> load_root(context_t
, RootNodeTracker
&);
224 // Only for unit test purposes.
225 void test_make_destructable(context_t
, NodeExtentMutable
&, Super::URef
&&);
226 virtual node_future
<> test_clone_root(context_t
, RootNodeTracker
&) const = 0;
229 virtual node_future
<> test_clone_non_root(context_t
, Ref
<InternalNode
>) const {
230 ceph_abort("impossible path");
232 virtual node_future
<search_result_t
> lower_bound_tracked(
233 context_t
, const key_hobj_t
&, MatchHistory
&) = 0;
234 virtual node_future
<> do_get_tree_stats(context_t
, tree_stats_t
&) = 0;
237 Node(NodeImplURef
&&);
238 bool is_root() const {
239 assert((super
&& !_parent_info
.has_value()) ||
240 (!super
&& _parent_info
.has_value()));
241 return !_parent_info
.has_value();
245 void make_root(context_t c
, Super::URef
&& _super
);
246 void make_root_new(context_t c
, Super::URef
&& _super
) {
247 assert(_super
->get_root_laddr() == L_ADDR_NULL
);
248 make_root(c
, std::move(_super
));
250 void make_root_from(context_t c
, Super::URef
&& _super
, laddr_t from_addr
) {
251 assert(_super
->get_root_laddr() == from_addr
);
252 make_root(c
, std::move(_super
));
254 void as_root(Super::URef
&& _super
);
255 node_future
<> upgrade_root(context_t
);
258 template <bool VALIDATE
= true>
259 void as_child(const search_position_t
&, Ref
<InternalNode
>);
260 struct parent_info_t
{
261 search_position_t position
;
262 Ref
<InternalNode
> ptr
;
264 const parent_info_t
& parent_info() const { return *_parent_info
; }
265 node_future
<> insert_parent(context_t
, Ref
<Node
> right_node
);
269 * Reversed resource management (Node)
271 * Root Node holds a reference to its parent Super class, so its parent
272 * will be alive as long as this root node is alive.
274 * None-root Node holds a reference to its parent Node, so its parent will
275 * be alive as long as any of it's children is alive.
280 std::optional
<parent_info_t
> _parent_info
;
283 static node_future
<Ref
<Node
>> load(context_t
, laddr_t
, bool expect_is_level_tail
);
286 friend class InternalNode
;
288 inline std::ostream
& operator<<(std::ostream
& os
, const Node
& node
) {
289 return node
.dump_brief(os
);
295 * A concrete implementation of Node class that represents an internal tree
296 * node. Its level is always positive and its values are logical block
297 * addresses to its child nodes. An internal node cannot be empty.
299 class InternalNode final
: public Node
{
302 InternalNode(InternalNodeImpl
*, NodeImplURef
&&);
303 ~InternalNode() override
{ assert(tracked_child_nodes
.empty()); }
304 InternalNode(const InternalNode
&) = delete;
305 InternalNode(InternalNode
&&) = delete;
306 InternalNode
& operator=(const InternalNode
&) = delete;
307 InternalNode
& operator=(InternalNode
&&) = delete;
309 node_future
<> apply_child_split(
310 context_t
, const search_position_t
&, Ref
<Node
> left
, Ref
<Node
> right
);
311 template <bool VALIDATE
>
312 void do_track_child(Node
& child
) {
313 if constexpr (VALIDATE
) {
314 validate_child(child
);
316 auto& child_pos
= child
.parent_info().position
;
317 assert(tracked_child_nodes
.find(child_pos
) == tracked_child_nodes
.end());
318 tracked_child_nodes
[child_pos
] = &child
;
320 void do_untrack_child(const Node
& child
) {
321 auto& child_pos
= child
.parent_info().position
;
322 assert(tracked_child_nodes
.find(child_pos
)->second
== &child
);
323 [[maybe_unused
]] auto removed
= tracked_child_nodes
.erase(child_pos
);
327 static node_future
<Ref
<InternalNode
>> allocate_root(
328 context_t
, level_t
, laddr_t
, Super::URef
&&);
331 node_future
<Ref
<tree_cursor_t
>> lookup_smallest(context_t
) override
;
332 node_future
<Ref
<tree_cursor_t
>> lookup_largest(context_t
) override
;
333 node_future
<search_result_t
> lower_bound_tracked(
334 context_t
, const key_hobj_t
&, MatchHistory
&) override
;
335 node_future
<> do_get_tree_stats(context_t
, tree_stats_t
&) override
;
337 node_future
<> test_clone_root(context_t
, RootNodeTracker
&) const override
;
340 // XXX: extract a common tracker for InternalNode to track Node,
341 // and LeafNode to track tree_cursor_t.
342 node_future
<Ref
<Node
>> get_or_track_child(context_t
, const search_position_t
&, laddr_t
);
344 const search_position_t
&, match_stage_t
, Ref
<Node
>, Ref
<Node
> nxt_child
= nullptr);
345 void replace_track(const search_position_t
&, Ref
<Node
> new_child
, Ref
<Node
> old_child
);
346 void track_split(const search_position_t
&, Ref
<InternalNode
>);
347 void validate_tracked_children() const {
349 for (auto& kv
: tracked_child_nodes
) {
350 assert(kv
.first
== kv
.second
->parent_info().position
);
351 validate_child(*kv
.second
);
355 void validate_child(const Node
& child
) const;
357 struct fresh_node_t
{
358 Ref
<InternalNode
> node
;
359 NodeExtentMutable mut
;
360 std::pair
<Ref
<Node
>, NodeExtentMutable
> make_pair() {
361 return std::make_pair(Ref
<Node
>(node
), mut
);
364 static node_future
<fresh_node_t
> allocate(context_t
, field_type_t
, bool, level_t
);
368 * Reversed resource management (InternalNode)
370 * InteralNode keeps track of its child nodes which are still alive in
371 * memory, and their positions will be updated throughout
372 * insert/split/delete/merge operations of this node.
374 // XXX: leverage intrusive data structure to control memory overhead
375 std::map
<search_position_t
, Node
*> tracked_child_nodes
;
376 InternalNodeImpl
* impl
;
382 * A concrete implementation of Node class that represents a leaf tree node.
383 * Its level is always 0. A leaf node can only be empty if it is root.
385 class LeafNode final
: public Node
{
387 // public to tree_cursor_t
388 ~LeafNode() override
{ assert(tracked_cursors
.empty()); }
389 LeafNode(const LeafNode
&) = delete;
390 LeafNode(LeafNode
&&) = delete;
391 LeafNode
& operator=(const LeafNode
&) = delete;
392 LeafNode
& operator=(LeafNode
&&) = delete;
394 bool is_level_tail() const;
395 layout_version_t
get_layout_version() const { return layout_version
; }
396 std::tuple
<key_view_t
, const onode_t
*, layout_version_t
> get_kv(
397 const search_position_t
&) const;
398 template <bool VALIDATE
>
399 void do_track_cursor(tree_cursor_t
& cursor
) {
400 if constexpr (VALIDATE
) {
401 validate_cursor(cursor
);
403 auto& cursor_pos
= cursor
.get_position();
404 assert(tracked_cursors
.find(cursor_pos
) == tracked_cursors
.end());
405 tracked_cursors
[cursor_pos
] = &cursor
;
407 void do_untrack_cursor(tree_cursor_t
& cursor
) {
408 validate_cursor(cursor
);
409 auto& cursor_pos
= cursor
.get_position();
410 assert(tracked_cursors
.find(cursor_pos
)->second
== &cursor
);
411 [[maybe_unused
]] auto removed
= tracked_cursors
.erase(cursor_pos
);
416 node_future
<Ref
<tree_cursor_t
>> lookup_smallest(context_t
) override
;
417 node_future
<Ref
<tree_cursor_t
>> lookup_largest(context_t
) override
;
418 node_future
<search_result_t
> lower_bound_tracked(
419 context_t
, const key_hobj_t
&, MatchHistory
&) override
;
420 node_future
<> do_get_tree_stats(context_t
, tree_stats_t
&) override
;
422 node_future
<> test_clone_root(context_t
, RootNodeTracker
&) const override
;
425 LeafNode(LeafNodeImpl
*, NodeImplURef
&&);
426 node_future
<Ref
<tree_cursor_t
>> insert_value(
427 context_t
, const key_hobj_t
&, const onode_t
&,
428 const search_position_t
&, const MatchHistory
&,
430 static node_future
<Ref
<LeafNode
>> allocate_root(context_t
, RootNodeTracker
&);
434 // XXX: extract a common tracker for InternalNode to track Node,
435 // and LeafNode to track tree_cursor_t.
436 Ref
<tree_cursor_t
> get_or_track_cursor(
437 const search_position_t
&, const key_view_t
&, const onode_t
*);
438 Ref
<tree_cursor_t
> track_insert(
439 const search_position_t
&, match_stage_t
, const onode_t
*);
440 void track_split(const search_position_t
&, Ref
<LeafNode
>);
441 void validate_tracked_cursors() const {
443 for (auto& kv
: tracked_cursors
) {
444 assert(kv
.first
== kv
.second
->get_position());
445 validate_cursor(*kv
.second
);
449 void validate_cursor(tree_cursor_t
& cursor
) const;
450 // invalidate p_value pointers in tree_cursor_t
451 void on_layout_change() { ++layout_version
; }
453 struct fresh_node_t
{
455 NodeExtentMutable mut
;
456 std::pair
<Ref
<Node
>, NodeExtentMutable
> make_pair() {
457 return std::make_pair(Ref
<Node
>(node
), mut
);
460 static node_future
<fresh_node_t
> allocate(context_t
, field_type_t
, bool);
464 * Reversed resource management (LeafNode)
466 * LeafNode keeps track of the referencing cursors which are still alive in
467 * memory, and their positions will be updated throughout
468 * insert/split/delete/merge operations of this node.
470 // XXX: leverage intrusive data structure to control memory overhead
471 std::map
<search_position_t
, tree_cursor_t
*> tracked_cursors
;
473 layout_version_t layout_version
= 0;