]> git.proxmox.com Git - ceph.git/blob - ceph/src/crimson/os/seastore/onode_manager/staged-fltree/node.h
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / crimson / os / seastore / onode_manager / staged-fltree / node.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #pragma once
5
6 #include <map>
7 #include <memory>
8 #include <ostream>
9 #include <boost/smart_ptr/intrusive_ref_counter.hpp>
10
11 #include "crimson/common/type_helpers.h"
12
13 #include "node_extent_mutable.h"
14 #include "stages/key_layout.h"
15 #include "stages/stage_types.h"
16 #include "super.h"
17 #include "tree_types.h"
18
19 /**
20 * Tree example (2 levels):
21 *
22 * Root node keys: [ 3 7 ]
23 * values: [p1 p2 p3]
24 * / | \
25 * ------- | -------
26 * | | |
27 * V V V
28 * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12]
29 * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9]
30 *
31 * Tree structure properties:
32 * - As illustrated above, the parent key is strictly equal to its left child's
33 * largest key;
34 * - If a tree is indexing multiple seastore transactions, each transaction
35 * will be mapped to a Super which points to a distinct root node. So the
36 * transactions are isolated at tree level. However, tree nodes from
37 * different transactions can reference the same seastore CachedExtent before
38 * modification;
39 * - The resources of the transactional tree are tracked by tree_cursor_ts held
40 * by users. As long as any cursor is alive, the according tree hierarchy is
41 * alive and keeps tracked. See the reversed resource management sections
42 * below;
43 */
44
45 namespace crimson::os::seastore::onode {
46
47 class LeafNode;
48 class InternalNode;
49
50 /**
51 * tree_cursor_t
52 *
53 * A cursor points to a position (LeafNode and search_position_t) of the tree
54 * where it can find the according key and value pair. The position is updated
55 * by LeafNode insert/split/delete/merge internally and is kept valid. It also
56 * caches the key-value information for a specific node layout version.
57 *
58 * Exposes public interfaces for Btree::Cursor.
59 */
60 using layout_version_t = uint32_t;
61 class tree_cursor_t final
62 : public boost::intrusive_ref_counter<
63 tree_cursor_t, boost::thread_unsafe_counter> {
64 public:
65 // public to Btree
66 ~tree_cursor_t();
67 tree_cursor_t(const tree_cursor_t&) = delete;
68 tree_cursor_t(tree_cursor_t&&) = delete;
69 tree_cursor_t& operator=(const tree_cursor_t&) = delete;
70 tree_cursor_t& operator=(tree_cursor_t&&) = delete;
71
72 /**
73 * is_end
74 *
75 * Represents one-past-the-last of all the sorted key-value
76 * pairs in the tree. An end cursor won't contain valid key-value
77 * information.
78 */
79 bool is_end() const { return position.is_end(); }
80
81 /// Returns the key view in tree if it is not an end cursor.
82 const key_view_t& get_key_view() const;
83
84 /// Returns the value pointer in tree if it is not an end cursor.
85 const onode_t* get_p_value() const;
86
87 private:
88 tree_cursor_t(Ref<LeafNode>, const search_position_t&);
89 tree_cursor_t(Ref<LeafNode>, const search_position_t&,
90 const key_view_t& key, const onode_t*, layout_version_t);
91 // lookup reaches the end, contain leaf node for further insert
92 tree_cursor_t(Ref<LeafNode>);
93 const search_position_t& get_position() const { return position; }
94 Ref<LeafNode> get_leaf_node() { return leaf_node; }
95 template <bool VALIDATE>
96 void update_track(Ref<LeafNode>, const search_position_t&);
97 void update_kv(const key_view_t&, const onode_t*, layout_version_t) const;
98 void ensure_kv() const;
99
100 private:
101 /**
102 * Reversed resource management (tree_cursor_t)
103 *
104 * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be
105 * alive as long as any of it's cursors is still referenced by user.
106 */
107 Ref<LeafNode> leaf_node;
108 search_position_t position;
109
110 // cached information
111 mutable std::optional<key_view_t> key_view;
112 mutable const onode_t* p_value;
113 mutable layout_version_t node_version;
114
115 friend class LeafNode;
116 friend class Node; // get_position(), get_leaf_node()
117 };
118
119 /**
120 * Node
121 *
122 * An abstracted class for both InternalNode and LeafNode.
123 *
124 * Exposes public interfaces for Btree.
125 */
126 class Node
127 : public boost::intrusive_ref_counter<
128 Node, boost::thread_unsafe_counter> {
129 public:
130 // public to Btree
131 using node_ertr = crimson::errorator<
132 crimson::ct_error::input_output_error,
133 crimson::ct_error::invarg,
134 crimson::ct_error::enoent,
135 crimson::ct_error::erange>;
136 template <class ValueT=void>
137 using node_future = node_ertr::future<ValueT>;
138
139 struct search_result_t {
140 bool is_end() const { return p_cursor->is_end(); }
141 Ref<tree_cursor_t> p_cursor;
142 match_stat_t mstat;
143
144 MatchKindBS match() const {
145 assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX);
146 return (mstat == MSTAT_EQ ? MatchKindBS::EQ : MatchKindBS::NE);
147 }
148 };
149
150 virtual ~Node();
151 Node(const Node&) = delete;
152 Node(Node&&) = delete;
153 Node& operator=(const Node&) = delete;
154 Node& operator=(Node&&) = delete;
155
156 /**
157 * level
158 *
159 * A positive value denotes the level (or height) of this node in tree.
160 * 0 means LeafNode, positive means InternalNode.
161 */
162 level_t level() const;
163
164 /**
165 * lookup_smallest
166 *
167 * Returns a cursor pointing to the smallest key in the sub-tree formed by
168 * this node.
169 *
170 * Returns an end cursor if it is an empty root node.
171 */
172 virtual node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) = 0;
173
174 /**
175 * lookup_largest
176 *
177 * Returns a cursor pointing to the largest key in the sub-tree formed by
178 * this node.
179 *
180 * Returns an end cursor if it is an empty root node.
181 */
182 virtual node_future<Ref<tree_cursor_t>> lookup_largest(context_t) = 0;
183
184 /**
185 * lower_bound
186 *
187 * Returns a cursor pointing to the first element in the range [first, last)
188 * of the sub-tree which does not compare less than the input key. The
189 * result also denotes whether the pointed key is equal to the input key.
190 *
191 * Returns an end cursor with MatchKindBS::NE if:
192 * - It is an empty root node;
193 * - Or the input key is larger than all the keys in the sub-tree;
194 */
195 node_future<search_result_t> lower_bound(context_t c, const key_hobj_t& key);
196
197 /**
198 * insert
199 *
200 * Try to insert a key-value pair into the sub-tree formed by this node.
201 *
202 * Returns a boolean denoting whether the insertion is successful:
203 * - If true, the returned cursor points to the inserted element in tree;
204 * - If false, the returned cursor points to the conflicting element in tree;
205 */
206 node_future<std::pair<Ref<tree_cursor_t>, bool>> insert(
207 context_t, const key_hobj_t&, const onode_t&);
208
209 /// Recursively collects the statistics of the sub-tree formed by this node
210 node_future<tree_stats_t> get_tree_stats(context_t);
211
212 /// Returns an ostream containing a dump of all the elements in the node.
213 std::ostream& dump(std::ostream&) const;
214
215 /// Returns an ostream containing an one-line summary of this node.
216 std::ostream& dump_brief(std::ostream&) const;
217
218 /// Initializes the tree by allocating an empty root node.
219 static node_future<> mkfs(context_t, RootNodeTracker&);
220
221 /// Loads the tree root. The tree must be initialized.
222 static node_future<Ref<Node>> load_root(context_t, RootNodeTracker&);
223
224 // Only for unit test purposes.
225 void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&);
226 virtual node_future<> test_clone_root(context_t, RootNodeTracker&) const = 0;
227
228 protected:
229 virtual node_future<> test_clone_non_root(context_t, Ref<InternalNode>) const {
230 ceph_abort("impossible path");
231 }
232 virtual node_future<search_result_t> lower_bound_tracked(
233 context_t, const key_hobj_t&, MatchHistory&) = 0;
234 virtual node_future<> do_get_tree_stats(context_t, tree_stats_t&) = 0;
235
236 protected:
237 Node(NodeImplURef&&);
238 bool is_root() const {
239 assert((super && !_parent_info.has_value()) ||
240 (!super && _parent_info.has_value()));
241 return !_parent_info.has_value();
242 }
243
244 // as root
245 void make_root(context_t c, Super::URef&& _super);
246 void make_root_new(context_t c, Super::URef&& _super) {
247 assert(_super->get_root_laddr() == L_ADDR_NULL);
248 make_root(c, std::move(_super));
249 }
250 void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) {
251 assert(_super->get_root_laddr() == from_addr);
252 make_root(c, std::move(_super));
253 }
254 void as_root(Super::URef&& _super);
255 node_future<> upgrade_root(context_t);
256
257 // as child/non-root
258 template <bool VALIDATE = true>
259 void as_child(const search_position_t&, Ref<InternalNode>);
260 struct parent_info_t {
261 search_position_t position;
262 Ref<InternalNode> ptr;
263 };
264 const parent_info_t& parent_info() const { return *_parent_info; }
265 node_future<> insert_parent(context_t, Ref<Node> right_node);
266
267 private:
268 /**
269 * Reversed resource management (Node)
270 *
271 * Root Node holds a reference to its parent Super class, so its parent
272 * will be alive as long as this root node is alive.
273 *
274 * None-root Node holds a reference to its parent Node, so its parent will
275 * be alive as long as any of it's children is alive.
276 */
277 // as root
278 Super::URef super;
279 // as child/non-root
280 std::optional<parent_info_t> _parent_info;
281
282 private:
283 static node_future<Ref<Node>> load(context_t, laddr_t, bool expect_is_level_tail);
284
285 NodeImplURef impl;
286 friend class InternalNode;
287 };
288 inline std::ostream& operator<<(std::ostream& os, const Node& node) {
289 return node.dump_brief(os);
290 }
291
292 /**
293 * InternalNode
294 *
295 * A concrete implementation of Node class that represents an internal tree
296 * node. Its level is always positive and its values are logical block
297 * addresses to its child nodes. An internal node cannot be empty.
298 */
299 class InternalNode final : public Node {
300 public:
301 // public to Node
302 InternalNode(InternalNodeImpl*, NodeImplURef&&);
303 ~InternalNode() override { assert(tracked_child_nodes.empty()); }
304 InternalNode(const InternalNode&) = delete;
305 InternalNode(InternalNode&&) = delete;
306 InternalNode& operator=(const InternalNode&) = delete;
307 InternalNode& operator=(InternalNode&&) = delete;
308
309 node_future<> apply_child_split(
310 context_t, const search_position_t&, Ref<Node> left, Ref<Node> right);
311 template <bool VALIDATE>
312 void do_track_child(Node& child) {
313 if constexpr (VALIDATE) {
314 validate_child(child);
315 }
316 auto& child_pos = child.parent_info().position;
317 assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end());
318 tracked_child_nodes[child_pos] = &child;
319 }
320 void do_untrack_child(const Node& child) {
321 auto& child_pos = child.parent_info().position;
322 assert(tracked_child_nodes.find(child_pos)->second == &child);
323 [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos);
324 assert(removed);
325 }
326
327 static node_future<Ref<InternalNode>> allocate_root(
328 context_t, level_t, laddr_t, Super::URef&&);
329
330 protected:
331 node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
332 node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override;
333 node_future<search_result_t> lower_bound_tracked(
334 context_t, const key_hobj_t&, MatchHistory&) override;
335 node_future<> do_get_tree_stats(context_t, tree_stats_t&) override;
336
337 node_future<> test_clone_root(context_t, RootNodeTracker&) const override;
338
339 private:
340 // XXX: extract a common tracker for InternalNode to track Node,
341 // and LeafNode to track tree_cursor_t.
342 node_future<Ref<Node>> get_or_track_child(context_t, const search_position_t&, laddr_t);
343 void track_insert(
344 const search_position_t&, match_stage_t, Ref<Node>, Ref<Node> nxt_child = nullptr);
345 void replace_track(const search_position_t&, Ref<Node> new_child, Ref<Node> old_child);
346 void track_split(const search_position_t&, Ref<InternalNode>);
347 void validate_tracked_children() const {
348 #ifndef NDEBUG
349 for (auto& kv : tracked_child_nodes) {
350 assert(kv.first == kv.second->parent_info().position);
351 validate_child(*kv.second);
352 }
353 #endif
354 }
355 void validate_child(const Node& child) const;
356
357 struct fresh_node_t {
358 Ref<InternalNode> node;
359 NodeExtentMutable mut;
360 std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
361 return std::make_pair(Ref<Node>(node), mut);
362 }
363 };
364 static node_future<fresh_node_t> allocate(context_t, field_type_t, bool, level_t);
365
366 private:
367 /**
368 * Reversed resource management (InternalNode)
369 *
370 * InteralNode keeps track of its child nodes which are still alive in
371 * memory, and their positions will be updated throughout
372 * insert/split/delete/merge operations of this node.
373 */
374 // XXX: leverage intrusive data structure to control memory overhead
375 std::map<search_position_t, Node*> tracked_child_nodes;
376 InternalNodeImpl* impl;
377 };
378
379 /**
380 * LeafNode
381 *
382 * A concrete implementation of Node class that represents a leaf tree node.
383 * Its level is always 0. A leaf node can only be empty if it is root.
384 */
385 class LeafNode final : public Node {
386 public:
387 // public to tree_cursor_t
388 ~LeafNode() override { assert(tracked_cursors.empty()); }
389 LeafNode(const LeafNode&) = delete;
390 LeafNode(LeafNode&&) = delete;
391 LeafNode& operator=(const LeafNode&) = delete;
392 LeafNode& operator=(LeafNode&&) = delete;
393
394 bool is_level_tail() const;
395 layout_version_t get_layout_version() const { return layout_version; }
396 std::tuple<key_view_t, const onode_t*, layout_version_t> get_kv(
397 const search_position_t&) const;
398 template <bool VALIDATE>
399 void do_track_cursor(tree_cursor_t& cursor) {
400 if constexpr (VALIDATE) {
401 validate_cursor(cursor);
402 }
403 auto& cursor_pos = cursor.get_position();
404 assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end());
405 tracked_cursors[cursor_pos] = &cursor;
406 }
407 void do_untrack_cursor(tree_cursor_t& cursor) {
408 validate_cursor(cursor);
409 auto& cursor_pos = cursor.get_position();
410 assert(tracked_cursors.find(cursor_pos)->second == &cursor);
411 [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos);
412 assert(removed);
413 }
414
415 protected:
416 node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override;
417 node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override;
418 node_future<search_result_t> lower_bound_tracked(
419 context_t, const key_hobj_t&, MatchHistory&) override;
420 node_future<> do_get_tree_stats(context_t, tree_stats_t&) override;
421
422 node_future<> test_clone_root(context_t, RootNodeTracker&) const override;
423
424 private:
425 LeafNode(LeafNodeImpl*, NodeImplURef&&);
426 node_future<Ref<tree_cursor_t>> insert_value(
427 context_t, const key_hobj_t&, const onode_t&,
428 const search_position_t&, const MatchHistory&,
429 match_stat_t mstat);
430 static node_future<Ref<LeafNode>> allocate_root(context_t, RootNodeTracker&);
431 friend class Node;
432
433 private:
434 // XXX: extract a common tracker for InternalNode to track Node,
435 // and LeafNode to track tree_cursor_t.
436 Ref<tree_cursor_t> get_or_track_cursor(
437 const search_position_t&, const key_view_t&, const onode_t*);
438 Ref<tree_cursor_t> track_insert(
439 const search_position_t&, match_stage_t, const onode_t*);
440 void track_split(const search_position_t&, Ref<LeafNode>);
441 void validate_tracked_cursors() const {
442 #ifndef NDEBUG
443 for (auto& kv : tracked_cursors) {
444 assert(kv.first == kv.second->get_position());
445 validate_cursor(*kv.second);
446 }
447 #endif
448 }
449 void validate_cursor(tree_cursor_t& cursor) const;
450 // invalidate p_value pointers in tree_cursor_t
451 void on_layout_change() { ++layout_version; }
452
453 struct fresh_node_t {
454 Ref<LeafNode> node;
455 NodeExtentMutable mut;
456 std::pair<Ref<Node>, NodeExtentMutable> make_pair() {
457 return std::make_pair(Ref<Node>(node), mut);
458 }
459 };
460 static node_future<fresh_node_t> allocate(context_t, field_type_t, bool);
461
462 private:
463 /**
464 * Reversed resource management (LeafNode)
465 *
466 * LeafNode keeps track of the referencing cursors which are still alive in
467 * memory, and their positions will be updated throughout
468 * insert/split/delete/merge operations of this node.
469 */
470 // XXX: leverage intrusive data structure to control memory overhead
471 std::map<search_position_t, tree_cursor_t*> tracked_cursors;
472 LeafNodeImpl* impl;
473 layout_version_t layout_version = 0;
474 };
475
476 }