1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
#include <cstring>

#include "include/buffer.h"

#include "crimson/common/fixed_kv_node_layout.h"
#include "crimson/common/errorator.h"
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/btree/btree_range_pin.h"
#include "crimson/os/seastore/btree/fixed_kv_btree.h"
#include "crimson/os/seastore/root_block.h"
23 namespace crimson::os::seastore
{
28 * Base class enabling recursive lookup between internal and leaf nodes.
30 template <typename node_key_t
>
31 struct FixedKVNode
: ChildableCachedExtent
{
32 using FixedKVNodeRef
= TCachedExtentRef
<FixedKVNode
>;
33 fixed_kv_node_meta_t
<node_key_t
> range
;
35 struct copy_source_cmp_t
{
36 using is_transparent
= node_key_t
;
37 bool operator()(const FixedKVNodeRef
&l
, const FixedKVNodeRef
&r
) const {
38 assert(l
->range
.end
<= r
->range
.begin
39 || r
->range
.end
<= l
->range
.begin
40 || (l
->range
.begin
== r
->range
.begin
41 && l
->range
.end
== r
->range
.end
));
42 return l
->range
.begin
< r
->range
.begin
;
44 bool operator()(const node_key_t
&l
, const FixedKVNodeRef
&r
) const {
45 return l
< r
->range
.begin
;
47 bool operator()(const FixedKVNodeRef
&l
, const node_key_t
&r
) const {
48 return l
->range
.begin
< r
;
54 * Nodes of fixed-kv-btree connect to their child nodes by pointers following
57 * 1. if nodes are stable:
58 * a. parent points at the node's stable parent
59 * b. prior_instance is empty
60 * c. child pointers point at stable children. Child resolution is done
61 * directly via this array.
62 * c. copy_sources is empty
63 * 2. if nodes are mutation_pending:
64 * a. parent is empty and needs to be fixed upon commit
65 * b. prior_instance points to its stable version
66 * c. child pointers are null except for initial_pending() children of
67 * this transaction. Child resolution is done by first checking this
68 * array, and then recursively resolving via the parent. We copy child
69 * pointers from parent on commit.
70 * c. copy_sources is empty
71 * 3. if nodes are initial_pending
72 * a. parent points at its pending parent on this transaction (must exist)
73 * b. prior_instance is empty or, if it's the result of rewrite, points to
74 * its stable predecessor
75 * c. child pointers are null except for initial_pending() children of
76 * this transaction (live due to 3a below). Child resolution is done
77 * by first checking this array, and then recursively resolving via
78 * the correct copy_sources entry. We copy child pointers from copy_sources
80 * d. copy_sources contains the set of stable nodes at the same tree-level(only
81 * its "prior_instance" if the node is the result of a rewrite), with which
82 * the lba range of this node overlaps.
84 std::vector
<ChildableCachedExtent
*> children
;
85 std::set
<FixedKVNodeRef
, copy_source_cmp_t
> copy_sources
;
86 uint16_t capacity
= 0;
87 parent_tracker_t
* my_tracker
= nullptr;
88 RootBlockRef root_block
;
91 assert(!has_parent_tracker() || !(bool)root_block
);
92 return (bool)has_parent_tracker() || (bool)root_block
;
95 FixedKVNode(uint16_t capacity
, ceph::bufferptr
&&ptr
)
96 : ChildableCachedExtent(std::move(ptr
)),
97 children(capacity
, nullptr),
99 FixedKVNode(const FixedKVNode
&rhs
)
100 : ChildableCachedExtent(rhs
),
102 children(rhs
.capacity
, nullptr),
103 capacity(rhs
.capacity
) {}
105 virtual fixed_kv_node_meta_t
<node_key_t
> get_node_meta() const = 0;
106 virtual uint16_t get_node_size() const = 0;
108 virtual ~FixedKVNode() = default;
109 virtual node_key_t
get_key_from_idx(uint16_t idx
) const = 0;
111 template<typename iter_t
>
112 void update_child_ptr(iter_t iter
, ChildableCachedExtent
* child
) {
113 children
[iter
.get_offset()] = child
;
114 set_child_ptracker(child
);
117 virtual bool is_leaf_and_has_children() const = 0;
119 template<typename iter_t
>
120 void insert_child_ptr(iter_t iter
, ChildableCachedExtent
* child
) {
121 auto raw_children
= children
.data();
122 auto offset
= iter
.get_offset();
124 &raw_children
[offset
+ 1],
125 &raw_children
[offset
],
126 (get_node_size() - offset
) * sizeof(ChildableCachedExtent
*));
128 children
[offset
] = child
;
129 set_child_ptracker(child
);
131 // this can only happen when reserving lba spaces
132 ceph_assert(is_leaf_and_has_children());
133 // this is to avoid mistakenly copying pointers from
134 // copy sources when committing this lba node, because
135 // we rely on pointers' "nullness" to avoid copying
136 // pointers for updated values
137 children
[offset
] = RESERVATION_PTR
;
141 template<typename iter_t
>
142 void remove_child_ptr(iter_t iter
) {
143 LOG_PREFIX(FixedKVNode::remove_child_ptr
);
144 auto raw_children
= children
.data();
145 auto offset
= iter
.get_offset();
146 SUBTRACE(seastore_fixedkv_tree
, "trans.{}, pos {}, total size {}, extent {}",
147 this->pending_for_transaction
,
150 (void*)raw_children
[offset
]);
151 // parent tracker of the child being removed will be
152 // reset when the child is invalidated, so no need to
155 &raw_children
[offset
],
156 &raw_children
[offset
+ 1],
157 (get_node_size() - offset
- 1) * sizeof(ChildableCachedExtent
*));
160 FixedKVNode
& get_stable_for_key(node_key_t key
) {
161 ceph_assert(is_pending());
162 if (is_mutation_pending()) {
163 return (FixedKVNode
&)*get_prior_instance();
165 ceph_assert(!copy_sources
.empty());
166 auto it
= copy_sources
.upper_bound(key
);
168 auto ©_source
= *it
;
169 ceph_assert(copy_source
->get_node_meta().is_in_range(key
));
174 static void push_copy_sources(
178 ceph_assert(dest
.is_initial_pending());
179 if (!src
.is_pending()) {
180 dest
.copy_sources
.emplace(&src
);
181 } else if (src
.is_mutation_pending()) {
182 dest
.copy_sources
.emplace(
183 src
.get_prior_instance()->template cast
<FixedKVNode
>());
185 ceph_assert(src
.is_initial_pending());
186 dest
.copy_sources
.insert(
187 src
.copy_sources
.begin(),
188 src
.copy_sources
.end());
192 virtual uint16_t get_node_split_pivot() = 0;
194 static void move_child_ptrs(
202 dest
.children
.data() + dest_start
,
203 src
.children
.data() + src_start
,
204 (src_end
- src_start
) * sizeof(ChildableCachedExtent
*));
206 ceph_assert(src_start
< src_end
);
207 ceph_assert(src
.children
.size() >= src_end
);
208 for (auto it
= src
.children
.begin() + src_start
;
209 it
!= src
.children
.begin() + src_end
;
213 if (is_valid_child_ptr(child
)) {
214 dest
.set_child_ptracker(child
);
219 void link_child(ChildableCachedExtent
* child
, uint16_t pos
) {
220 assert(pos
< get_node_size());
222 ceph_assert(!is_pending());
223 ceph_assert(child
->is_valid() && !child
->is_pending());
224 assert(!children
[pos
]);
225 children
[pos
] = child
;
226 set_child_ptracker(child
);
229 virtual get_child_ret_t
<LogicalCachedExtent
>
230 get_logical_child(op_context_t
<node_key_t
> c
, uint16_t pos
) = 0;
232 template <typename T
, typename iter_t
>
233 get_child_ret_t
<T
> get_child(op_context_t
<node_key_t
> c
, iter_t iter
) {
234 auto pos
= iter
.get_offset();
235 assert(children
.capacity());
236 auto child
= children
[pos
];
237 if (is_valid_child_ptr(child
)) {
238 ceph_assert(child
->get_type() == T::TYPE
);
239 return c
.cache
.template get_extent_viewable_by_trans
<T
>(c
.trans
, (T
*)child
);
240 } else if (is_pending()) {
241 auto key
= iter
.get_key();
242 auto &sparent
= get_stable_for_key(key
);
243 auto spos
= sparent
.child_pos_for_key(key
);
244 auto child
= sparent
.children
[spos
];
245 if (is_valid_child_ptr(child
)) {
246 ceph_assert(child
->get_type() == T::TYPE
);
247 return c
.cache
.template get_extent_viewable_by_trans
<T
>(c
.trans
, (T
*)child
);
249 return child_pos_t(&sparent
, spos
);
252 return child_pos_t(this, pos
);
256 void split_child_ptrs(
260 assert(!left
.my_tracker
);
261 assert(!right
.my_tracker
);
262 push_copy_sources(left
, *this);
263 push_copy_sources(right
, *this);
265 uint16_t pivot
= get_node_split_pivot();
266 move_child_ptrs(left
, *this, 0, 0, pivot
);
267 move_child_ptrs(right
, *this, 0, pivot
, get_node_size());
268 my_tracker
= nullptr;
272 void merge_child_ptrs(
276 ceph_assert(!my_tracker
);
277 push_copy_sources(*this, left
);
278 push_copy_sources(*this, right
);
280 if (left
.is_pending()) {
281 move_child_ptrs(*this, left
, 0, 0, left
.get_node_size());
282 left
.my_tracker
= nullptr;
285 if (right
.is_pending()) {
286 move_child_ptrs(*this, right
, left
.get_node_size(), 0, right
.get_node_size());
287 right
.my_tracker
= nullptr;
291 static void balance_child_ptrs(
295 FixedKVNode
&replacement_left
,
296 FixedKVNode
&replacement_right
)
298 size_t l_size
= left
.get_node_size();
299 size_t r_size
= right
.get_node_size();
300 size_t total
= l_size
+ r_size
;
301 size_t pivot_idx
= (l_size
+ r_size
) / 2;
302 if (total
% 2 && prefer_left
) {
306 assert(!replacement_left
.my_tracker
);
307 assert(!replacement_right
.my_tracker
);
308 if (pivot_idx
< l_size
) {
310 push_copy_sources(replacement_left
, left
);
311 push_copy_sources(replacement_right
, left
);
312 if (left
.is_pending()) {
313 move_child_ptrs(replacement_left
, left
, 0, 0, pivot_idx
);
314 move_child_ptrs(replacement_right
, left
, 0, pivot_idx
, l_size
);
315 left
.my_tracker
= nullptr;
319 push_copy_sources(replacement_right
, right
);
320 if (right
.is_pending()) {
321 move_child_ptrs(replacement_right
, right
, l_size
- pivot_idx
, 0, r_size
);
322 right
.my_tracker
= nullptr;
326 push_copy_sources(replacement_left
, left
);
327 if (left
.is_pending()) {
328 move_child_ptrs(replacement_left
, left
, 0, 0, l_size
);
329 left
.my_tracker
= nullptr;
333 push_copy_sources(replacement_left
, right
);
334 push_copy_sources(replacement_right
, right
);
335 if (right
.is_pending()) {
336 move_child_ptrs(replacement_left
, right
, l_size
, 0, pivot_idx
- l_size
);
337 move_child_ptrs(replacement_right
, right
, 0, pivot_idx
- l_size
, r_size
);
338 right
.my_tracker
= nullptr;
343 void set_parent_tracker_from_prior_instance() {
344 assert(is_mutation_pending());
345 auto &prior
= (FixedKVNode
&)(*get_prior_instance());
346 if (range
.is_root()) {
347 ceph_assert(prior
.root_block
);
348 ceph_assert(pending_for_transaction
);
349 root_block
= prior
.root_block
;
350 link_phy_tree_root_node(root_block
, this);
353 ceph_assert(!root_block
);
354 take_prior_parent_tracker();
355 assert(is_parent_valid());
356 auto parent
= get_parent_node
<FixedKVNode
>();
357 //TODO: can this search be avoided?
358 auto off
= parent
->lower_bound_offset(get_node_meta().begin
);
359 assert(parent
->get_key_from_idx(off
) == get_node_meta().begin
);
360 parent
->children
[off
] = this;
363 bool is_children_empty() const {
364 for (auto it
= children
.begin();
365 it
!= children
.begin() + get_node_size();
367 if (is_valid_child_ptr(*it
)
368 && (*it
)->is_valid()) {
375 void set_children_from_prior_instance() {
376 assert(get_prior_instance());
377 auto &prior
= (FixedKVNode
&)(*get_prior_instance());
378 assert(prior
.my_tracker
|| prior
.is_children_empty());
380 if (prior
.my_tracker
) {
381 prior
.my_tracker
->reset_parent(this);
382 my_tracker
= prior
.my_tracker
;
383 // All my initial pending children is pointing to the original
384 // tracker which has been dropped by the above line, so need
385 // to adjust them to point to the new tracker
386 adjust_ptracker_for_children();
388 assert(my_tracker
|| is_children_empty());
391 void adjust_ptracker_for_children() {
392 auto begin
= children
.begin();
393 auto end
= begin
+ get_node_size();
394 ceph_assert(end
<= children
.end());
395 for (auto it
= begin
; it
!= end
; it
++) {
397 if (is_valid_child_ptr(child
)) {
398 set_child_ptracker(child
);
403 void on_delta_write(paddr_t record_block_offset
) final
{
404 // All in-memory relative addrs are necessarily record-relative
405 assert(get_prior_instance());
406 assert(pending_for_transaction
);
407 resolve_relative_addrs(record_block_offset
);
410 virtual uint16_t lower_bound_offset(node_key_t
) const = 0;
411 virtual uint16_t upper_bound_offset(node_key_t
) const = 0;
412 virtual uint16_t child_pos_for_key(node_key_t
) const = 0;
414 virtual bool validate_stable_children() = 0;
416 template<typename iter_t
>
417 uint16_t copy_children_from_stable_source(
419 iter_t foreign_start_it
,
420 iter_t foreign_end_it
,
421 iter_t local_start_it
) {
422 auto foreign_it
= foreign_start_it
, local_it
= local_start_it
;
423 while (foreign_it
!= foreign_end_it
424 && local_it
.get_offset() < get_node_size())
426 auto &child
= children
[local_it
.get_offset()];
427 if (foreign_it
.get_key() == local_it
.get_key()) {
428 // the foreign key is preserved
430 child
= source
.children
[foreign_it
.get_offset()];
434 } else if (foreign_it
.get_key() < local_it
.get_key()) {
435 // the foreign key has been removed, because, if it hasn't,
436 // there must have been a local key before the one pointed
437 // by the current "local_it" that's equal to this foreign key
438 // and has pushed the foreign_it forward.
441 // the local key must be a newly inserted one.
445 return local_it
.get_offset();
448 template<typename Func
>
449 void copy_children_from_stable_sources(Func
&&get_iter
) {
450 if (!copy_sources
.empty()) {
451 auto it
= --copy_sources
.upper_bound(get_node_meta().begin
);
453 uint16_t start_pos
= cs
->lower_bound_offset(
454 get_node_meta().begin
);
455 if (start_pos
== cs
->get_node_size()) {
459 uint16_t local_next_pos
= 0;
460 for (; it
!= copy_sources
.end(); it
++) {
461 auto& copy_source
= *it
;
462 auto end_pos
= copy_source
->get_node_size();
463 if (copy_source
->get_node_meta().is_in_range(get_node_meta().end
)) {
464 end_pos
= copy_source
->upper_bound_offset(get_node_meta().end
);
466 auto local_start_iter
= get_iter(*this, local_next_pos
);
467 auto foreign_start_iter
= get_iter(*copy_source
, start_pos
);
468 auto foreign_end_iter
= get_iter(*copy_source
, end_pos
);
469 local_next_pos
= copy_children_from_stable_source(
470 *copy_source
, foreign_start_iter
, foreign_end_iter
, local_start_iter
);
471 if (end_pos
!= copy_source
->get_node_size()) {
479 void on_invalidated(Transaction
&t
) final
{
480 reset_parent_tracker();
484 return is_initial_pending() && get_prior_instance();
487 void on_initial_write() final
{
488 // All in-memory relative addrs are necessarily block-relative
489 resolve_relative_addrs(get_paddr());
490 if (range
.is_root()) {
491 reset_parent_tracker();
493 assert(has_parent_tracker() ? (is_parent_valid()) : true);
496 void set_child_ptracker(ChildableCachedExtent
*child
) {
497 if (!this->my_tracker
) {
498 this->my_tracker
= new parent_tracker_t(this);
500 child
->reset_parent_tracker(this->my_tracker
);
503 void on_clean_read() final
{
504 // From initial write of block, relative addrs are necessarily block-relative
505 resolve_relative_addrs(get_paddr());
508 virtual void resolve_relative_addrs(paddr_t base
) = 0;
512 * FixedKVInternalNode
514 * Abstracts operations on and layout of internal nodes for the
520 typename NODE_KEY_LE
,
522 typename node_type_t
>
523 struct FixedKVInternalNode
524 : FixedKVNode
<NODE_KEY
>,
525 common::FixedKVNodeLayout
<
527 fixed_kv_node_meta_t
<NODE_KEY
>,
528 fixed_kv_node_meta_le_t
<NODE_KEY_LE
>,
529 NODE_KEY
, NODE_KEY_LE
,
530 paddr_t
, paddr_le_t
> {
531 using Ref
= TCachedExtentRef
<node_type_t
>;
532 using base_t
= FixedKVNode
<NODE_KEY
>;
533 using base_ref
= typename FixedKVNode
<NODE_KEY
>::FixedKVNodeRef
;
534 using node_layout_t
=
535 common::FixedKVNodeLayout
<
537 fixed_kv_node_meta_t
<NODE_KEY
>,
538 fixed_kv_node_meta_le_t
<NODE_KEY_LE
>,
543 using internal_const_iterator_t
= typename
node_layout_t::const_iterator
;
544 using internal_iterator_t
= typename
node_layout_t::iterator
;
545 using this_type_t
= FixedKVInternalNode
<
552 FixedKVInternalNode(ceph::bufferptr
&&ptr
)
553 : FixedKVNode
<NODE_KEY
>(CAPACITY
, std::move(ptr
)),
554 node_layout_t(this->get_bptr().c_str()) {}
555 FixedKVInternalNode(const FixedKVInternalNode
&rhs
)
556 : FixedKVNode
<NODE_KEY
>(rhs
),
557 node_layout_t(this->get_bptr().c_str()) {}
559 bool is_leaf_and_has_children() const final
{
563 uint16_t get_node_split_pivot() final
{
564 return this->get_split_pivot().get_offset();
567 void prepare_write() final
{
568 if (this->is_initial_pending()) {
569 if (this->is_rewrite()) {
570 this->set_children_from_prior_instance();
572 this->copy_children_from_stable_sources(
573 [this](base_t
&node
, uint16_t pos
) {
574 ceph_assert(node
.get_type() == this->get_type());
575 auto &n
= static_cast<this_type_t
&>(node
);
576 return n
.iter_idx(pos
);
579 if (this->is_rewrite()) {
580 this->reset_prior_instance();
582 this->adjust_ptracker_for_children();
584 assert(this->validate_stable_children());
585 this->copy_sources
.clear();
589 get_child_ret_t
<LogicalCachedExtent
>
590 get_logical_child(op_context_t
<NODE_KEY
>, uint16_t pos
) final
{
591 ceph_abort("impossible");
592 return get_child_ret_t
<LogicalCachedExtent
>(child_pos_t(nullptr, 0));
595 bool validate_stable_children() final
{
596 LOG_PREFIX(FixedKVInternalNode::validate_stable_children
);
597 if (this->children
.empty()) {
601 for (auto i
: *this) {
602 auto child
= (FixedKVNode
<NODE_KEY
>*)this->children
[i
.get_offset()];
603 if (child
&& child
->range
.begin
!= i
.get_key()) {
604 SUBERROR(seastore_fixedkv_tree
,
605 "stable child not valid: child {}, child meta{}, key {}",
607 child
->get_node_meta(),
616 virtual ~FixedKVInternalNode() {
617 if (this->is_valid() && !this->is_pending()) {
618 if (this->range
.is_root()) {
619 ceph_assert(this->root_block
);
620 unlink_phy_tree_root_node
<NODE_KEY
>(this->root_block
);
622 ceph_assert(this->is_parent_valid());
623 auto parent
= this->template get_parent_node
<FixedKVNode
<NODE_KEY
>>();
624 auto off
= parent
->lower_bound_offset(this->get_meta().begin
);
625 assert(parent
->get_key_from_idx(off
) == this->get_meta().begin
);
626 assert(parent
->children
[off
] == this);
627 parent
->children
[off
] = nullptr;
632 uint16_t lower_bound_offset(NODE_KEY key
) const final
{
633 return this->lower_bound(key
).get_offset();
636 uint16_t upper_bound_offset(NODE_KEY key
) const final
{
637 return this->upper_bound(key
).get_offset();
640 uint16_t child_pos_for_key(NODE_KEY key
) const final
{
641 auto it
= this->upper_bound(key
);
642 assert(it
!= this->begin());
644 return it
.get_offset();
647 NODE_KEY
get_key_from_idx(uint16_t idx
) const final
{
648 return this->iter_idx(idx
).get_key();
651 fixed_kv_node_meta_t
<NODE_KEY
> get_node_meta() const {
652 return this->get_meta();
655 uint16_t get_node_size() const final
{
656 return this->get_size();
659 typename
node_layout_t::delta_buffer_t delta_buffer
;
660 typename
node_layout_t::delta_buffer_t
*maybe_get_delta_buffer() {
661 return this->is_mutation_pending()
662 ? &delta_buffer
: nullptr;
665 CachedExtentRef
duplicate_for_write(Transaction
&) override
{
666 assert(delta_buffer
.empty());
667 return CachedExtentRef(new node_type_t(*this));
670 void on_replace_prior(Transaction
&) final
{
671 ceph_assert(!this->is_rewrite());
672 this->set_children_from_prior_instance();
673 auto &prior
= (this_type_t
&)(*this->get_prior_instance());
674 auto copied
= this->copy_children_from_stable_source(
679 ceph_assert(copied
<= get_node_size());
680 assert(this->validate_stable_children());
681 this->set_parent_tracker_from_prior_instance();
685 internal_const_iterator_t iter
,
687 FixedKVNode
<NODE_KEY
>* nextent
) {
688 LOG_PREFIX(FixedKVInternalNode::update
);
689 SUBTRACE(seastore_fixedkv_tree
, "trans.{}, pos {}, {}",
690 this->pending_for_transaction
,
693 this->update_child_ptr(iter
, nextent
);
694 return this->journal_update(
696 this->maybe_generate_relative(addr
),
697 maybe_get_delta_buffer());
701 internal_const_iterator_t iter
,
704 FixedKVNode
<NODE_KEY
>* nextent
) {
705 LOG_PREFIX(FixedKVInternalNode::insert
);
706 SUBTRACE(seastore_fixedkv_tree
, "trans.{}, pos {}, key {}, {}",
707 this->pending_for_transaction
,
711 this->insert_child_ptr(iter
, nextent
);
712 return this->journal_insert(
715 this->maybe_generate_relative(addr
),
716 maybe_get_delta_buffer());
719 void remove(internal_const_iterator_t iter
) {
720 LOG_PREFIX(FixedKVInternalNode::remove
);
721 SUBTRACE(seastore_fixedkv_tree
, "trans.{}, pos {}, key {}",
722 this->pending_for_transaction
,
725 this->remove_child_ptr(iter
);
726 return this->journal_remove(
728 maybe_get_delta_buffer());
732 internal_const_iterator_t iter
,
735 FixedKVNode
<NODE_KEY
>* nextent
) {
736 LOG_PREFIX(FixedKVInternalNode::replace
);
737 SUBTRACE(seastore_fixedkv_tree
, "trans.{}, pos {}, old key {}, key {}, {}",
738 this->pending_for_transaction
,
743 this->update_child_ptr(iter
, nextent
);
744 return this->journal_replace(
747 this->maybe_generate_relative(addr
),
748 maybe_get_delta_buffer());
751 std::tuple
<Ref
, Ref
, NODE_KEY
>
752 make_split_children(op_context_t
<NODE_KEY
> c
) {
753 auto left
= c
.cache
.template alloc_new_extent
<node_type_t
>(
754 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
755 auto right
= c
.cache
.template alloc_new_extent
<node_type_t
>(
756 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
757 this->split_child_ptrs(*left
, *right
);
758 auto pivot
= this->split_into(*left
, *right
);
759 left
->range
= left
->get_meta();
760 right
->range
= right
->get_meta();
761 return std::make_tuple(
768 op_context_t
<NODE_KEY
> c
,
770 auto replacement
= c
.cache
.template alloc_new_extent
<node_type_t
>(
771 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
772 replacement
->merge_child_ptrs(*this, *right
);
773 replacement
->merge_from(*this, *right
->template cast
<node_type_t
>());
774 replacement
->range
= replacement
->get_meta();
778 std::tuple
<Ref
, Ref
, NODE_KEY
>
780 op_context_t
<NODE_KEY
> c
,
783 ceph_assert(_right
->get_type() == this->get_type());
784 auto &right
= *_right
->template cast
<node_type_t
>();
785 auto replacement_left
= c
.cache
.template alloc_new_extent
<node_type_t
>(
786 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
787 auto replacement_right
= c
.cache
.template alloc_new_extent
<node_type_t
>(
788 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
790 auto pivot
= this->balance_into_new_nodes(
796 this->balance_child_ptrs(
803 replacement_left
->range
= replacement_left
->get_meta();
804 replacement_right
->range
= replacement_right
->get_meta();
805 return std::make_tuple(
812 * Internal relative addresses on read or in memory prior to commit
813 * are either record or block relative depending on whether this
814 * physical node is is_initial_pending() or just is_mutable().
816 * User passes appropriate base depending on lifecycle and
817 * resolve_relative_addrs fixes up relative internal references
820 void resolve_relative_addrs(paddr_t base
)
822 LOG_PREFIX(FixedKVInternalNode::resolve_relative_addrs
);
823 for (auto i
: *this) {
824 if (i
->get_val().is_relative()) {
825 auto updated
= base
.add_relative(i
->get_val());
826 SUBTRACE(seastore_fixedkv_tree
, "{} -> {}", i
->get_val(), updated
);
832 void node_resolve_vals(
833 internal_iterator_t from
,
834 internal_iterator_t to
) const {
835 if (this->is_initial_pending()) {
836 for (auto i
= from
; i
!= to
; ++i
) {
837 if (i
->get_val().is_relative()) {
838 assert(i
->get_val().is_block_relative());
839 i
->set_val(this->get_paddr().add_relative(i
->get_val()));
844 void node_unresolve_vals(
845 internal_iterator_t from
,
846 internal_iterator_t to
) const {
847 if (this->is_initial_pending()) {
848 for (auto i
= from
; i
!= to
; ++i
) {
849 if (i
->get_val().is_relative()) {
850 assert(i
->get_val().is_record_relative());
851 i
->set_val(i
->get_val().block_relative_to(this->get_paddr()));
857 std::ostream
&_print_detail(std::ostream
&out
) const
859 out
<< ", size=" << this->get_size()
860 << ", meta=" << this->get_meta()
861 << ", my_tracker=" << (void*)this->my_tracker
;
862 if (this->my_tracker
) {
863 out
<< ", my_tracker->parent=" << (void*)this->my_tracker
->get_parent().get();
865 return out
<< ", root_block=" << (void*)this->root_block
.get();
868 ceph::bufferlist
get_delta() {
869 ceph::buffer::ptr
bptr(delta_buffer
.get_bytes());
870 delta_buffer
.copy_out(bptr
.c_str(), bptr
.length());
876 void apply_delta_and_adjust_crc(
877 paddr_t base
, const ceph::bufferlist
&_bl
) {
878 assert(_bl
.length());
879 ceph::bufferlist bl
= _bl
;
881 typename
node_layout_t::delta_buffer_t buffer
;
882 buffer
.copy_in(bl
.front().c_str(), bl
.front().length());
883 buffer
.replay(*this);
884 this->set_last_committed_crc(this->get_crc32c());
885 resolve_relative_addrs(base
);
888 constexpr static size_t get_min_capacity() {
889 return (node_layout_t::get_capacity() - 1) / 2;
892 bool at_max_capacity() const {
893 assert(this->get_size() <= node_layout_t::get_capacity());
894 return this->get_size() == node_layout_t::get_capacity();
897 bool at_min_capacity() const {
898 assert(this->get_size() >= (get_min_capacity() - 1));
899 return this->get_size() <= get_min_capacity();
902 bool below_min_capacity() const {
903 assert(this->get_size() >= (get_min_capacity() - 1));
904 return this->get_size() < get_min_capacity();
911 typename NODE_KEY_LE
,
915 typename node_type_t
,
917 struct FixedKVLeafNode
918 : FixedKVNode
<NODE_KEY
>,
919 common::FixedKVNodeLayout
<
921 fixed_kv_node_meta_t
<NODE_KEY
>,
922 fixed_kv_node_meta_le_t
<NODE_KEY_LE
>,
923 NODE_KEY
, NODE_KEY_LE
,
925 using Ref
= TCachedExtentRef
<node_type_t
>;
926 using node_layout_t
=
927 common::FixedKVNodeLayout
<
929 fixed_kv_node_meta_t
<NODE_KEY
>,
930 fixed_kv_node_meta_le_t
<NODE_KEY_LE
>,
935 using internal_const_iterator_t
= typename
node_layout_t::const_iterator
;
936 using this_type_t
= FixedKVLeafNode
<
945 using base_t
= FixedKVNode
<NODE_KEY
>;
946 FixedKVLeafNode(ceph::bufferptr
&&ptr
)
947 : FixedKVNode
<NODE_KEY
>(has_children
? CAPACITY
: 0, std::move(ptr
)),
948 node_layout_t(this->get_bptr().c_str()) {}
949 FixedKVLeafNode(const FixedKVLeafNode
&rhs
)
950 : FixedKVNode
<NODE_KEY
>(rhs
),
951 node_layout_t(this->get_bptr().c_str()) {}
953 static constexpr bool do_has_children
= has_children
;
955 bool is_leaf_and_has_children() const final
{
959 uint16_t get_node_split_pivot() final
{
960 return this->get_split_pivot().get_offset();
963 get_child_ret_t
<LogicalCachedExtent
>
964 get_logical_child(op_context_t
<NODE_KEY
> c
, uint16_t pos
) final
{
965 auto child
= this->children
[pos
];
966 if (is_valid_child_ptr(child
)) {
967 ceph_assert(child
->is_logical());
968 return c
.cache
.template get_extent_viewable_by_trans
<
969 LogicalCachedExtent
>(c
.trans
, (LogicalCachedExtent
*)child
);
970 } else if (this->is_pending()) {
971 auto key
= this->iter_idx(pos
).get_key();
972 auto &sparent
= this->get_stable_for_key(key
);
973 auto spos
= sparent
.child_pos_for_key(key
);
974 auto child
= sparent
.children
[spos
];
975 if (is_valid_child_ptr(child
)) {
976 ceph_assert(child
->is_logical());
977 return c
.cache
.template get_extent_viewable_by_trans
<
978 LogicalCachedExtent
>(c
.trans
, (LogicalCachedExtent
*)child
);
980 return child_pos_t(&sparent
, spos
);
983 return child_pos_t(this, pos
);
987 bool validate_stable_children() override
{
991 virtual ~FixedKVLeafNode() {
992 if (this->is_valid() && !this->is_pending()) {
993 if (this->range
.is_root()) {
994 ceph_assert(this->root_block
);
995 unlink_phy_tree_root_node
<NODE_KEY
>(this->root_block
);
997 ceph_assert(this->is_parent_valid());
998 auto parent
= this->template get_parent_node
<FixedKVNode
<NODE_KEY
>>();
999 auto off
= parent
->lower_bound_offset(this->get_meta().begin
);
1000 assert(parent
->get_key_from_idx(off
) == this->get_meta().begin
);
1001 assert(parent
->children
[off
] == this);
1002 parent
->children
[off
] = nullptr;
1007 void prepare_write() final
{
1008 if constexpr (has_children
) {
1009 if (this->is_initial_pending()) {
1010 if (this->is_rewrite()) {
1011 this->set_children_from_prior_instance();
1013 this->copy_children_from_stable_sources(
1014 [this](base_t
&node
, uint16_t pos
) {
1015 ceph_assert(node
.get_type() == this->get_type());
1016 auto &n
= static_cast<this_type_t
&>(node
);
1017 return n
.iter_idx(pos
);
1020 if (this->is_rewrite()) {
1021 this->reset_prior_instance();
1023 this->adjust_ptracker_for_children();
1025 assert(this->validate_stable_children());
1026 this->copy_sources
.clear();
1029 assert(this->is_initial_pending()
1030 ? this->copy_sources
.empty():
1034 void on_replace_prior(Transaction
&) final
{
1035 ceph_assert(!this->is_rewrite());
1036 if constexpr (has_children
) {
1037 this->set_children_from_prior_instance();
1038 auto &prior
= (this_type_t
&)(*this->get_prior_instance());
1039 auto copied
= this->copy_children_from_stable_source(
1044 ceph_assert(copied
<= get_node_size());
1045 assert(this->validate_stable_children());
1046 this->set_parent_tracker_from_prior_instance();
1048 this->set_parent_tracker_from_prior_instance();
1052 uint16_t lower_bound_offset(NODE_KEY key
) const final
{
1053 return this->lower_bound(key
).get_offset();
1056 uint16_t upper_bound_offset(NODE_KEY key
) const final
{
1057 return this->upper_bound(key
).get_offset();
1060 uint16_t child_pos_for_key(NODE_KEY key
) const final
{
1061 return lower_bound_offset(key
);
1064 NODE_KEY
get_key_from_idx(uint16_t idx
) const final
{
1065 return this->iter_idx(idx
).get_key();
1068 fixed_kv_node_meta_t
<NODE_KEY
> get_node_meta() const {
1069 return this->get_meta();
1072 uint16_t get_node_size() const final
{
1073 return this->get_size();
1076 typename
node_layout_t::delta_buffer_t delta_buffer
;
1077 virtual typename
node_layout_t::delta_buffer_t
*maybe_get_delta_buffer() {
1078 return this->is_mutation_pending() ? &delta_buffer
: nullptr;
1081 CachedExtentRef
duplicate_for_write(Transaction
&) override
{
1082 assert(delta_buffer
.empty());
1083 return CachedExtentRef(new node_type_t(*this));
1086 virtual void update(
1087 internal_const_iterator_t iter
,
1089 LogicalCachedExtent
* nextent
) = 0;
1090 virtual internal_const_iterator_t
insert(
1091 internal_const_iterator_t iter
,
1094 LogicalCachedExtent
* nextent
) = 0;
1095 virtual void remove(internal_const_iterator_t iter
) = 0;
1097 std::tuple
<Ref
, Ref
, NODE_KEY
>
1098 make_split_children(op_context_t
<NODE_KEY
> c
) {
1099 auto left
= c
.cache
.template alloc_new_extent
<node_type_t
>(
1100 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
1101 auto right
= c
.cache
.template alloc_new_extent
<node_type_t
>(
1102 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
1103 if constexpr (has_children
) {
1104 this->split_child_ptrs(*left
, *right
);
1106 auto pivot
= this->split_into(*left
, *right
);
1107 left
->range
= left
->get_meta();
1108 right
->range
= right
->get_meta();
1109 return std::make_tuple(
1115 Ref
make_full_merge(
1116 op_context_t
<NODE_KEY
> c
,
1118 auto replacement
= c
.cache
.template alloc_new_extent
<node_type_t
>(
1119 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
1120 if constexpr (has_children
) {
1121 replacement
->merge_child_ptrs(*this, *right
);
1123 replacement
->merge_from(*this, *right
->template cast
<node_type_t
>());
1124 replacement
->range
= replacement
->get_meta();
1128 std::tuple
<Ref
, Ref
, NODE_KEY
>
1130 op_context_t
<NODE_KEY
> c
,
1133 ceph_assert(_right
->get_type() == this->get_type());
1134 auto &right
= *_right
->template cast
<node_type_t
>();
1135 auto replacement_left
= c
.cache
.template alloc_new_extent
<node_type_t
>(
1136 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
1137 auto replacement_right
= c
.cache
.template alloc_new_extent
<node_type_t
>(
1138 c
.trans
, node_size
, placement_hint_t::HOT
, INIT_GENERATION
);
1140 auto pivot
= this->balance_into_new_nodes(
1145 *replacement_right
);
1146 if constexpr (has_children
) {
1147 this->balance_child_ptrs(
1152 *replacement_right
);
1155 replacement_left
->range
= replacement_left
->get_meta();
1156 replacement_right
->range
= replacement_right
->get_meta();
1157 return std::make_tuple(
1163 ceph::bufferlist
get_delta() {
1164 ceph::buffer::ptr
bptr(delta_buffer
.get_bytes());
1165 delta_buffer
.copy_out(bptr
.c_str(), bptr
.length());
1166 ceph::bufferlist bl
;
1171 void apply_delta_and_adjust_crc(
1172 paddr_t base
, const ceph::bufferlist
&_bl
) {
1173 assert(_bl
.length());
1174 ceph::bufferlist bl
= _bl
;
1176 typename
node_layout_t::delta_buffer_t buffer
;
1177 buffer
.copy_in(bl
.front().c_str(), bl
.front().length());
1178 buffer
.replay(*this);
1179 this->set_last_committed_crc(this->get_crc32c());
1180 this->resolve_relative_addrs(base
);
1183 std::ostream
&_print_detail(std::ostream
&out
) const
1185 return out
<< ", size=" << this->get_size()
1186 << ", meta=" << this->get_meta();
1189 constexpr static size_t get_min_capacity() {
1190 return (node_layout_t::get_capacity() - 1) / 2;
1193 bool at_max_capacity() const {
1194 assert(this->get_size() <= node_layout_t::get_capacity());
1195 return this->get_size() == node_layout_t::get_capacity();
1198 bool at_min_capacity() const {
1199 assert(this->get_size() >= (get_min_capacity() - 1));
1200 return this->get_size() <= get_min_capacity();
1203 bool below_min_capacity() const {
1204 assert(this->get_size() >= (get_min_capacity() - 1));
1205 return this->get_size() < get_min_capacity();
1209 } // namespace crimson::os::seastore
1211 #if FMT_VERSION >= 90000
1213 struct fmt::formatter
<
1214 crimson::os::seastore::FixedKVNode
<
1215 crimson::os::seastore::laddr_t
>> : fmt::ostream_formatter
{};
1217 struct fmt::formatter
<
1218 crimson::os::seastore::FixedKVNode
<
1219 crimson::os::seastore::paddr_t
>> : fmt::ostream_formatter
{};