]> git.proxmox.com Git - ceph.git/blob - ceph/src/crimson/os/seastore/btree/fixed_kv_node.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crimson / os / seastore / btree / fixed_kv_node.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #pragma once
5
6 #include <sys/mman.h>
7 #include <memory>
8 #include <string.h>
9
10
11 #include "include/buffer.h"
12
13 #include "crimson/common/fixed_kv_node_layout.h"
14 #include "crimson/common/errorator.h"
15 #include "crimson/os/seastore/seastore_types.h"
16 #include "crimson/os/seastore/cache.h"
17 #include "crimson/os/seastore/cached_extent.h"
18
19 #include "crimson/os/seastore/btree/btree_range_pin.h"
20 #include "crimson/os/seastore/btree/fixed_kv_btree.h"
21 #include "crimson/os/seastore/root_block.h"
22
23 namespace crimson::os::seastore {
24
25 /**
26 * FixedKVNode
27 *
28 * Base class enabling recursive lookup between internal and leaf nodes.
29 */
30 template <typename node_key_t>
31 struct FixedKVNode : ChildableCachedExtent {
32 using FixedKVNodeRef = TCachedExtentRef<FixedKVNode>;
33 fixed_kv_node_meta_t<node_key_t> range;
34
35 struct copy_source_cmp_t {
36 using is_transparent = node_key_t;
37 bool operator()(const FixedKVNodeRef &l, const FixedKVNodeRef &r) const {
38 assert(l->range.end <= r->range.begin
39 || r->range.end <= l->range.begin
40 || (l->range.begin == r->range.begin
41 && l->range.end == r->range.end));
42 return l->range.begin < r->range.begin;
43 }
44 bool operator()(const node_key_t &l, const FixedKVNodeRef &r) const {
45 return l < r->range.begin;
46 }
47 bool operator()(const FixedKVNodeRef &l, const node_key_t &r) const {
48 return l->range.begin < r;
49 }
50 };
51
52 /*
53 *
54 * Nodes of fixed-kv-btree connect to their child nodes by pointers following
55 * invariants below:
56 *
57 * 1. if nodes are stable:
58 * a. parent points at the node's stable parent
59 * b. prior_instance is empty
60 * c. child pointers point at stable children. Child resolution is done
61 * directly via this array.
62 * c. copy_sources is empty
63 * 2. if nodes are mutation_pending:
64 * a. parent is empty and needs to be fixed upon commit
65 * b. prior_instance points to its stable version
66 * c. child pointers are null except for initial_pending() children of
67 * this transaction. Child resolution is done by first checking this
68 * array, and then recursively resolving via the parent. We copy child
69 * pointers from parent on commit.
70 * c. copy_sources is empty
71 * 3. if nodes are initial_pending
72 * a. parent points at its pending parent on this transaction (must exist)
73 * b. prior_instance is empty or, if it's the result of rewrite, points to
74 * its stable predecessor
75 * c. child pointers are null except for initial_pending() children of
76 * this transaction (live due to 3a below). Child resolution is done
77 * by first checking this array, and then recursively resolving via
78 * the correct copy_sources entry. We copy child pointers from copy_sources
79 * on commit.
80 * d. copy_sources contains the set of stable nodes at the same tree-level(only
81 * its "prior_instance" if the node is the result of a rewrite), with which
82 * the lba range of this node overlaps.
83 */
84 std::vector<ChildableCachedExtent*> children;
85 std::set<FixedKVNodeRef, copy_source_cmp_t> copy_sources;
86 uint16_t capacity = 0;
87 parent_tracker_t* my_tracker = nullptr;
88 RootBlockRef root_block;
89
90 bool is_linked() {
91 assert(!has_parent_tracker() || !(bool)root_block);
92 return (bool)has_parent_tracker() || (bool)root_block;
93 }
94
95 FixedKVNode(uint16_t capacity, ceph::bufferptr &&ptr)
96 : ChildableCachedExtent(std::move(ptr)),
97 children(capacity, nullptr),
98 capacity(capacity) {}
99 FixedKVNode(const FixedKVNode &rhs)
100 : ChildableCachedExtent(rhs),
101 range(rhs.range),
102 children(rhs.capacity, nullptr),
103 capacity(rhs.capacity) {}
104
105 virtual fixed_kv_node_meta_t<node_key_t> get_node_meta() const = 0;
106 virtual uint16_t get_node_size() const = 0;
107
108 virtual ~FixedKVNode() = default;
109 virtual node_key_t get_key_from_idx(uint16_t idx) const = 0;
110
111 template<typename iter_t>
112 void update_child_ptr(iter_t iter, ChildableCachedExtent* child) {
113 children[iter.get_offset()] = child;
114 set_child_ptracker(child);
115 }
116
117 virtual bool is_leaf_and_has_children() const = 0;
118
119 template<typename iter_t>
120 void insert_child_ptr(iter_t iter, ChildableCachedExtent* child) {
121 auto raw_children = children.data();
122 auto offset = iter.get_offset();
123 std::memmove(
124 &raw_children[offset + 1],
125 &raw_children[offset],
126 (get_node_size() - offset) * sizeof(ChildableCachedExtent*));
127 if (child) {
128 children[offset] = child;
129 set_child_ptracker(child);
130 } else {
131 // this can only happen when reserving lba spaces
132 ceph_assert(is_leaf_and_has_children());
133 // this is to avoid mistakenly copying pointers from
134 // copy sources when committing this lba node, because
135 // we rely on pointers' "nullness" to avoid copying
136 // pointers for updated values
137 children[offset] = RESERVATION_PTR;
138 }
139 }
140
141 template<typename iter_t>
142 void remove_child_ptr(iter_t iter) {
143 LOG_PREFIX(FixedKVNode::remove_child_ptr);
144 auto raw_children = children.data();
145 auto offset = iter.get_offset();
146 SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, total size {}, extent {}",
147 this->pending_for_transaction,
148 offset,
149 get_node_size(),
150 (void*)raw_children[offset]);
151 // parent tracker of the child being removed will be
152 // reset when the child is invalidated, so no need to
153 // reset it here
154 std::memmove(
155 &raw_children[offset],
156 &raw_children[offset + 1],
157 (get_node_size() - offset - 1) * sizeof(ChildableCachedExtent*));
158 }
159
160 FixedKVNode& get_stable_for_key(node_key_t key) {
161 ceph_assert(is_pending());
162 if (is_mutation_pending()) {
163 return (FixedKVNode&)*get_prior_instance();
164 } else {
165 ceph_assert(!copy_sources.empty());
166 auto it = copy_sources.upper_bound(key);
167 it--;
168 auto &copy_source = *it;
169 ceph_assert(copy_source->get_node_meta().is_in_range(key));
170 return *copy_source;
171 }
172 }
173
174 static void push_copy_sources(
175 FixedKVNode &dest,
176 FixedKVNode &src)
177 {
178 ceph_assert(dest.is_initial_pending());
179 if (!src.is_pending()) {
180 dest.copy_sources.emplace(&src);
181 } else if (src.is_mutation_pending()) {
182 dest.copy_sources.emplace(
183 src.get_prior_instance()->template cast<FixedKVNode>());
184 } else {
185 ceph_assert(src.is_initial_pending());
186 dest.copy_sources.insert(
187 src.copy_sources.begin(),
188 src.copy_sources.end());
189 }
190 }
191
192 virtual uint16_t get_node_split_pivot() = 0;
193
194 static void move_child_ptrs(
195 FixedKVNode &dest,
196 FixedKVNode &src,
197 size_t dest_start,
198 size_t src_start,
199 size_t src_end)
200 {
201 std::memmove(
202 dest.children.data() + dest_start,
203 src.children.data() + src_start,
204 (src_end - src_start) * sizeof(ChildableCachedExtent*));
205
206 ceph_assert(src_start < src_end);
207 ceph_assert(src.children.size() >= src_end);
208 for (auto it = src.children.begin() + src_start;
209 it != src.children.begin() + src_end;
210 it++)
211 {
212 auto child = *it;
213 if (is_valid_child_ptr(child)) {
214 dest.set_child_ptracker(child);
215 }
216 }
217 }
218
219 void link_child(ChildableCachedExtent* child, uint16_t pos) {
220 assert(pos < get_node_size());
221 assert(child);
222 ceph_assert(!is_pending());
223 ceph_assert(child->is_valid() && !child->is_pending());
224 assert(!children[pos]);
225 children[pos] = child;
226 set_child_ptracker(child);
227 }
228
229 virtual get_child_ret_t<LogicalCachedExtent>
230 get_logical_child(op_context_t<node_key_t> c, uint16_t pos) = 0;
231
232 template <typename T, typename iter_t>
233 get_child_ret_t<T> get_child(op_context_t<node_key_t> c, iter_t iter) {
234 auto pos = iter.get_offset();
235 assert(children.capacity());
236 auto child = children[pos];
237 if (is_valid_child_ptr(child)) {
238 ceph_assert(child->get_type() == T::TYPE);
239 return c.cache.template get_extent_viewable_by_trans<T>(c.trans, (T*)child);
240 } else if (is_pending()) {
241 auto key = iter.get_key();
242 auto &sparent = get_stable_for_key(key);
243 auto spos = sparent.child_pos_for_key(key);
244 auto child = sparent.children[spos];
245 if (is_valid_child_ptr(child)) {
246 ceph_assert(child->get_type() == T::TYPE);
247 return c.cache.template get_extent_viewable_by_trans<T>(c.trans, (T*)child);
248 } else {
249 return child_pos_t(&sparent, spos);
250 }
251 } else {
252 return child_pos_t(this, pos);
253 }
254 }
255
256 void split_child_ptrs(
257 FixedKVNode &left,
258 FixedKVNode &right)
259 {
260 assert(!left.my_tracker);
261 assert(!right.my_tracker);
262 push_copy_sources(left, *this);
263 push_copy_sources(right, *this);
264 if (is_pending()) {
265 uint16_t pivot = get_node_split_pivot();
266 move_child_ptrs(left, *this, 0, 0, pivot);
267 move_child_ptrs(right, *this, 0, pivot, get_node_size());
268 my_tracker = nullptr;
269 }
270 }
271
272 void merge_child_ptrs(
273 FixedKVNode &left,
274 FixedKVNode &right)
275 {
276 ceph_assert(!my_tracker);
277 push_copy_sources(*this, left);
278 push_copy_sources(*this, right);
279
280 if (left.is_pending()) {
281 move_child_ptrs(*this, left, 0, 0, left.get_node_size());
282 left.my_tracker = nullptr;
283 }
284
285 if (right.is_pending()) {
286 move_child_ptrs(*this, right, left.get_node_size(), 0, right.get_node_size());
287 right.my_tracker = nullptr;
288 }
289 }
290
291 static void balance_child_ptrs(
292 FixedKVNode &left,
293 FixedKVNode &right,
294 bool prefer_left,
295 FixedKVNode &replacement_left,
296 FixedKVNode &replacement_right)
297 {
298 size_t l_size = left.get_node_size();
299 size_t r_size = right.get_node_size();
300 size_t total = l_size + r_size;
301 size_t pivot_idx = (l_size + r_size) / 2;
302 if (total % 2 && prefer_left) {
303 pivot_idx++;
304 }
305
306 assert(!replacement_left.my_tracker);
307 assert(!replacement_right.my_tracker);
308 if (pivot_idx < l_size) {
309 // deal with left
310 push_copy_sources(replacement_left, left);
311 push_copy_sources(replacement_right, left);
312 if (left.is_pending()) {
313 move_child_ptrs(replacement_left, left, 0, 0, pivot_idx);
314 move_child_ptrs(replacement_right, left, 0, pivot_idx, l_size);
315 left.my_tracker = nullptr;
316 }
317
318 // deal with right
319 push_copy_sources(replacement_right, right);
320 if (right.is_pending()) {
321 move_child_ptrs(replacement_right, right, l_size - pivot_idx, 0, r_size);
322 right.my_tracker= nullptr;
323 }
324 } else {
325 // deal with left
326 push_copy_sources(replacement_left, left);
327 if (left.is_pending()) {
328 move_child_ptrs(replacement_left, left, 0, 0, l_size);
329 left.my_tracker = nullptr;
330 }
331
332 // deal with right
333 push_copy_sources(replacement_left, right);
334 push_copy_sources(replacement_right, right);
335 if (right.is_pending()) {
336 move_child_ptrs(replacement_left, right, l_size, 0, pivot_idx - l_size);
337 move_child_ptrs(replacement_right, right, 0, pivot_idx - l_size, r_size);
338 right.my_tracker= nullptr;
339 }
340 }
341 }
342
343 void set_parent_tracker_from_prior_instance() {
344 assert(is_mutation_pending());
345 auto &prior = (FixedKVNode&)(*get_prior_instance());
346 if (range.is_root()) {
347 ceph_assert(prior.root_block);
348 ceph_assert(pending_for_transaction);
349 root_block = prior.root_block;
350 link_phy_tree_root_node(root_block, this);
351 return;
352 }
353 ceph_assert(!root_block);
354 take_prior_parent_tracker();
355 assert(is_parent_valid());
356 auto parent = get_parent_node<FixedKVNode>();
357 //TODO: can this search be avoided?
358 auto off = parent->lower_bound_offset(get_node_meta().begin);
359 assert(parent->get_key_from_idx(off) == get_node_meta().begin);
360 parent->children[off] = this;
361 }
362
363 bool is_children_empty() const {
364 for (auto it = children.begin();
365 it != children.begin() + get_node_size();
366 it++) {
367 if (is_valid_child_ptr(*it)
368 && (*it)->is_valid()) {
369 return false;
370 }
371 }
372 return true;
373 }
374
375 void set_children_from_prior_instance() {
376 assert(get_prior_instance());
377 auto &prior = (FixedKVNode&)(*get_prior_instance());
378 assert(prior.my_tracker || prior.is_children_empty());
379
380 if (prior.my_tracker) {
381 prior.my_tracker->reset_parent(this);
382 my_tracker = prior.my_tracker;
383 // All my initial pending children is pointing to the original
384 // tracker which has been dropped by the above line, so need
385 // to adjust them to point to the new tracker
386 adjust_ptracker_for_children();
387 }
388 assert(my_tracker || is_children_empty());
389 }
390
391 void adjust_ptracker_for_children() {
392 auto begin = children.begin();
393 auto end = begin + get_node_size();
394 ceph_assert(end <= children.end());
395 for (auto it = begin; it != end; it++) {
396 auto child = *it;
397 if (is_valid_child_ptr(child)) {
398 set_child_ptracker(child);
399 }
400 }
401 }
402
403 void on_delta_write(paddr_t record_block_offset) final {
404 // All in-memory relative addrs are necessarily record-relative
405 assert(get_prior_instance());
406 assert(pending_for_transaction);
407 resolve_relative_addrs(record_block_offset);
408 }
409
410 virtual uint16_t lower_bound_offset(node_key_t) const = 0;
411 virtual uint16_t upper_bound_offset(node_key_t) const = 0;
412 virtual uint16_t child_pos_for_key(node_key_t) const = 0;
413
414 virtual bool validate_stable_children() = 0;
415
416 template<typename iter_t>
417 uint16_t copy_children_from_stable_source(
418 FixedKVNode &source,
419 iter_t foreign_start_it,
420 iter_t foreign_end_it,
421 iter_t local_start_it) {
422 auto foreign_it = foreign_start_it, local_it = local_start_it;
423 while (foreign_it != foreign_end_it
424 && local_it.get_offset() < get_node_size())
425 {
426 auto &child = children[local_it.get_offset()];
427 if (foreign_it.get_key() == local_it.get_key()) {
428 // the foreign key is preserved
429 if (!child) {
430 child = source.children[foreign_it.get_offset()];
431 }
432 foreign_it++;
433 local_it++;
434 } else if (foreign_it.get_key() < local_it.get_key()) {
435 // the foreign key has been removed, because, if it hasn't,
436 // there must have been a local key before the one pointed
437 // by the current "local_it" that's equal to this foreign key
438 // and has pushed the foreign_it forward.
439 foreign_it++;
440 } else {
441 // the local key must be a newly inserted one.
442 local_it++;
443 }
444 }
445 return local_it.get_offset();
446 }
447
448 template<typename Func>
449 void copy_children_from_stable_sources(Func &&get_iter) {
450 if (!copy_sources.empty()) {
451 auto it = --copy_sources.upper_bound(get_node_meta().begin);
452 auto &cs = *it;
453 uint16_t start_pos = cs->lower_bound_offset(
454 get_node_meta().begin);
455 if (start_pos == cs->get_node_size()) {
456 it++;
457 start_pos = 0;
458 }
459 uint16_t local_next_pos = 0;
460 for (; it != copy_sources.end(); it++) {
461 auto& copy_source = *it;
462 auto end_pos = copy_source->get_node_size();
463 if (copy_source->get_node_meta().is_in_range(get_node_meta().end)) {
464 end_pos = copy_source->upper_bound_offset(get_node_meta().end);
465 }
466 auto local_start_iter = get_iter(*this, local_next_pos);
467 auto foreign_start_iter = get_iter(*copy_source, start_pos);
468 auto foreign_end_iter = get_iter(*copy_source, end_pos);
469 local_next_pos = copy_children_from_stable_source(
470 *copy_source, foreign_start_iter, foreign_end_iter, local_start_iter);
471 if (end_pos != copy_source->get_node_size()) {
472 break;
473 }
474 start_pos = 0;
475 }
476 }
477 }
478
479 void on_invalidated(Transaction &t) final {
480 reset_parent_tracker();
481 }
482
483 bool is_rewrite() {
484 return is_initial_pending() && get_prior_instance();
485 }
486
487 void on_initial_write() final {
488 // All in-memory relative addrs are necessarily block-relative
489 resolve_relative_addrs(get_paddr());
490 if (range.is_root()) {
491 reset_parent_tracker();
492 }
493 assert(has_parent_tracker() ? (is_parent_valid()) : true);
494 }
495
496 void set_child_ptracker(ChildableCachedExtent *child) {
497 if (!this->my_tracker) {
498 this->my_tracker = new parent_tracker_t(this);
499 }
500 child->reset_parent_tracker(this->my_tracker);
501 }
502
503 void on_clean_read() final {
504 // From initial write of block, relative addrs are necessarily block-relative
505 resolve_relative_addrs(get_paddr());
506 }
507
508 virtual void resolve_relative_addrs(paddr_t base) = 0;
509 };
510
511 /**
512 * FixedKVInternalNode
513 *
514 * Abstracts operations on and layout of internal nodes for the
515 * FixedKVBTree.
516 */
517 template <
518 size_t CAPACITY,
519 typename NODE_KEY,
520 typename NODE_KEY_LE,
521 size_t node_size,
522 typename node_type_t>
523 struct FixedKVInternalNode
524 : FixedKVNode<NODE_KEY>,
525 common::FixedKVNodeLayout<
526 CAPACITY,
527 fixed_kv_node_meta_t<NODE_KEY>,
528 fixed_kv_node_meta_le_t<NODE_KEY_LE>,
529 NODE_KEY, NODE_KEY_LE,
530 paddr_t, paddr_le_t> {
531 using Ref = TCachedExtentRef<node_type_t>;
532 using base_t = FixedKVNode<NODE_KEY>;
533 using base_ref = typename FixedKVNode<NODE_KEY>::FixedKVNodeRef;
534 using node_layout_t =
535 common::FixedKVNodeLayout<
536 CAPACITY,
537 fixed_kv_node_meta_t<NODE_KEY>,
538 fixed_kv_node_meta_le_t<NODE_KEY_LE>,
539 NODE_KEY,
540 NODE_KEY_LE,
541 paddr_t,
542 paddr_le_t>;
543 using internal_const_iterator_t = typename node_layout_t::const_iterator;
544 using internal_iterator_t = typename node_layout_t::iterator;
545 using this_type_t = FixedKVInternalNode<
546 CAPACITY,
547 NODE_KEY,
548 NODE_KEY_LE,
549 node_size,
550 node_type_t>;
551
552 FixedKVInternalNode(ceph::bufferptr &&ptr)
553 : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)),
554 node_layout_t(this->get_bptr().c_str()) {}
555 FixedKVInternalNode(const FixedKVInternalNode &rhs)
556 : FixedKVNode<NODE_KEY>(rhs),
557 node_layout_t(this->get_bptr().c_str()) {}
558
559 bool is_leaf_and_has_children() const final {
560 return false;
561 }
562
563 uint16_t get_node_split_pivot() final {
564 return this->get_split_pivot().get_offset();
565 }
566
567 void prepare_write() final {
568 if (this->is_initial_pending()) {
569 if (this->is_rewrite()) {
570 this->set_children_from_prior_instance();
571 }
572 this->copy_children_from_stable_sources(
573 [this](base_t &node, uint16_t pos) {
574 ceph_assert(node.get_type() == this->get_type());
575 auto &n = static_cast<this_type_t&>(node);
576 return n.iter_idx(pos);
577 }
578 );
579 if (this->is_rewrite()) {
580 this->reset_prior_instance();
581 } else {
582 this->adjust_ptracker_for_children();
583 }
584 assert(this->validate_stable_children());
585 this->copy_sources.clear();
586 }
587 }
588
589 get_child_ret_t<LogicalCachedExtent>
590 get_logical_child(op_context_t<NODE_KEY>, uint16_t pos) final {
591 ceph_abort("impossible");
592 return get_child_ret_t<LogicalCachedExtent>(child_pos_t(nullptr, 0));
593 }
594
595 bool validate_stable_children() final {
596 LOG_PREFIX(FixedKVInternalNode::validate_stable_children);
597 if (this->children.empty()) {
598 return false;
599 }
600
601 for (auto i : *this) {
602 auto child = (FixedKVNode<NODE_KEY>*)this->children[i.get_offset()];
603 if (child && child->range.begin != i.get_key()) {
604 SUBERROR(seastore_fixedkv_tree,
605 "stable child not valid: child {}, child meta{}, key {}",
606 *child,
607 child->get_node_meta(),
608 i.get_key());
609 ceph_abort();
610 return false;
611 }
612 }
613 return true;
614 }
615
616 virtual ~FixedKVInternalNode() {
617 if (this->is_valid() && !this->is_pending()) {
618 if (this->range.is_root()) {
619 ceph_assert(this->root_block);
620 unlink_phy_tree_root_node<NODE_KEY>(this->root_block);
621 } else {
622 ceph_assert(this->is_parent_valid());
623 auto parent = this->template get_parent_node<FixedKVNode<NODE_KEY>>();
624 auto off = parent->lower_bound_offset(this->get_meta().begin);
625 assert(parent->get_key_from_idx(off) == this->get_meta().begin);
626 assert(parent->children[off] == this);
627 parent->children[off] = nullptr;
628 }
629 }
630 }
631
632 uint16_t lower_bound_offset(NODE_KEY key) const final {
633 return this->lower_bound(key).get_offset();
634 }
635
636 uint16_t upper_bound_offset(NODE_KEY key) const final {
637 return this->upper_bound(key).get_offset();
638 }
639
640 uint16_t child_pos_for_key(NODE_KEY key) const final {
641 auto it = this->upper_bound(key);
642 assert(it != this->begin());
643 --it;
644 return it.get_offset();
645 }
646
647 NODE_KEY get_key_from_idx(uint16_t idx) const final {
648 return this->iter_idx(idx).get_key();
649 }
650
651 fixed_kv_node_meta_t<NODE_KEY> get_node_meta() const {
652 return this->get_meta();
653 }
654
655 uint16_t get_node_size() const final {
656 return this->get_size();
657 }
658
659 typename node_layout_t::delta_buffer_t delta_buffer;
660 typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() {
661 return this->is_mutation_pending()
662 ? &delta_buffer : nullptr;
663 }
664
665 CachedExtentRef duplicate_for_write(Transaction&) override {
666 assert(delta_buffer.empty());
667 return CachedExtentRef(new node_type_t(*this));
668 };
669
670 void on_replace_prior(Transaction&) final {
671 ceph_assert(!this->is_rewrite());
672 this->set_children_from_prior_instance();
673 auto &prior = (this_type_t&)(*this->get_prior_instance());
674 auto copied = this->copy_children_from_stable_source(
675 prior,
676 prior.begin(),
677 prior.end(),
678 this->begin());
679 ceph_assert(copied <= get_node_size());
680 assert(this->validate_stable_children());
681 this->set_parent_tracker_from_prior_instance();
682 }
683
684 void update(
685 internal_const_iterator_t iter,
686 paddr_t addr,
687 FixedKVNode<NODE_KEY>* nextent) {
688 LOG_PREFIX(FixedKVInternalNode::update);
689 SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, {}",
690 this->pending_for_transaction,
691 iter.get_offset(),
692 *nextent);
693 this->update_child_ptr(iter, nextent);
694 return this->journal_update(
695 iter,
696 this->maybe_generate_relative(addr),
697 maybe_get_delta_buffer());
698 }
699
700 void insert(
701 internal_const_iterator_t iter,
702 NODE_KEY pivot,
703 paddr_t addr,
704 FixedKVNode<NODE_KEY>* nextent) {
705 LOG_PREFIX(FixedKVInternalNode::insert);
706 SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}, {}",
707 this->pending_for_transaction,
708 iter.get_offset(),
709 pivot,
710 *nextent);
711 this->insert_child_ptr(iter, nextent);
712 return this->journal_insert(
713 iter,
714 pivot,
715 this->maybe_generate_relative(addr),
716 maybe_get_delta_buffer());
717 }
718
719 void remove(internal_const_iterator_t iter) {
720 LOG_PREFIX(FixedKVInternalNode::remove);
721 SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}",
722 this->pending_for_transaction,
723 iter.get_offset(),
724 iter.get_key());
725 this->remove_child_ptr(iter);
726 return this->journal_remove(
727 iter,
728 maybe_get_delta_buffer());
729 }
730
731 void replace(
732 internal_const_iterator_t iter,
733 NODE_KEY pivot,
734 paddr_t addr,
735 FixedKVNode<NODE_KEY>* nextent) {
736 LOG_PREFIX(FixedKVInternalNode::replace);
737 SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, old key {}, key {}, {}",
738 this->pending_for_transaction,
739 iter.get_offset(),
740 iter.get_key(),
741 pivot,
742 *nextent);
743 this->update_child_ptr(iter, nextent);
744 return this->journal_replace(
745 iter,
746 pivot,
747 this->maybe_generate_relative(addr),
748 maybe_get_delta_buffer());
749 }
750
751 std::tuple<Ref, Ref, NODE_KEY>
752 make_split_children(op_context_t<NODE_KEY> c) {
753 auto left = c.cache.template alloc_new_extent<node_type_t>(
754 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
755 auto right = c.cache.template alloc_new_extent<node_type_t>(
756 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
757 this->split_child_ptrs(*left, *right);
758 auto pivot = this->split_into(*left, *right);
759 left->range = left->get_meta();
760 right->range = right->get_meta();
761 return std::make_tuple(
762 left,
763 right,
764 pivot);
765 }
766
767 Ref make_full_merge(
768 op_context_t<NODE_KEY> c,
769 Ref &right) {
770 auto replacement = c.cache.template alloc_new_extent<node_type_t>(
771 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
772 replacement->merge_child_ptrs(*this, *right);
773 replacement->merge_from(*this, *right->template cast<node_type_t>());
774 replacement->range = replacement->get_meta();
775 return replacement;
776 }
777
778 std::tuple<Ref, Ref, NODE_KEY>
779 make_balanced(
780 op_context_t<NODE_KEY> c,
781 Ref &_right,
782 bool prefer_left) {
783 ceph_assert(_right->get_type() == this->get_type());
784 auto &right = *_right->template cast<node_type_t>();
785 auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
786 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
787 auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
788 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
789
790 auto pivot = this->balance_into_new_nodes(
791 *this,
792 right,
793 prefer_left,
794 *replacement_left,
795 *replacement_right);
796 this->balance_child_ptrs(
797 *this,
798 right,
799 prefer_left,
800 *replacement_left,
801 *replacement_right);
802
803 replacement_left->range = replacement_left->get_meta();
804 replacement_right->range = replacement_right->get_meta();
805 return std::make_tuple(
806 replacement_left,
807 replacement_right,
808 pivot);
809 }
810
811 /**
812 * Internal relative addresses on read or in memory prior to commit
813 * are either record or block relative depending on whether this
814 * physical node is is_initial_pending() or just is_mutable().
815 *
816 * User passes appropriate base depending on lifecycle and
817 * resolve_relative_addrs fixes up relative internal references
818 * based on base.
819 */
820 void resolve_relative_addrs(paddr_t base)
821 {
822 LOG_PREFIX(FixedKVInternalNode::resolve_relative_addrs);
823 for (auto i: *this) {
824 if (i->get_val().is_relative()) {
825 auto updated = base.add_relative(i->get_val());
826 SUBTRACE(seastore_fixedkv_tree, "{} -> {}", i->get_val(), updated);
827 i->set_val(updated);
828 }
829 }
830 }
831
832 void node_resolve_vals(
833 internal_iterator_t from,
834 internal_iterator_t to) const {
835 if (this->is_initial_pending()) {
836 for (auto i = from; i != to; ++i) {
837 if (i->get_val().is_relative()) {
838 assert(i->get_val().is_block_relative());
839 i->set_val(this->get_paddr().add_relative(i->get_val()));
840 }
841 }
842 }
843 }
844 void node_unresolve_vals(
845 internal_iterator_t from,
846 internal_iterator_t to) const {
847 if (this->is_initial_pending()) {
848 for (auto i = from; i != to; ++i) {
849 if (i->get_val().is_relative()) {
850 assert(i->get_val().is_record_relative());
851 i->set_val(i->get_val().block_relative_to(this->get_paddr()));
852 }
853 }
854 }
855 }
856
857 std::ostream &_print_detail(std::ostream &out) const
858 {
859 out << ", size=" << this->get_size()
860 << ", meta=" << this->get_meta()
861 << ", my_tracker=" << (void*)this->my_tracker;
862 if (this->my_tracker) {
863 out << ", my_tracker->parent=" << (void*)this->my_tracker->get_parent().get();
864 }
865 return out << ", root_block=" << (void*)this->root_block.get();
866 }
867
868 ceph::bufferlist get_delta() {
869 ceph::buffer::ptr bptr(delta_buffer.get_bytes());
870 delta_buffer.copy_out(bptr.c_str(), bptr.length());
871 ceph::bufferlist bl;
872 bl.push_back(bptr);
873 return bl;
874 }
875
876 void apply_delta_and_adjust_crc(
877 paddr_t base, const ceph::bufferlist &_bl) {
878 assert(_bl.length());
879 ceph::bufferlist bl = _bl;
880 bl.rebuild();
881 typename node_layout_t::delta_buffer_t buffer;
882 buffer.copy_in(bl.front().c_str(), bl.front().length());
883 buffer.replay(*this);
884 this->set_last_committed_crc(this->get_crc32c());
885 resolve_relative_addrs(base);
886 }
887
888 constexpr static size_t get_min_capacity() {
889 return (node_layout_t::get_capacity() - 1) / 2;
890 }
891
892 bool at_max_capacity() const {
893 assert(this->get_size() <= node_layout_t::get_capacity());
894 return this->get_size() == node_layout_t::get_capacity();
895 }
896
897 bool at_min_capacity() const {
898 assert(this->get_size() >= (get_min_capacity() - 1));
899 return this->get_size() <= get_min_capacity();
900 }
901
902 bool below_min_capacity() const {
903 assert(this->get_size() >= (get_min_capacity() - 1));
904 return this->get_size() < get_min_capacity();
905 }
906 };
907
908 template <
909 size_t CAPACITY,
910 typename NODE_KEY,
911 typename NODE_KEY_LE,
912 typename VAL,
913 typename VAL_LE,
914 size_t node_size,
915 typename node_type_t,
916 bool has_children>
917 struct FixedKVLeafNode
918 : FixedKVNode<NODE_KEY>,
919 common::FixedKVNodeLayout<
920 CAPACITY,
921 fixed_kv_node_meta_t<NODE_KEY>,
922 fixed_kv_node_meta_le_t<NODE_KEY_LE>,
923 NODE_KEY, NODE_KEY_LE,
924 VAL, VAL_LE> {
925 using Ref = TCachedExtentRef<node_type_t>;
926 using node_layout_t =
927 common::FixedKVNodeLayout<
928 CAPACITY,
929 fixed_kv_node_meta_t<NODE_KEY>,
930 fixed_kv_node_meta_le_t<NODE_KEY_LE>,
931 NODE_KEY,
932 NODE_KEY_LE,
933 VAL,
934 VAL_LE>;
935 using internal_const_iterator_t = typename node_layout_t::const_iterator;
936 using this_type_t = FixedKVLeafNode<
937 CAPACITY,
938 NODE_KEY,
939 NODE_KEY_LE,
940 VAL,
941 VAL_LE,
942 node_size,
943 node_type_t,
944 has_children>;
945 using base_t = FixedKVNode<NODE_KEY>;
946 FixedKVLeafNode(ceph::bufferptr &&ptr)
947 : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)),
948 node_layout_t(this->get_bptr().c_str()) {}
949 FixedKVLeafNode(const FixedKVLeafNode &rhs)
950 : FixedKVNode<NODE_KEY>(rhs),
951 node_layout_t(this->get_bptr().c_str()) {}
952
953 static constexpr bool do_has_children = has_children;
954
955 bool is_leaf_and_has_children() const final {
956 return has_children;
957 }
958
959 uint16_t get_node_split_pivot() final {
960 return this->get_split_pivot().get_offset();
961 }
962
963 get_child_ret_t<LogicalCachedExtent>
964 get_logical_child(op_context_t<NODE_KEY> c, uint16_t pos) final {
965 auto child = this->children[pos];
966 if (is_valid_child_ptr(child)) {
967 ceph_assert(child->is_logical());
968 return c.cache.template get_extent_viewable_by_trans<
969 LogicalCachedExtent>(c.trans, (LogicalCachedExtent*)child);
970 } else if (this->is_pending()) {
971 auto key = this->iter_idx(pos).get_key();
972 auto &sparent = this->get_stable_for_key(key);
973 auto spos = sparent.child_pos_for_key(key);
974 auto child = sparent.children[spos];
975 if (is_valid_child_ptr(child)) {
976 ceph_assert(child->is_logical());
977 return c.cache.template get_extent_viewable_by_trans<
978 LogicalCachedExtent>(c.trans, (LogicalCachedExtent*)child);
979 } else {
980 return child_pos_t(&sparent, spos);
981 }
982 } else {
983 return child_pos_t(this, pos);
984 }
985 }
986
987 bool validate_stable_children() override {
988 return true;
989 }
990
991 virtual ~FixedKVLeafNode() {
992 if (this->is_valid() && !this->is_pending()) {
993 if (this->range.is_root()) {
994 ceph_assert(this->root_block);
995 unlink_phy_tree_root_node<NODE_KEY>(this->root_block);
996 } else {
997 ceph_assert(this->is_parent_valid());
998 auto parent = this->template get_parent_node<FixedKVNode<NODE_KEY>>();
999 auto off = parent->lower_bound_offset(this->get_meta().begin);
1000 assert(parent->get_key_from_idx(off) == this->get_meta().begin);
1001 assert(parent->children[off] == this);
1002 parent->children[off] = nullptr;
1003 }
1004 }
1005 }
1006
1007 void prepare_write() final {
1008 if constexpr (has_children) {
1009 if (this->is_initial_pending()) {
1010 if (this->is_rewrite()) {
1011 this->set_children_from_prior_instance();
1012 }
1013 this->copy_children_from_stable_sources(
1014 [this](base_t &node, uint16_t pos) {
1015 ceph_assert(node.get_type() == this->get_type());
1016 auto &n = static_cast<this_type_t&>(node);
1017 return n.iter_idx(pos);
1018 }
1019 );
1020 if (this->is_rewrite()) {
1021 this->reset_prior_instance();
1022 } else {
1023 this->adjust_ptracker_for_children();
1024 }
1025 assert(this->validate_stable_children());
1026 this->copy_sources.clear();
1027 }
1028 }
1029 assert(this->is_initial_pending()
1030 ? this->copy_sources.empty():
1031 true);
1032 }
1033
1034 void on_replace_prior(Transaction&) final {
1035 ceph_assert(!this->is_rewrite());
1036 if constexpr (has_children) {
1037 this->set_children_from_prior_instance();
1038 auto &prior = (this_type_t&)(*this->get_prior_instance());
1039 auto copied = this->copy_children_from_stable_source(
1040 prior,
1041 prior.begin(),
1042 prior.end(),
1043 this->begin());
1044 ceph_assert(copied <= get_node_size());
1045 assert(this->validate_stable_children());
1046 this->set_parent_tracker_from_prior_instance();
1047 } else {
1048 this->set_parent_tracker_from_prior_instance();
1049 }
1050 }
1051
1052 uint16_t lower_bound_offset(NODE_KEY key) const final {
1053 return this->lower_bound(key).get_offset();
1054 }
1055
1056 uint16_t upper_bound_offset(NODE_KEY key) const final {
1057 return this->upper_bound(key).get_offset();
1058 }
1059
1060 uint16_t child_pos_for_key(NODE_KEY key) const final {
1061 return lower_bound_offset(key);
1062 }
1063
1064 NODE_KEY get_key_from_idx(uint16_t idx) const final {
1065 return this->iter_idx(idx).get_key();
1066 }
1067
1068 fixed_kv_node_meta_t<NODE_KEY> get_node_meta() const {
1069 return this->get_meta();
1070 }
1071
1072 uint16_t get_node_size() const final {
1073 return this->get_size();
1074 }
1075
1076 typename node_layout_t::delta_buffer_t delta_buffer;
1077 virtual typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() {
1078 return this->is_mutation_pending() ? &delta_buffer : nullptr;
1079 }
1080
1081 CachedExtentRef duplicate_for_write(Transaction&) override {
1082 assert(delta_buffer.empty());
1083 return CachedExtentRef(new node_type_t(*this));
1084 };
1085
1086 virtual void update(
1087 internal_const_iterator_t iter,
1088 VAL val,
1089 LogicalCachedExtent* nextent) = 0;
1090 virtual internal_const_iterator_t insert(
1091 internal_const_iterator_t iter,
1092 NODE_KEY addr,
1093 VAL val,
1094 LogicalCachedExtent* nextent) = 0;
1095 virtual void remove(internal_const_iterator_t iter) = 0;
1096
1097 std::tuple<Ref, Ref, NODE_KEY>
1098 make_split_children(op_context_t<NODE_KEY> c) {
1099 auto left = c.cache.template alloc_new_extent<node_type_t>(
1100 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
1101 auto right = c.cache.template alloc_new_extent<node_type_t>(
1102 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
1103 if constexpr (has_children) {
1104 this->split_child_ptrs(*left, *right);
1105 }
1106 auto pivot = this->split_into(*left, *right);
1107 left->range = left->get_meta();
1108 right->range = right->get_meta();
1109 return std::make_tuple(
1110 left,
1111 right,
1112 pivot);
1113 }
1114
1115 Ref make_full_merge(
1116 op_context_t<NODE_KEY> c,
1117 Ref &right) {
1118 auto replacement = c.cache.template alloc_new_extent<node_type_t>(
1119 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
1120 if constexpr (has_children) {
1121 replacement->merge_child_ptrs(*this, *right);
1122 }
1123 replacement->merge_from(*this, *right->template cast<node_type_t>());
1124 replacement->range = replacement->get_meta();
1125 return replacement;
1126 }
1127
1128 std::tuple<Ref, Ref, NODE_KEY>
1129 make_balanced(
1130 op_context_t<NODE_KEY> c,
1131 Ref &_right,
1132 bool prefer_left) {
1133 ceph_assert(_right->get_type() == this->get_type());
1134 auto &right = *_right->template cast<node_type_t>();
1135 auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
1136 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
1137 auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
1138 c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION);
1139
1140 auto pivot = this->balance_into_new_nodes(
1141 *this,
1142 right,
1143 prefer_left,
1144 *replacement_left,
1145 *replacement_right);
1146 if constexpr (has_children) {
1147 this->balance_child_ptrs(
1148 *this,
1149 right,
1150 prefer_left,
1151 *replacement_left,
1152 *replacement_right);
1153 }
1154
1155 replacement_left->range = replacement_left->get_meta();
1156 replacement_right->range = replacement_right->get_meta();
1157 return std::make_tuple(
1158 replacement_left,
1159 replacement_right,
1160 pivot);
1161 }
1162
1163 ceph::bufferlist get_delta() {
1164 ceph::buffer::ptr bptr(delta_buffer.get_bytes());
1165 delta_buffer.copy_out(bptr.c_str(), bptr.length());
1166 ceph::bufferlist bl;
1167 bl.push_back(bptr);
1168 return bl;
1169 }
1170
1171 void apply_delta_and_adjust_crc(
1172 paddr_t base, const ceph::bufferlist &_bl) {
1173 assert(_bl.length());
1174 ceph::bufferlist bl = _bl;
1175 bl.rebuild();
1176 typename node_layout_t::delta_buffer_t buffer;
1177 buffer.copy_in(bl.front().c_str(), bl.front().length());
1178 buffer.replay(*this);
1179 this->set_last_committed_crc(this->get_crc32c());
1180 this->resolve_relative_addrs(base);
1181 }
1182
1183 std::ostream &_print_detail(std::ostream &out) const
1184 {
1185 return out << ", size=" << this->get_size()
1186 << ", meta=" << this->get_meta();
1187 }
1188
1189 constexpr static size_t get_min_capacity() {
1190 return (node_layout_t::get_capacity() - 1) / 2;
1191 }
1192
1193 bool at_max_capacity() const {
1194 assert(this->get_size() <= node_layout_t::get_capacity());
1195 return this->get_size() == node_layout_t::get_capacity();
1196 }
1197
1198 bool at_min_capacity() const {
1199 assert(this->get_size() >= (get_min_capacity() - 1));
1200 return this->get_size() <= get_min_capacity();
1201 }
1202
1203 bool below_min_capacity() const {
1204 assert(this->get_size() >= (get_min_capacity() - 1));
1205 return this->get_size() < get_min_capacity();
1206 }
1207 };
1208
1209 } // namespace crimson::os::seastore
1210
1211 #if FMT_VERSION >= 90000
1212 template <>
1213 struct fmt::formatter<
1214 crimson::os::seastore::FixedKVNode<
1215 crimson::os::seastore::laddr_t>> : fmt::ostream_formatter {};
1216 template <>
1217 struct fmt::formatter<
1218 crimson::os::seastore::FixedKVNode<
1219 crimson::os::seastore::paddr_t>> : fmt::ostream_formatter {};
1220 #endif