// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "crimson/common/log.h"
#include "crimson/os/seastore/logging.h"

#include "crimson/os/seastore/lba_manager/btree/lba_btree.h"

SET_SUBSYS(seastore_lba);

namespace crimson::os::seastore::lba_manager::btree {

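/* mkfs: initialize an empty LBA tree.
 *
 * Allocates a single empty leaf covering the whole logical address space
 * [0, L_ADDR_MAX) at depth 1 and returns an lba_root_t pointing at it.
 */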
LBABtree::mkfs_ret LBABtree::mkfs(op_context_t c)
{
  auto root_leaf = c.cache.alloc_new_extent<LBALeafNode>(
    c.trans,
    LBA_BLOCK_SIZE);
  root_leaf->set_size(0);
  lba_node_meta_t meta{0, L_ADDR_MAX, 1};
  root_leaf->set_meta(meta);
  root_leaf->pin.set_range(meta);
  c.trans.get_lba_tree_stats().depth = 1u;
  return lba_root_t{root_leaf->get_paddr(), 1u};
}

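/* handle_boundary: advance an iterator that has stepped one past the end of
 * its current leaf.
 *
 * Walks up from the leaf to the nearest ancestor with a sibling to the
 * right, bumps that ancestor's position, and re-descends along the left
 * edge of the new subtree via lookup_depth_range.  If no such ancestor
 * exists, the iterator already denotes end() and is left as-is.
 */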
LBABtree::iterator::handle_boundary_ret LBABtree::iterator::handle_boundary(
  op_context_t c,
  mapped_space_visitor_t *visitor)
{
  assert(at_boundary());
  depth_t depth_with_space = 2;
  for (; depth_with_space <= get_depth(); ++depth_with_space) {
    if ((get_internal(depth_with_space).pos + 1) <
        get_internal(depth_with_space).node->get_size()) {
      break;
    }
  }

  if (depth_with_space <= get_depth()) {
    return seastar::do_with(
      [](const LBAInternalNode &internal) { return internal.begin(); },
      [](const LBALeafNode &leaf) { return leaf.begin(); },
      [this, c, depth_with_space, visitor](auto &li, auto &ll) {
        for (depth_t depth = 2; depth < depth_with_space; ++depth) {
          get_internal(depth).reset();
        }
        leaf.reset();
        get_internal(depth_with_space).pos++;
        // note, cannot result in at_boundary() by construction
        return lookup_depth_range(
          c, *this, depth_with_space - 1, 0, li, ll, visitor
        );
      });
  } else {
    // end
    return seastar::now();
  }
}

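/* next: return an iterator one entry past this one.
 *
 * The common case just bumps the leaf position; crossing a leaf boundary
 * delegates to handle_boundary() above.
 */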
LBABtree::iterator_fut LBABtree::iterator::next(
  op_context_t c,
  mapped_space_visitor_t *visitor) const
{
  assert_valid();
  assert(!is_end());

  auto ret = *this;
  ret.leaf.pos++;
  if (ret.at_boundary()) {
    return seastar::do_with(
      ret,
      [c, visitor](auto &ret) mutable {
        return ret.handle_boundary(
          c, visitor
        ).si_then([&ret] {
          return std::move(ret);
        });
      });
  } else {
    return iterator_fut(
      interruptible::ready_future_marker{},
      ret);
  }
}

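/* prev: return an iterator one entry before this one.
 *
 * Mirrors next(): if the leaf position is already 0, climb to the nearest
 * ancestor with a sibling to the left, decrement its position, and
 * re-descend along the right edge of that subtree.
 */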
LBABtree::iterator_fut LBABtree::iterator::prev(op_context_t c) const
{
  assert_valid();
  assert(!is_begin());

  auto ret = *this;

  if (ret.leaf.pos > 0) {
    ret.leaf.pos--;
    return iterator_fut(
      interruptible::ready_future_marker{},
      ret);
  }

  depth_t depth_with_space = 2;
  for (; depth_with_space <= get_depth(); ++depth_with_space) {
    if (ret.get_internal(depth_with_space).pos > 0) {
      break;
    }
  }

  assert(depth_with_space <= ret.get_depth()); // must not be begin()
  return seastar::do_with(
    std::move(ret),
    [](const LBAInternalNode &internal) { return --internal.end(); },
    [](const LBALeafNode &leaf) { return --leaf.end(); },
    [c, depth_with_space](auto &ret, auto &li, auto &ll) {
      for (depth_t depth = 2; depth < depth_with_space; ++depth) {
        ret.get_internal(depth).reset();
      }
      ret.leaf.reset();
      ret.get_internal(depth_with_space).pos--;
      // note, cannot result in at_boundary() by construction
      return lookup_depth_range(
        c, ret, depth_with_space - 1, 0, li, ll, nullptr
      ).si_then([&ret] {
        assert(!ret.at_boundary());
        return std::move(ret);
      });
    });
}

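/* lower_bound: descend to the first entry with key >= addr, or end() if no
 * such entry exists.  The internal-node step takes the child whose pivot is
 * the last key <= addr; the leaf step is a plain lower_bound within the
 * node.
 */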
LBABtree::iterator_fut LBABtree::lower_bound(
  op_context_t c,
  laddr_t addr,
  mapped_space_visitor_t *visitor) const
{
  LOG_PREFIX(LBABtree::lower_bound);
  return lookup(
    c,
    [addr](const LBAInternalNode &internal) {
      assert(internal.get_size() > 0);
      auto iter = internal.upper_bound(addr);
      assert(iter != internal.begin());
      --iter;
      return iter;
    },
    [FNAME, c, addr](const LBALeafNode &leaf) {
      auto ret = leaf.lower_bound(addr);
      DEBUGT(
        "leaf addr {}, got ret offset {}, size {}, end {}",
        c.trans,
        addr,
        ret.get_offset(),
        leaf.get_size(),
        ret == leaf.end());
      return ret;
    },
    visitor
  ).si_then([FNAME, c](auto &&ret) {
    DEBUGT(
      "ret.leaf.pos {}",
      c.trans,
      ret.leaf.pos);
    ret.assert_valid();
    return std::move(ret);
  });
}

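/* insert: insert (laddr, val) at iter, returning the resulting iterator and
 * whether an insertion actually happened (false if laddr was already
 * mapped).  find_insertion() may first move iter across a leaf boundary,
 * and handle_split() guarantees room in the target leaf before the write.
 *
 * Sketch of a typical call chain (names illustrative, not from this file):
 *
 *   btree.lower_bound(c, laddr).si_then([&btree, c, laddr, val](auto iter) {
 *     return btree.insert(c, iter, laddr, val);
 *   }).si_then([](auto p) {
 *     // p.first: iterator at laddr, p.second: true iff newly inserted
 *   });
 */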
LBABtree::insert_ret LBABtree::insert(
  op_context_t c,
  iterator iter,
  laddr_t laddr,
  lba_map_val_t val)
{
  LOG_PREFIX(LBABtree::insert);
  DEBUGT(
    "inserting laddr {} at iter {}",
    c.trans,
    laddr,
    iter.is_end() ? L_ADDR_MAX : iter.get_key());
  return seastar::do_with(
    iter,
    [this, c, laddr, val](auto &ret) {
      return find_insertion(
        c, laddr, ret
      ).si_then([this, c, laddr, val, &ret] {
        if (!ret.at_boundary() && ret.get_key() == laddr) {
          return insert_ret(
            interruptible::ready_future_marker{},
            std::make_pair(ret, false));
        } else {
          ++(c.trans.get_lba_tree_stats().num_inserts);
          return handle_split(
            c, ret
          ).si_then([c, laddr, val, &ret] {
            if (!ret.leaf.node->is_pending()) {
              CachedExtentRef mut = c.cache.duplicate_for_write(
                c.trans, ret.leaf.node
              );
              ret.leaf.node = mut->cast<LBALeafNode>();
            }
            auto iter = LBALeafNode::const_iterator(
              ret.leaf.node.get(), ret.leaf.pos);
            assert(iter == ret.leaf.node->lower_bound(laddr));
            assert(iter == ret.leaf.node->end() || iter->get_key() > laddr);
            assert(laddr >= ret.leaf.node->get_meta().begin &&
                   laddr < ret.leaf.node->get_meta().end);
            ret.leaf.node->insert(iter, laddr, val);
            return insert_ret(
              interruptible::ready_future_marker{},
              std::make_pair(ret, true));
          });
        }
      });
    });
}

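/* update: overwrite the mapping at iter with val, copying the leaf for
 * write first if it is not already mutable in this transaction.
 */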
LBABtree::update_ret LBABtree::update(
  op_context_t c,
  iterator iter,
  lba_map_val_t val)
{
  LOG_PREFIX(LBABtree::update);
  DEBUGT(
    "update element at {}",
    c.trans,
    iter.is_end() ? L_ADDR_MAX : iter.get_key());
  if (!iter.leaf.node->is_pending()) {
    CachedExtentRef mut = c.cache.duplicate_for_write(
      c.trans, iter.leaf.node
    );
    iter.leaf.node = mut->cast<LBALeafNode>();
  }
  iter.leaf.node->update(
    iter.leaf.node->iter_idx(iter.leaf.pos),
    val);
  return update_ret(
    interruptible::ready_future_marker{},
    iter);
}

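/* remove: erase the mapping at iter, then rebalance via handle_merge() if
 * the removal leaves nodes below minimum capacity.
 */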
LBABtree::remove_ret LBABtree::remove(
  op_context_t c,
  iterator iter)
{
  LOG_PREFIX(LBABtree::remove);
  DEBUGT(
    "remove element at {}",
    c.trans,
    iter.is_end() ? L_ADDR_MAX : iter.get_key());
  assert(!iter.is_end());
  ++(c.trans.get_lba_tree_stats().num_erases);
  return seastar::do_with(
    iter,
    [this, c](auto &ret) {
      if (!ret.leaf.node->is_pending()) {
        CachedExtentRef mut = c.cache.duplicate_for_write(
          c.trans, ret.leaf.node
        );
        ret.leaf.node = mut->cast<LBALeafNode>();
      }
      ret.leaf.node->remove(
        ret.leaf.node->iter_idx(ret.leaf.pos));

      return handle_merge(
        c, ret
      );
    });
}

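/* init_cached_extent: decide whether a cached extent is still referenced by
 * the tree (presumably while re-initializing cache state, e.g. after
 * replay).  Live logical extents get their pin wired up; live tree nodes
 * are kept; anything no longer reachable is dropped from the cache and an
 * empty ref is returned.
 */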
LBABtree::init_cached_extent_ret LBABtree::init_cached_extent(
  op_context_t c,
  CachedExtentRef e)
{
  LOG_PREFIX(LBABtree::init_cached_extent);
  DEBUGT("extent {}", c.trans, *e);
  if (e->is_logical()) {
    auto logn = e->cast<LogicalCachedExtent>();
    return lower_bound(
      c,
      logn->get_laddr()
    ).si_then([FNAME, e, c, logn](auto iter) {
      if (!iter.is_end() &&
          iter.get_key() == logn->get_laddr() &&
          iter.get_val().paddr == logn->get_paddr()) {
        logn->set_pin(iter.get_pin());
        ceph_assert(iter.get_val().len == e->get_length());
        if (c.pins) {
          c.pins->add_pin(
            static_cast<BtreeLBAPin&>(logn->get_pin()).pin);
        }
        DEBUGT("logical extent {} live, initialized", c.trans, *logn);
        return e;
      } else {
        DEBUGT("logical extent {} not live, dropping", c.trans, *logn);
        c.cache.drop_from_cache(logn);
        return CachedExtentRef();
      }
    });
  } else if (e->get_type() == extent_types_t::LADDR_INTERNAL) {
    auto eint = e->cast<LBAInternalNode>();
    return lower_bound(
      c, eint->get_node_meta().begin
    ).si_then([FNAME, e, c, eint](auto iter) {
      // Note, this check is valid even if iter.is_end()
      depth_t cand_depth = eint->get_node_meta().depth;
      if (cand_depth <= iter.get_depth() &&
          &*iter.get_internal(cand_depth).node == &*eint) {
        DEBUGT("extent {} is live", c.trans, *eint);
        return e;
      } else {
        DEBUGT("extent {} is not live", c.trans, *eint);
        c.cache.drop_from_cache(eint);
        return CachedExtentRef();
      }
    });
  } else if (e->get_type() == extent_types_t::LADDR_LEAF) {
    auto eleaf = e->cast<LBALeafNode>();
    return lower_bound(
      c, eleaf->get_node_meta().begin
    ).si_then([FNAME, c, e, eleaf](auto iter) {
      // Note, this check is valid even if iter.is_end()
      if (iter.leaf.node == &*eleaf) {
        DEBUGT("extent {} is live", c.trans, *eleaf);
        return e;
      } else {
        DEBUGT("extent {} is not live", c.trans, *eleaf);
        c.cache.drop_from_cache(eleaf);
        return CachedExtentRef();
      }
    });
  } else {
    DEBUGT(
      "found other extent {} type {}",
      c.trans,
      *e,
      e->get_type());
    return init_cached_extent_ret(
      interruptible::ready_future_marker{},
      e);
  }
}

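/* get_internal_if_live / get_leaf_if_live: check whether the node at paddr
 * addr is still part of the tree by walking down to laddr and comparing
 * against the nodes on the lookup path.  Returns the node if it is live,
 * otherwise an empty CachedExtentRef.
 */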
LBABtree::get_internal_if_live_ret
LBABtree::get_internal_if_live(
  op_context_t c,
  paddr_t addr,
  laddr_t laddr,
  segment_off_t len)
{
  LOG_PREFIX(LBABtree::get_internal_if_live);
  return lower_bound(
    c, laddr
  ).si_then([FNAME, c, addr, laddr, len](auto iter) {
    for (depth_t d = 2; d <= iter.get_depth(); ++d) {
      CachedExtent &node = *iter.get_internal(d).node;
      auto internal_node = node.cast<LBAInternalNode>();
      if (internal_node->get_paddr() == addr) {
        DEBUGT(
          "extent laddr {} addr {}~{} found: {}",
          c.trans,
          laddr,
          addr,
          len,
          *internal_node);
        assert(internal_node->get_node_meta().begin == laddr);
        return CachedExtentRef(internal_node);
      }
    }
    DEBUGT(
      "extent laddr {} addr {}~{} is not live, no matching internal node",
      c.trans,
      laddr,
      addr,
      len);
    return CachedExtentRef();
  });
}

LBABtree::get_leaf_if_live_ret
LBABtree::get_leaf_if_live(
  op_context_t c,
  paddr_t addr,
  laddr_t laddr,
  segment_off_t len)
{
  LOG_PREFIX(LBABtree::get_leaf_if_live);
  return lower_bound(
    c, laddr
  ).si_then([FNAME, c, addr, laddr, len](auto iter) {
    if (iter.leaf.node->get_paddr() == addr) {
      DEBUGT(
        "extent laddr {} addr {}~{} found: {}",
        c.trans,
        laddr,
        addr,
        len,
        *iter.leaf.node);
      return CachedExtentRef(iter.leaf.node);
    } else {
      DEBUGT(
        "extent laddr {} addr {}~{} is not live, does not match node {}",
        c.trans,
        laddr,
        addr,
        len,
        *iter.leaf.node);
      return CachedExtentRef();
    }
  });
}

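/* rewrite_lba_extent: migrate a tree node to a freshly allocated extent
 * (presumably when the old extent must move, e.g. during segment cleaning):
 * copy its contents, fix up any relative paddrs, repoint the parent via
 * update_internal_mapping(), and retire the old extent.
 */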
LBABtree::rewrite_lba_extent_ret LBABtree::rewrite_lba_extent(
  op_context_t c,
  CachedExtentRef e)
{
  LOG_PREFIX(LBABtree::rewrite_lba_extent);
  assert(e->get_type() == extent_types_t::LADDR_INTERNAL ||
         e->get_type() == extent_types_t::LADDR_LEAF);

  auto do_rewrite = [&](auto &lba_extent) {
    auto nlba_extent = c.cache.alloc_new_extent<
      std::remove_reference_t<decltype(lba_extent)>
      >(
        c.trans,
        lba_extent.get_length());
    lba_extent.get_bptr().copy_out(
      0,
      lba_extent.get_length(),
      nlba_extent->get_bptr().c_str());
    nlba_extent->pin.set_range(nlba_extent->get_node_meta());

    /* This is a bit underhanded.  Any relative addrs here must necessarily
     * be record relative as we are rewriting a dirty extent.  Thus, we
     * are using resolve_relative_addrs with a (likely negative) block
     * relative offset to correct them to block-relative offsets adjusted
     * for our new transaction location.
     *
     * Upon commit, these now block relative addresses will be interpreted
     * against the real final address.
     */
    nlba_extent->resolve_relative_addrs(
      make_record_relative_paddr(0) - nlba_extent->get_paddr());

    DEBUGT(
      "rewriting {} into {}",
      c.trans,
      lba_extent,
      *nlba_extent);

    return update_internal_mapping(
      c,
      nlba_extent->get_node_meta().depth,
      nlba_extent->get_node_meta().begin,
      e->get_paddr(),
      nlba_extent->get_paddr()
    ).si_then([c, e] {
      c.cache.retire_extent(c.trans, e);
    });
  };

  if (e->get_type() == extent_types_t::LADDR_INTERNAL) {
    auto lint = e->cast<LBAInternalNode>();
    return do_rewrite(*lint);
  } else {
    assert(e->get_type() == extent_types_t::LADDR_LEAF);
    auto lleaf = e->cast<LBALeafNode>();
    return do_rewrite(*lleaf);
  }
}

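/* get_internal_node / get_leaf_node: read a tree node through the cache,
 * installing its metadata and space pin on first load.  The init_* callback
 * only runs for freshly loaded (non-pending, unlinked) extents; the in-body
 * re-init path covers dirty extents encountered during init_cached_extent,
 * as noted below.
 */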
LBABtree::get_internal_node_ret LBABtree::get_internal_node(
  op_context_t c,
  depth_t depth,
  paddr_t offset,
  laddr_t begin,
  laddr_t end)
{
  LOG_PREFIX(LBABtree::get_internal_node);
  DEBUGT(
    "reading internal at offset {}, depth {}, begin {}, end {}",
    c.trans,
    offset,
    depth,
    begin,
    end);
  assert(depth > 1);
  auto init_internal = [c, depth, begin, end](LBAInternalNode &node) {
    assert(!node.is_pending());
    assert(!node.pin.is_linked());
    node.pin.set_range(lba_node_meta_t{begin, end, depth});
    if (c.pins) {
      c.pins->add_pin(node.pin);
    }
  };
  return c.cache.get_extent<LBAInternalNode>(
    c.trans,
    offset,
    LBA_BLOCK_SIZE,
    init_internal
  ).si_then([FNAME, c, offset, init_internal, depth, begin, end](
      LBAInternalNodeRef ret) {
    DEBUGT(
      "read internal at offset {} {}",
      c.trans,
      offset,
      *ret);
    // This can only happen during init_cached_extent
    if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) {
      assert(ret->is_dirty());
      init_internal(*ret);
    }
    auto meta = ret->get_meta();
    if (ret->get_size()) {
      ceph_assert(meta.begin <= ret->begin()->get_key());
      ceph_assert(meta.end > (ret->end() - 1)->get_key());
    }
    ceph_assert(depth == meta.depth);
    ceph_assert(begin == meta.begin);
    ceph_assert(end == meta.end);
    return get_internal_node_ret(
      interruptible::ready_future_marker{},
      ret);
  });
}

LBABtree::get_leaf_node_ret LBABtree::get_leaf_node(
  op_context_t c,
  paddr_t offset,
  laddr_t begin,
  laddr_t end)
{
  LOG_PREFIX(LBABtree::get_leaf_node);
  DEBUGT(
    "reading leaf at offset {}, begin {}, end {}",
    c.trans,
    offset,
    begin,
    end);
  auto init_leaf = [c, begin, end](LBALeafNode &node) {
    assert(!node.is_pending());
    assert(!node.pin.is_linked());
    node.pin.set_range(lba_node_meta_t{begin, end, 1});
    if (c.pins) {
      c.pins->add_pin(node.pin);
    }
  };
  return c.cache.get_extent<LBALeafNode>(
    c.trans,
    offset,
    LBA_BLOCK_SIZE,
    init_leaf
  ).si_then([FNAME, c, offset, init_leaf, begin, end](LBALeafNodeRef ret) {
    DEBUGT(
      "read leaf at offset {} {}",
      c.trans,
      offset,
      *ret);
    // This can only happen during init_cached_extent
    if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) {
      assert(ret->is_dirty());
      init_leaf(*ret);
    }
    auto meta = ret->get_meta();
    if (ret->get_size()) {
      ceph_assert(meta.begin <= ret->begin()->get_key());
      ceph_assert(meta.end > (ret->end() - 1)->get_key());
    }
    ceph_assert(1 == meta.depth);
    ceph_assert(begin == meta.begin);
    ceph_assert(end == meta.end);
    return get_leaf_node_ret(
      interruptible::ready_future_marker{},
      ret);
  });
}

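/* find_insertion: adjust iter (a lower_bound result for laddr) so that it
 * denotes the leaf into which laddr should be inserted.  This only differs
 * from the lower_bound position when laddr falls before the current leaf's
 * range, in which case the correct target is the end of the previous leaf.
 */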
LBABtree::find_insertion_ret LBABtree::find_insertion(
  op_context_t c,
  laddr_t laddr,
  iterator &iter)
{
  assert(iter.is_end() || iter.get_key() >= laddr);
  if (!iter.is_end() && iter.get_key() == laddr) {
    return seastar::now();
  } else if (iter.leaf.node->get_node_meta().begin <= laddr) {
#ifndef NDEBUG
    auto p = iter;
    if (p.leaf.pos > 0) {
      --p.leaf.pos;
      assert(p.get_key() < laddr);
    }
#endif
    return seastar::now();
  } else {
    assert(iter.leaf.pos == 0);
    return iter.prev(
      c
    ).si_then([laddr, &iter](auto p) {
      assert(p.leaf.node->get_node_meta().begin <= laddr);
      assert(p.get_key() < laddr);
      // Note, this is specifically allowed to violate the iterator
      // invariant that pos is a valid index for the node in the event
      // that the insertion point is at the end of a node.
      p.leaf.pos++;
      assert(p.at_boundary());
      iter = p;
      return seastar::now();
    });
  }
}

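/* handle_split: ensure every node on iter's path has room for one more
 * entry.  check_split() reports the highest depth that would overflow;
 * splits are then applied top-down.  If the root itself must split, a new
 * root is allocated first, growing the tree by one level.  iter is kept
 * pointing at the same logical position throughout.
 */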
LBABtree::handle_split_ret LBABtree::handle_split(
  op_context_t c,
  iterator &iter)
{
  LOG_PREFIX(LBABtree::handle_split);

  depth_t split_from = iter.check_split();

  DEBUGT("split_from {}, depth {}", c.trans, split_from, iter.get_depth());

  if (split_from == iter.get_depth()) {
    auto nroot = c.cache.alloc_new_extent<LBAInternalNode>(
      c.trans, LBA_BLOCK_SIZE);
    lba_node_meta_t meta{0, L_ADDR_MAX, iter.get_depth() + 1};
    nroot->set_meta(meta);
    nroot->pin.set_range(meta);
    nroot->journal_insert(
      nroot->begin(),
      L_ADDR_MIN,
      root.get_location(),
      nullptr);
    iter.internal.push_back({nroot, 0});

    root.set_location(nroot->get_paddr());
    root.set_depth(iter.get_depth());
    c.trans.get_lba_tree_stats().depth = iter.get_depth();
    root_dirty = true;
  }

  /* pos may be either node_position_t<LBALeafNode> or
   * node_position_t<LBAInternalNode> */
  auto split_level = [&](auto &parent_pos, auto &pos) {
    LOG_PREFIX(LBABtree::handle_split);
    auto [left, right, pivot] = pos.node->make_split_children(c);

    auto parent_node = parent_pos.node;
    auto parent_iter = parent_pos.get_iter();

    parent_node->update(
      parent_iter,
      left->get_paddr());
    parent_node->insert(
      parent_iter + 1,
      pivot,
      right->get_paddr());

    DEBUGT("split {} into left: {}, right: {}",
           c.trans,
           *pos.node,
           *left,
           *right);
    c.cache.retire_extent(c.trans, pos.node);

    return std::make_pair(left, right);
  };

  for (; split_from > 0; --split_from) {
    auto &parent_pos = iter.get_internal(split_from + 1);
    if (!parent_pos.node->is_pending()) {
      parent_pos.node = c.cache.duplicate_for_write(
        c.trans, parent_pos.node
      )->cast<LBAInternalNode>();
    }

    if (split_from > 1) {
      auto &pos = iter.get_internal(split_from);
      DEBUGT("splitting internal {} at depth {}, parent: {} at pos: {}",
             c.trans,
             *pos.node,
             split_from,
             *parent_pos.node,
             parent_pos.pos);
      auto [left, right] = split_level(parent_pos, pos);

      if (pos.pos < left->get_size()) {
        pos.node = left;
      } else {
        pos.node = right;
        pos.pos -= left->get_size();

        parent_pos.pos += 1;
      }
    } else {
      auto &pos = iter.leaf;
      DEBUGT("splitting leaf {}, parent: {} at pos: {}",
             c.trans,
             *pos.node,
             *parent_pos.node,
             parent_pos.pos);
      auto [left, right] = split_level(parent_pos, pos);

      /* right->get_node_meta().begin == pivot == right->begin()->get_key()
       * Thus, if pos.pos == left->get_size(), we want iter to point to
       * left with pos.pos at the end rather than right with pos.pos = 0
       * since the insertion would be to the left of the first element
       * of right and thus necessarily less than right->get_node_meta().begin.
       */
      if (pos.pos <= left->get_size()) {
        pos.node = left;
      } else {
        pos.node = right;
        pos.pos -= left->get_size();

        parent_pos.pos += 1;
      }
    }
  }

  return seastar::now();
}

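/* get_node: depth-dispatched helper so that merge_level() below can fetch a
 * donor sibling of either node type through a single template interface.
 */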
template <typename NodeType>
LBABtree::base_iertr::future<typename NodeType::Ref> get_node(
  op_context_t c,
  depth_t depth,
  paddr_t addr,
  laddr_t begin,
  laddr_t end);

template <>
LBABtree::base_iertr::future<LBALeafNodeRef> get_node<LBALeafNode>(
  op_context_t c,
  depth_t depth,
  paddr_t addr,
  laddr_t begin,
  laddr_t end) {
  assert(depth == 1);
  return LBABtree::get_leaf_node(c, addr, begin, end);
}

template <>
LBABtree::base_iertr::future<LBAInternalNodeRef> get_node<LBAInternalNode>(
  op_context_t c,
  depth_t depth,
  paddr_t addr,
  laddr_t begin,
  laddr_t end) {
  return LBABtree::get_internal_node(c, depth, addr, begin, end);
}

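/* merge_level: fix up an undersized node at the given depth by pulling from
 * a sibling.  The donor is normally the right sibling; only the rightmost
 * child donates from the left.  If the donor is also at minimum capacity
 * the two nodes are merged into one (make_full_merge), otherwise entries
 * are redistributed between them (make_balanced).  parent_pos/pos are
 * updated so the iterator still denotes the same logical entry.
 */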
template <typename NodeType>
LBABtree::handle_merge_ret merge_level(
  op_context_t c,
  depth_t depth,
  LBABtree::node_position_t<LBAInternalNode> &parent_pos,
  LBABtree::node_position_t<NodeType> &pos)
{
  LOG_PREFIX(LBABtree::merge_level);
  if (!parent_pos.node->is_pending()) {
    parent_pos.node = c.cache.duplicate_for_write(
      c.trans, parent_pos.node
    )->cast<LBAInternalNode>();
  }

  auto iter = parent_pos.get_iter();
  assert(iter.get_offset() < parent_pos.node->get_size());
  bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size());
  auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1);
  auto next_iter = donor_iter + 1;
  auto begin = donor_iter->get_key();
  auto end = next_iter == parent_pos.node->end()
    ? parent_pos.node->get_node_meta().end
    : next_iter->get_key();

  DEBUGT("parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node);
  return get_node<NodeType>(
    c,
    depth,
    donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()),
    begin,
    end
  ).si_then([c, iter, donor_iter, donor_is_left, &parent_pos, &pos](
      typename NodeType::Ref donor) {
    LOG_PREFIX(LBABtree::merge_level);
    auto [l, r] = donor_is_left ?
      std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor);

    auto [liter, riter] = donor_is_left ?
      std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);

    if (donor->at_min_capacity()) {
      auto replacement = l->make_full_merge(c, r);

      parent_pos.node->update(
        liter,
        replacement->get_paddr());
      parent_pos.node->remove(riter);

      pos.node = replacement;
      if (donor_is_left) {
        pos.pos += r->get_size();
        parent_pos.pos--;
      }

      DEBUGT("l: {}, r: {}, replacement: {}", c.trans, *l, *r, *replacement);
      c.cache.retire_extent(c.trans, l);
      c.cache.retire_extent(c.trans, r);
    } else {
      auto [replacement_l, replacement_r, pivot] =
        l->make_balanced(
          c,
          r,
          !donor_is_left);

      parent_pos.node->update(
        liter,
        replacement_l->get_paddr());
      parent_pos.node->replace(
        riter,
        pivot,
        replacement_r->get_paddr());

      if (donor_is_left) {
        assert(parent_pos.pos > 0);
        parent_pos.pos--;
      }

      auto orig_position = donor_is_left ?
        l->get_size() + pos.pos :
        pos.pos;
      if (orig_position < replacement_l->get_size()) {
        pos.node = replacement_l;
        pos.pos = orig_position;
      } else {
        parent_pos.pos++;
        pos.node = replacement_r;
        pos.pos = orig_position - replacement_l->get_size();
      }

      DEBUGT("l: {}, r: {}, replacement_l: {}, replacement_r: {}",
             c.trans, *l, *r, *replacement_l, *replacement_r);
      c.cache.retire_extent(c.trans, l);
      c.cache.retire_extent(c.trans, r);
    }

    return seastar::now();
  });
}

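/* handle_merge: after a removal, walk from the leaf upward merging or
 * rebalancing any node that has fallen below minimum capacity, via
 * merge_level() above.  If the root ends up with a single child it is
 * collapsed, shrinking the tree by one level.
 */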
LBABtree::handle_merge_ret LBABtree::handle_merge(
  op_context_t c,
  iterator &iter)
{
  LOG_PREFIX(LBABtree::handle_merge);
  if (iter.get_depth() == 1 ||
      !iter.leaf.node->below_min_capacity()) {
    DEBUGT(
      "no need to merge leaf, leaf size {}, depth {}",
      c.trans,
      iter.leaf.node->get_size(),
      iter.get_depth());
    return seastar::now();
  }

  return seastar::do_with(
    depth_t{1},
    [FNAME, this, c, &iter](auto &to_merge) {
      return trans_intr::repeat(
        [FNAME, this, c, &iter, &to_merge] {
          DEBUGT(
            "merging depth {}",
            c.trans,
            to_merge);
          auto &parent_pos = iter.get_internal(to_merge + 1);
          auto merge_fut = handle_merge_iertr::now();
          if (to_merge > 1) {
            auto &pos = iter.get_internal(to_merge);
            merge_fut = merge_level(c, to_merge, parent_pos, pos);
          } else {
            auto &pos = iter.leaf;
            merge_fut = merge_level(c, to_merge, parent_pos, pos);
          }

          return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] {
            ++to_merge;
            auto &pos = iter.get_internal(to_merge);
            if (to_merge == iter.get_depth()) {
              if (pos.node->get_size() == 1) {
                DEBUGT("collapsing root", c.trans);
                c.cache.retire_extent(c.trans, pos.node);
                assert(pos.pos == 0);
                auto node_iter = pos.get_iter();
                root.set_location(
                  node_iter->get_val().maybe_relative_to(pos.node->get_paddr()));
                iter.internal.pop_back();
                root.set_depth(iter.get_depth());
                c.trans.get_lba_tree_stats().depth = iter.get_depth();
                root_dirty = true;
              } else {
                DEBUGT("no need to collapse root", c.trans);
              }
              return seastar::stop_iteration::yes;
            } else if (pos.node->below_min_capacity()) {
              DEBUGT(
                "continuing, next node {} depth {} at min",
                c.trans,
                *pos.node,
                to_merge);
              return seastar::stop_iteration::no;
            } else {
              DEBUGT(
                "complete, next node {} depth {} not min",
                c.trans,
                *pos.node,
                to_merge);
              return seastar::stop_iteration::yes;
            }
          });
        });
    });
}

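/* update_internal_mapping: repoint the parent entry for the node at
 * (depth, laddr) from old_addr to new_addr; used by rewrite_lba_extent()
 * above.  A mismatch between the expected and found mapping indicates
 * corruption and aborts.  If depth is the root's, the root location itself
 * is updated instead.
 */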
LBABtree::update_internal_mapping_ret LBABtree::update_internal_mapping(
  op_context_t c,
  depth_t depth,
  laddr_t laddr,
  paddr_t old_addr,
  paddr_t new_addr)
{
  LOG_PREFIX(LBABtree::update_internal_mapping);
  DEBUGT(
    "updating laddr {} at depth {} from {} to {}",
    c.trans,
    laddr,
    depth,
    old_addr,
    new_addr);

  return lower_bound(
    c, laddr
  ).si_then([=](auto iter) {
    assert(iter.get_depth() >= depth);
    if (depth == iter.get_depth()) {
      DEBUGT("update at root", c.trans);

      if (laddr != 0) {
        ERRORT(
          "updating root laddr {} at depth {} from {} to {}, "
          "laddr is not 0",
          c.trans,
          laddr,
          depth,
          old_addr,
          new_addr);
        ceph_assert(0 == "impossible");
      }

      if (root.get_location() != old_addr) {
        ERRORT(
          "updating root laddr {} at depth {} from {} to {}, "
          "root addr {} does not match",
          c.trans,
          laddr,
          depth,
          old_addr,
          new_addr,
          root.get_location());
        ceph_assert(0 == "impossible");
      }

      root.set_location(new_addr);
      root_dirty = true;
    } else {
      auto &parent = iter.get_internal(depth + 1);
      assert(parent.node);
      assert(parent.pos < parent.node->get_size());
      auto piter = parent.node->iter_idx(parent.pos);

      if (piter->get_key() != laddr) {
        ERRORT(
          "updating laddr {} at depth {} from {} to {}, "
          "node {} pos {} val pivot addr {} does not match",
          c.trans,
          laddr,
          depth,
          old_addr,
          new_addr,
          *(parent.node),
          parent.pos,
          piter->get_key());
        ceph_assert(0 == "impossible");
      }

      if (piter->get_val() != old_addr) {
        ERRORT(
          "updating laddr {} at depth {} from {} to {}, "
          "node {} pos {} val addr {} does not match",
          c.trans,
          laddr,
          depth,
          old_addr,
          new_addr,
          *(parent.node),
          parent.pos,
          piter->get_val());
        ceph_assert(0 == "impossible");
      }

      CachedExtentRef mut = c.cache.duplicate_for_write(
        c.trans,
        parent.node
      );
      LBAInternalNodeRef mparent = mut->cast<LBAInternalNode>();
      mparent->update(piter, new_addr);

      /* Note, iter is now invalid as we didn't update either the parent
       * node reference to the new mutable instance nor did we update the
       * child pointer to the new node.  Not a problem as we'll now just
       * destruct it.
       */
    }
    return seastar::now();
  });
}

}  // namespace crimson::os::seastore::lba_manager::btree