]> git.proxmox.com Git - ceph.git/blame - ceph/src/crimson/os/seastore/lba_manager/btree/lba_btree.h
buildsys: change download over to reef release
[ceph.git] / ceph / src / crimson / os / seastore / lba_manager / btree / lba_btree.h
CommitLineData
20effc67
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#pragma once
5
6#include <boost/container/static_vector.hpp>
7#include <sys/mman.h>
8#include <memory>
9#include <string.h>
10
11#include "crimson/common/log.h"
12
13#include "crimson/os/seastore/lba_manager.h"
14#include "crimson/os/seastore/seastore_types.h"
15#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
16
17namespace crimson::os::seastore::lba_manager::btree {
18
19
20class LBABtree {
21 static constexpr size_t MAX_DEPTH = 16;
22public:
23 using base_iertr = LBAManager::base_iertr;
24
25 class iterator;
26 using iterator_fut = base_iertr::future<iterator>;
27
28 using mapped_space_visitor_t = LBAManager::scan_mapped_space_func_t;
29
30 struct lba_tree_inner_stats_t {
31 uint64_t num_alloc_extents = 0;
32 uint64_t num_alloc_extents_iter_nexts = 0;
33 } static lba_tree_inner_stats;
34
35 class iterator {
36 public:
37 iterator(const iterator &rhs) noexcept :
38 internal(rhs.internal), leaf(rhs.leaf) {}
39 iterator(iterator &&rhs) noexcept :
40 internal(std::move(rhs.internal)), leaf(std::move(rhs.leaf)) {}
41
42 iterator &operator=(const iterator &) = default;
43 iterator &operator=(iterator &&) = default;
44
45 iterator_fut next(
46 op_context_t c,
47 mapped_space_visitor_t *visit=nullptr) const;
48
49 iterator_fut prev(op_context_t c) const;
50
51 void assert_valid() const {
52 assert(leaf.node);
53 assert(leaf.pos <= leaf.node->get_size());
54
55 for (auto &i: internal) {
56 (void)i;
57 assert(i.node);
58 assert(i.pos < i.node->get_size());
59 }
60 }
61
62 depth_t get_depth() const {
63 return internal.size() + 1;
64 }
65
66 auto &get_internal(depth_t depth) {
67 assert(depth > 1);
68 assert((depth - 2) < internal.size());
69 return internal[depth - 2];
70 }
71
72 const auto &get_internal(depth_t depth) const {
73 assert(depth > 1);
74 assert((depth - 2) < internal.size());
75 return internal[depth - 2];
76 }
77
78 laddr_t get_key() const {
79 assert(!is_end());
80 return leaf.node->iter_idx(leaf.pos).get_key();
81 }
82 lba_map_val_t get_val() const {
83 assert(!is_end());
84 auto ret = leaf.node->iter_idx(leaf.pos).get_val();
85 ret.paddr = ret.paddr.maybe_relative_to(leaf.node->get_paddr());
86 return ret;
87 }
88
89 bool is_end() const {
90 // external methods may only resolve at a boundary if at end
91 return at_boundary();
92 }
93
94 bool is_begin() const {
95 for (auto &i: internal) {
96 if (i.pos != 0)
97 return false;
98 }
99 return leaf.pos == 0;
100 }
101
102 LBAPinRef get_pin() const {
103 assert(!is_end());
104 auto val = get_val();
105 auto key = get_key();
106 return std::make_unique<BtreeLBAPin>(
107 leaf.node,
108 val.paddr,
109 lba_node_meta_t{ key, key + val.len, 0 });
110 }
111
112 private:
113 iterator() noexcept {}
114 iterator(depth_t depth) noexcept : internal(depth - 1) {}
115
116 friend class LBABtree;
117 static constexpr uint16_t INVALID = std::numeric_limits<uint16_t>::max();
118 template <typename NodeType>
119 struct node_position_t {
120 typename NodeType::Ref node;
121 uint16_t pos = INVALID;
122
123 void reset() {
124 *this = node_position_t{};
125 }
126
127 auto get_iter() {
128 assert(pos != INVALID);
129 assert(pos < node->get_size());
130 return node->iter_idx(pos);
131 }
132 };
133 boost::container::static_vector<
134 node_position_t<LBAInternalNode>, MAX_DEPTH> internal;
135 node_position_t<LBALeafNode> leaf;
136
137 bool at_boundary() const {
138 assert(leaf.pos <= leaf.node->get_size());
139 return leaf.pos == leaf.node->get_size();
140 }
141
142 using handle_boundary_ertr = base_iertr;
143 using handle_boundary_ret = handle_boundary_ertr::future<>;
144 handle_boundary_ret handle_boundary(
145 op_context_t c,
146 mapped_space_visitor_t *visitor);
147
148 depth_t check_split() const {
149 if (!leaf.node->at_max_capacity()) {
150 return 0;
151 }
152 for (depth_t split_from = 1; split_from < get_depth(); ++split_from) {
153 if (!get_internal(split_from + 1).node->at_max_capacity())
154 return split_from;
155 }
156 return get_depth();
157 }
158
159 depth_t check_merge() const {
160 if (!leaf.node->below_min_capacity()) {
161 return 0;
162 }
163 for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) {
164 if (!get_internal(merge_from + 1).node->below_min_capacity())
165 return merge_from;
166 }
167 return get_depth();
168 }
169 };
170
171 LBABtree(lba_root_t root) : root(root) {}
172
173 bool is_root_dirty() const {
174 return root_dirty;
175 }
176 lba_root_t get_root_undirty() {
177 ceph_assert(root_dirty);
178 root_dirty = false;
179 return root;
180 }
181
182 /// mkfs
183 using mkfs_ret = lba_root_t;
184 static mkfs_ret mkfs(op_context_t c);
185
186 /**
187 * lower_bound
188 *
189 * @param c [in] context
190 * @param addr [in] ddr
191 * @return least iterator >= key
192 */
193 iterator_fut lower_bound(
194 op_context_t c,
195 laddr_t addr,
196 mapped_space_visitor_t *visit=nullptr) const;
197
198 /**
199 * upper_bound
200 *
201 * @param c [in] context
202 * @param addr [in] ddr
203 * @return least iterator > key
204 */
205 iterator_fut upper_bound(
206 op_context_t c,
207 laddr_t addr
208 ) const {
209 return lower_bound(
210 c, addr
211 ).si_then([c, addr](auto iter) {
212 if (!iter.is_end() && iter.get_key() == addr) {
213 return iter.next(c);
214 } else {
215 return iterator_fut(
216 interruptible::ready_future_marker{},
217 iter);
218 }
219 });
220 }
221
222 /**
223 * upper_bound_right
224 *
225 * @param c [in] context
226 * @param addr [in] addr
227 * @return least iterator i s.t. i.get_key() + i.get_val().len > key
228 */
229 iterator_fut upper_bound_right(
230 op_context_t c,
231 laddr_t addr) const
232 {
233 return lower_bound(
234 c, addr
235 ).si_then([c, addr](auto iter) {
236 if (iter.is_begin()) {
237 return iterator_fut(
238 interruptible::ready_future_marker{},
239 iter);
240 } else {
241 return iter.prev(
242 c
243 ).si_then([iter, addr](auto prev) {
244 if ((prev.get_key() + prev.get_val().len) > addr) {
245 return iterator_fut(
246 interruptible::ready_future_marker{},
247 prev);
248 } else {
249 return iterator_fut(
250 interruptible::ready_future_marker{},
251 iter);
252 }
253 });
254 }
255 });
256 }
257
258 iterator_fut begin(op_context_t c) const {
259 return lower_bound(c, 0);
260 }
261 iterator_fut end(op_context_t c) const {
262 return upper_bound(c, L_ADDR_MAX);
263 }
264
265 using iterate_repeat_ret_inner = base_iertr::future<
266 seastar::stop_iteration>;
267 template <typename F>
268 static base_iertr::future<> iterate_repeat(
269 op_context_t c,
270 iterator_fut &&iter_fut,
271 bool need_count,
272 F &&f,
273 mapped_space_visitor_t *visitor=nullptr) {
274 return std::move(
275 iter_fut
276 ).si_then([c, need_count, visitor, f=std::forward<F>(f)](auto iter) {
277 return seastar::do_with(
278 iter,
279 std::move(f),
280 [c, need_count, visitor](auto &pos, auto &f) {
281 return trans_intr::repeat(
282 [c, need_count, visitor, &f, &pos] {
283 return f(
284 pos
285 ).si_then([c, need_count, visitor, &pos](auto done) {
286 if (done == seastar::stop_iteration::yes) {
287 return iterate_repeat_ret_inner(
288 interruptible::ready_future_marker{},
289 seastar::stop_iteration::yes);
290 } else {
291 ceph_assert(!pos.is_end());
292 if (need_count) {
293 ++LBABtree::lba_tree_inner_stats.num_alloc_extents_iter_nexts;
294 }
295 return pos.next(
296 c, visitor
297 ).si_then([&pos](auto next) {
298 pos = next;
299 return iterate_repeat_ret_inner(
300 interruptible::ready_future_marker{},
301 seastar::stop_iteration::no);
302 });
303 }
304 });
305 });
306 });
307 });
308 }
309
310 /**
311 * insert
312 *
313 * Inserts val at laddr with iter as a hint. If element at laddr already
314 * exists returns iterator to that element unchanged and returns false.
315 *
316 * Invalidates all outstanding iterators for this tree on this transaction.
317 *
318 * @param c [in] op context
319 * @param iter [in] hint, insertion constant if immediately prior to iter
320 * @param laddr [in] addr at which to insert
321 * @param val [in] val to insert
322 * @return pair<iter, bool> where iter points to element at addr, bool true
323 * iff element at laddr did not exist.
324 */
325 using insert_iertr = base_iertr;
326 using insert_ret = insert_iertr::future<std::pair<iterator, bool>>;
327 insert_ret insert(
328 op_context_t c,
329 iterator iter,
330 laddr_t laddr,
331 lba_map_val_t val
332 );
333 insert_ret insert(
334 op_context_t c,
335 laddr_t laddr,
336 lba_map_val_t val) {
337 return lower_bound(
338 c, laddr
339 ).si_then([this, c, laddr, val](auto iter) {
340 return insert(c, iter, laddr, val);
341 });
342 }
343
344 /**
345 * update
346 *
347 * Invalidates all outstanding iterators for this tree on this transaction.
348 *
349 * @param c [in] op context
350 * @param iter [in] iterator to element to update, must not be end
351 * @param val [in] val with which to update
352 * @return iterator to newly updated element
353 */
354 using update_iertr = base_iertr;
355 using update_ret = update_iertr::future<iterator>;
356 update_ret update(
357 op_context_t c,
358 iterator iter,
359 lba_map_val_t val);
360
361 /**
362 * remove
363 *
364 * Invalidates all outstanding iterators for this tree on this transaction.
365 *
366 * @param c [in] op context
367 * @param iter [in] iterator to element to remove, must not be end
368 */
369 using remove_iertr = base_iertr;
370 using remove_ret = remove_iertr::future<>;
371 remove_ret remove(
372 op_context_t c,
373 iterator iter);
374
375 /**
376 * init_cached_extent
377 *
378 * Checks whether e is live (reachable from lba tree) and drops or initializes
379 * accordingly.
380 *
381 * Returns e if live and a null CachedExtentRef otherwise.
382 */
383 using init_cached_extent_iertr = base_iertr;
384 using init_cached_extent_ret = init_cached_extent_iertr::future<CachedExtentRef>;
385 init_cached_extent_ret init_cached_extent(op_context_t c, CachedExtentRef e);
386
387 /// get_leaf_if_live: get leaf node at laddr/addr if still live
388 using get_leaf_if_live_iertr = base_iertr;
389 using get_leaf_if_live_ret = get_leaf_if_live_iertr::future<CachedExtentRef>;
390 get_leaf_if_live_ret get_leaf_if_live(
391 op_context_t c,
392 paddr_t addr,
393 laddr_t laddr,
394 segment_off_t len);
395
396 /// get_internal_if_live: get internal node at laddr/addr if still live
397 using get_internal_if_live_iertr = base_iertr;
398 using get_internal_if_live_ret = get_internal_if_live_iertr::future<CachedExtentRef>;
399 get_internal_if_live_ret get_internal_if_live(
400 op_context_t c,
401 paddr_t addr,
402 laddr_t laddr,
403 segment_off_t len);
404
405 /**
406 * rewrite_lba_extent
407 *
408 * Rewrites a fresh copy of extent into transaction and updates internal
409 * references.
410 */
411 using rewrite_lba_extent_iertr = base_iertr;
412 using rewrite_lba_extent_ret = rewrite_lba_extent_iertr::future<>;
413 rewrite_lba_extent_ret rewrite_lba_extent(op_context_t c, CachedExtentRef e);
414
415private:
416 lba_root_t root;
417 bool root_dirty = false;
418
419 using get_internal_node_iertr = base_iertr;
420 using get_internal_node_ret = get_internal_node_iertr::future<LBAInternalNodeRef>;
421 static get_internal_node_ret get_internal_node(
422 op_context_t c,
423 depth_t depth,
424 paddr_t offset,
425 laddr_t begin,
426 laddr_t end);
427
428 using get_leaf_node_iertr = base_iertr;
429 using get_leaf_node_ret = get_leaf_node_iertr::future<LBALeafNodeRef>;
430 static get_leaf_node_ret get_leaf_node(
431 op_context_t c,
432 paddr_t offset,
433 laddr_t begin,
434 laddr_t end);
435
436 using lookup_root_iertr = base_iertr;
437 using lookup_root_ret = lookup_root_iertr::future<>;
438 lookup_root_ret lookup_root(
439 op_context_t c,
440 iterator &iter,
441 mapped_space_visitor_t *visitor) const {
442 if (root.get_depth() > 1) {
443 return get_internal_node(
444 c,
445 root.get_depth(),
446 root.get_location(),
447 0,
448 L_ADDR_MAX
449 ).si_then([this, visitor, &iter](LBAInternalNodeRef root_node) {
450 iter.get_internal(root.get_depth()).node = root_node;
451 if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length());
452 return lookup_root_iertr::now();
453 });
454 } else {
455 return get_leaf_node(
456 c,
457 root.get_location(),
458 0,
459 L_ADDR_MAX
460 ).si_then([visitor, &iter](LBALeafNodeRef root_node) {
461 iter.leaf.node = root_node;
462 if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length());
463 return lookup_root_iertr::now();
464 });
465 }
466 }
467
468 using lookup_internal_level_iertr = base_iertr;
469 using lookup_internal_level_ret = lookup_internal_level_iertr::future<>;
470 template <typename F>
471 static lookup_internal_level_ret lookup_internal_level(
472 op_context_t c,
473 depth_t depth,
474 iterator &iter,
475 F &f,
476 mapped_space_visitor_t *visitor
477 ) {
478 assert(depth > 1);
479 auto &parent_entry = iter.get_internal(depth + 1);
480 auto parent = parent_entry.node;
481 auto node_iter = parent->iter_idx(parent_entry.pos);
482 auto next_iter = node_iter + 1;
483 auto begin = node_iter->get_key();
484 auto end = next_iter == parent->end()
485 ? parent->get_node_meta().end
486 : next_iter->get_key();
487 return get_internal_node(
488 c,
489 depth,
490 node_iter->get_val().maybe_relative_to(parent->get_paddr()),
491 begin,
492 end
493 ).si_then([depth, visitor, &iter, &f](LBAInternalNodeRef node) {
494 auto &entry = iter.get_internal(depth);
495 entry.node = node;
496 auto node_iter = f(*node);
497 assert(node_iter != node->end());
498 entry.pos = node_iter->get_offset();
499 if (visitor) (*visitor)(node->get_paddr(), node->get_length());
500 return seastar::now();
501 });
502 }
503
504 using lookup_leaf_iertr = base_iertr;
505 using lookup_leaf_ret = lookup_leaf_iertr::future<>;
506 template <typename F>
507 static lookup_internal_level_ret lookup_leaf(
508 op_context_t c,
509 iterator &iter,
510 F &f,
511 mapped_space_visitor_t *visitor
512 ) {
513 auto &parent_entry = iter.get_internal(2);
514 auto parent = parent_entry.node;
515 assert(parent);
516 auto node_iter = parent->iter_idx(parent_entry.pos);
517 auto next_iter = node_iter + 1;
518 auto begin = node_iter->get_key();
519 auto end = next_iter == parent->end()
520 ? parent->get_node_meta().end
521 : next_iter->get_key();
522
523 return get_leaf_node(
524 c,
525 node_iter->get_val().maybe_relative_to(parent->get_paddr()),
526 begin,
527 end
528 ).si_then([visitor, &iter, &f](LBALeafNodeRef node) {
529 iter.leaf.node = node;
530 auto node_iter = f(*node);
531 iter.leaf.pos = node_iter->get_offset();
532 if (visitor) (*visitor)(node->get_paddr(), node->get_length());
533 return seastar::now();
534 });
535 }
536
537 /**
538 * lookup_depth_range
539 *
540 * Performs node lookups on depths [from, to) using li and ll to
541 * specific target at each level. Note, may leave the iterator
542 * at_boundary(), call handle_boundary() prior to returning out
543 * lf LBABtree.
544 */
545 using lookup_depth_range_iertr = base_iertr;
546 using lookup_depth_range_ret = lookup_depth_range_iertr::future<>;
547 template <typename LI, typename LL>
548 static lookup_depth_range_ret lookup_depth_range(
549 op_context_t c, ///< [in] context
550 iterator &iter, ///< [in,out] iterator to populate
551 depth_t from, ///< [in] from inclusive
552 depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop)
553 LI &li, ///< [in] internal->iterator
554 LL &ll, ///< [in] leaf->iterator
555 mapped_space_visitor_t *visitor ///< [in] mapped space visitor
556 ) {
557 LOG_PREFIX(LBATree::lookup_depth_range);
558 SUBDEBUGT(seastore_lba, "{} -> {}", c.trans, from, to);
559 return seastar::do_with(
560 from,
561 [c, to, visitor, &iter, &li, &ll](auto &d) {
562 return trans_intr::repeat(
563 [c, to, visitor, &iter, &li, &ll, &d] {
564 if (d > to) {
565 return [&] {
566 if (d > 1) {
567 return lookup_internal_level(
568 c,
569 d,
570 iter,
571 li,
572 visitor);
573 } else {
574 assert(d == 1);
575 return lookup_leaf(
576 c,
577 iter,
578 ll,
579 visitor);
580 }
581 }().si_then([&d] {
582 --d;
583 return lookup_depth_range_iertr::make_ready_future<
584 seastar::stop_iteration
585 >(seastar::stop_iteration::no);
586 });
587 } else {
588 return lookup_depth_range_iertr::make_ready_future<
589 seastar::stop_iteration
590 >(seastar::stop_iteration::yes);
591 }
592 });
593 });
594 }
595
596 using lookup_iertr = base_iertr;
597 using lookup_ret = lookup_iertr::future<iterator>;
598 template <typename LI, typename LL>
599 lookup_ret lookup(
600 op_context_t c,
601 LI &&lookup_internal,
602 LL &&lookup_leaf,
603 mapped_space_visitor_t *visitor
604 ) const {
605 LOG_PREFIX(LBATree::lookup);
606 return seastar::do_with(
607 iterator{root.get_depth()},
608 std::forward<LI>(lookup_internal),
609 std::forward<LL>(lookup_leaf),
610 [FNAME, this, visitor, c](auto &iter, auto &li, auto &ll) {
611 return lookup_root(
612 c, iter, visitor
613 ).si_then([FNAME, this, visitor, c, &iter, &li, &ll] {
614 if (iter.get_depth() > 1) {
615 auto &root_entry = *(iter.internal.rbegin());
616 root_entry.pos = li(*(root_entry.node)).get_offset();
617 } else {
618 auto &root_entry = iter.leaf;
619 auto riter = ll(*(root_entry.node));
620 root_entry.pos = riter->get_offset();
621 }
622 SUBDEBUGT(seastore_lba, "got root, depth {}", c.trans, root.get_depth());
623 return lookup_depth_range(
624 c,
625 iter,
626 root.get_depth() - 1,
627 0,
628 li,
629 ll,
630 visitor
631 ).si_then([c, visitor, &iter] {
632 if (iter.at_boundary()) {
633 return iter.handle_boundary(c, visitor);
634 } else {
635 return lookup_iertr::now();
636 }
637 });
638 }).si_then([&iter] {
639 return std::move(iter);
640 });
641 });
642 }
643
644 /**
645 * handle_split
646 *
647 * Prepare iter for insertion. iter should begin pointing at
648 * the valid insertion point (lower_bound(laddr)).
649 *
650 * Upon completion, iter will point at the
651 * position at which laddr should be inserted. iter may, upon completion,
652 * point at the end of a leaf other than the end leaf if that's the correct
653 * insertion point.
654 */
655 using find_insertion_iertr = base_iertr;
656 using find_insertion_ret = find_insertion_iertr::future<>;
657 static find_insertion_ret find_insertion(
658 op_context_t c,
659 laddr_t laddr,
660 iterator &iter);
661
662 /**
663 * handle_split
664 *
665 * Split nodes in iter as needed for insertion. First, scan iter from leaf
666 * to find first non-full level. Then, split from there towards leaf.
667 *
668 * Upon completion, iter will point at the newly split insertion point. As
669 * with find_insertion, iter's leaf pointer may be end without iter being
670 * end.
671 */
672 using handle_split_iertr = base_iertr;
673 using handle_split_ret = handle_split_iertr::future<>;
674 handle_split_ret handle_split(
675 op_context_t c,
676 iterator &iter);
677
678 using handle_merge_iertr = base_iertr;
679 using handle_merge_ret = handle_merge_iertr::future<>;
680 handle_merge_ret handle_merge(
681 op_context_t c,
682 iterator &iter);
683
684 using update_internal_mapping_iertr = base_iertr;
685 using update_internal_mapping_ret = update_internal_mapping_iertr::future<>;
686 update_internal_mapping_ret update_internal_mapping(
687 op_context_t c,
688 depth_t depth,
689 laddr_t laddr,
690 paddr_t old_addr,
691 paddr_t new_addr);
692
693 template <typename T>
694 using node_position_t = iterator::node_position_t<T>;
695
696 template <typename NodeType>
697 friend base_iertr::future<typename NodeType::Ref> get_node(
698 op_context_t c,
699 depth_t depth,
700 paddr_t addr,
701 laddr_t begin,
702 laddr_t end);
703
704 template <typename NodeType>
705 friend handle_merge_ret merge_level(
706 op_context_t c,
707 depth_t depth,
708 node_position_t<LBAInternalNode> &parent_pos,
709 node_position_t<NodeType> &pos);
710};
711
712}