]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- |
2 | // vim: ts=8 sw=2 smarttab expandtab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include <boost/container/static_vector.hpp> | |
7 | #include <sys/mman.h> | |
8 | #include <memory> | |
9 | #include <string.h> | |
10 | ||
11 | #include "crimson/os/seastore/logging.h" | |
12 | ||
13 | #include "crimson/os/seastore/cache.h" | |
14 | #include "crimson/os/seastore/seastore_types.h" | |
15 | #include "crimson/os/seastore/btree/btree_range_pin.h" | |
16 | #include "crimson/os/seastore/root_block.h" | |
17 | ||
18 | #define RESERVATION_PTR reinterpret_cast<ChildableCachedExtent*>(0x1) | |
19 | ||
20 | namespace crimson::os::seastore::lba_manager::btree { | |
21 | struct lba_map_val_t; | |
22 | } | |
23 | ||
24 | namespace crimson::os::seastore { | |
25 | ||
26 | bool is_valid_child_ptr(ChildableCachedExtent* child); | |
27 | ||
28 | template <typename T> | |
29 | phy_tree_root_t& get_phy_tree_root(root_t& r); | |
30 | ||
31 | using get_phy_tree_root_node_ret = | |
32 | std::pair<bool, | |
33 | ::crimson::interruptible::interruptible_future< | |
34 | typename trans_intr::condition, CachedExtentRef>>; | |
35 | ||
36 | template <typename T, typename key_t> | |
37 | const get_phy_tree_root_node_ret get_phy_tree_root_node( | |
38 | const RootBlockRef &root_block, | |
39 | op_context_t<key_t> c); | |
40 | ||
41 | template <typename ROOT_T> | |
42 | void link_phy_tree_root_node(RootBlockRef &root_block, ROOT_T* root_node); | |
43 | ||
44 | template <typename T> | |
45 | void unlink_phy_tree_root_node(RootBlockRef &root_block); | |
46 | ||
47 | template <typename T> | |
48 | Transaction::tree_stats_t& get_tree_stats(Transaction &t); | |
49 | ||
50 | template < | |
51 | typename node_key_t, | |
52 | typename node_val_t, | |
53 | typename internal_node_t, | |
54 | typename leaf_node_t, | |
55 | typename pin_t, | |
56 | size_t node_size, | |
57 | bool leaf_has_children> | |
58 | class FixedKVBtree { | |
59 | static constexpr size_t MAX_DEPTH = 16; | |
60 | using self_type = FixedKVBtree< | |
61 | node_key_t, | |
62 | node_val_t, | |
63 | internal_node_t, | |
64 | leaf_node_t, | |
65 | pin_t, | |
66 | node_size, | |
67 | leaf_has_children>; | |
68 | public: | |
69 | using InternalNodeRef = TCachedExtentRef<internal_node_t>; | |
70 | using LeafNodeRef = TCachedExtentRef<leaf_node_t>; | |
71 | ||
72 | using base_ertr = crimson::errorator< | |
73 | crimson::ct_error::input_output_error>; | |
74 | using base_iertr = trans_iertr<base_ertr>; | |
75 | ||
76 | class iterator; | |
77 | using iterator_fut = base_iertr::future<iterator>; | |
78 | ||
79 | using mapped_space_visitor_t = std::function< | |
80 | void(paddr_t, node_key_t, extent_len_t, depth_t, extent_types_t, iterator&)>; | |
81 | ||
82 | class iterator { | |
83 | public: | |
84 | iterator(const iterator &rhs) noexcept : | |
85 | internal(rhs.internal), leaf(rhs.leaf) {} | |
86 | iterator(iterator &&rhs) noexcept : | |
87 | internal(std::move(rhs.internal)), leaf(std::move(rhs.leaf)) {} | |
88 | ||
89 | iterator &operator=(const iterator &) = default; | |
90 | iterator &operator=(iterator &&) = default; | |
91 | ||
92 | iterator_fut next( | |
93 | op_context_t<node_key_t> c, | |
94 | mapped_space_visitor_t *visitor=nullptr) const | |
95 | { | |
96 | assert_valid(); | |
97 | assert(!is_end()); | |
98 | ||
99 | auto ret = *this; | |
100 | ret.leaf.pos++; | |
101 | if (ret.at_boundary()) { | |
102 | return seastar::do_with( | |
103 | ret, | |
104 | [c, visitor](auto &ret) mutable { | |
105 | return ret.handle_boundary( | |
106 | c, visitor | |
107 | ).si_then([&ret] { | |
108 | return std::move(ret); | |
109 | }); | |
110 | }); | |
111 | } else { | |
112 | return iterator_fut( | |
113 | interruptible::ready_future_marker{}, | |
114 | ret); | |
115 | } | |
116 | ||
117 | } | |
118 | ||
119 | iterator_fut prev(op_context_t<node_key_t> c) const | |
120 | { | |
121 | assert_valid(); | |
122 | assert(!is_begin()); | |
123 | ||
124 | auto ret = *this; | |
125 | ||
126 | if (ret.leaf.pos > 0) { | |
127 | ret.leaf.pos--; | |
128 | return iterator_fut( | |
129 | interruptible::ready_future_marker{}, | |
130 | ret); | |
131 | } | |
132 | ||
133 | depth_t depth_with_space = 2; | |
134 | for (; depth_with_space <= get_depth(); ++depth_with_space) { | |
135 | if (ret.get_internal(depth_with_space).pos > 0) { | |
136 | break; | |
137 | } | |
138 | } | |
139 | ||
140 | assert(depth_with_space <= ret.get_depth()); // must not be begin() | |
141 | return seastar::do_with( | |
142 | std::move(ret), | |
143 | [](const internal_node_t &internal) { return --internal.end(); }, | |
144 | [](const leaf_node_t &leaf) { return --leaf.end(); }, | |
145 | [c, depth_with_space](auto &ret, auto &li, auto &ll) { | |
146 | for (depth_t depth = 2; depth < depth_with_space; ++depth) { | |
147 | ret.get_internal(depth).reset(); | |
148 | } | |
149 | ret.leaf.reset(); | |
150 | ret.get_internal(depth_with_space).pos--; | |
151 | // note, cannot result in at_boundary() by construction | |
152 | return lookup_depth_range( | |
153 | c, ret, depth_with_space - 1, 0, li, ll, nullptr | |
154 | ).si_then([&ret] { | |
155 | assert(!ret.at_boundary()); | |
156 | return std::move(ret); | |
157 | }); | |
158 | }); | |
159 | } | |
160 | ||
161 | void assert_valid() const { | |
162 | assert(leaf.node); | |
163 | assert(leaf.pos <= leaf.node->get_size()); | |
164 | ||
165 | for (auto &i: internal) { | |
166 | (void)i; | |
167 | assert(i.node); | |
168 | assert(i.pos < i.node->get_size()); | |
169 | } | |
170 | } | |
171 | ||
172 | depth_t get_depth() const { | |
173 | return internal.size() + 1; | |
174 | } | |
175 | ||
176 | auto &get_internal(depth_t depth) { | |
177 | assert(depth > 1); | |
178 | assert((depth - 2) < internal.size()); | |
179 | return internal[depth - 2]; | |
180 | } | |
181 | ||
182 | const auto &get_internal(depth_t depth) const { | |
183 | assert(depth > 1); | |
184 | assert((depth - 2) < internal.size()); | |
185 | return internal[depth - 2]; | |
186 | } | |
187 | ||
188 | node_key_t get_key() const { | |
189 | assert(!is_end()); | |
190 | return leaf.node->iter_idx(leaf.pos).get_key(); | |
191 | } | |
192 | node_val_t get_val() const { | |
193 | assert(!is_end()); | |
194 | auto ret = leaf.node->iter_idx(leaf.pos).get_val(); | |
195 | if constexpr ( | |
196 | std::is_same_v<crimson::os::seastore::lba_manager::btree::lba_map_val_t, | |
197 | node_val_t>) { | |
198 | ret.paddr = ret.paddr.maybe_relative_to(leaf.node->get_paddr()); | |
199 | } | |
200 | return ret; | |
201 | } | |
202 | ||
203 | bool is_end() const { | |
204 | // external methods may only resolve at a boundary if at end | |
205 | return at_boundary(); | |
206 | } | |
207 | ||
208 | bool is_begin() const { | |
209 | for (auto &i: internal) { | |
210 | if (i.pos != 0) | |
211 | return false; | |
212 | } | |
213 | return leaf.pos == 0; | |
214 | } | |
215 | ||
216 | PhysicalNodeMappingRef<node_key_t, typename pin_t::val_type> | |
217 | get_pin(op_context_t<node_key_t> ctx) const { | |
218 | assert(!is_end()); | |
219 | auto val = get_val(); | |
220 | auto key = get_key(); | |
221 | return std::make_unique<pin_t>( | |
222 | ctx, | |
223 | leaf.node, | |
224 | leaf.pos, | |
225 | val, | |
226 | fixed_kv_node_meta_t<node_key_t>{ key, key + val.len, 0 }); | |
227 | } | |
228 | ||
229 | typename leaf_node_t::Ref get_leaf_node() { | |
230 | return leaf.node; | |
231 | } | |
232 | ||
233 | uint16_t get_leaf_pos() { | |
234 | return leaf.pos; | |
235 | } | |
236 | private: | |
237 | iterator() noexcept {} | |
238 | iterator(depth_t depth) noexcept : internal(depth - 1) {} | |
239 | ||
240 | friend class FixedKVBtree; | |
241 | static constexpr uint16_t INVALID = std::numeric_limits<uint16_t>::max(); | |
242 | template <typename NodeType> | |
243 | struct node_position_t { | |
244 | typename NodeType::Ref node; | |
245 | uint16_t pos = INVALID; | |
246 | ||
247 | node_position_t() = default; | |
248 | node_position_t( | |
249 | typename NodeType::Ref node, | |
250 | uint16_t pos) | |
251 | : node(node), pos(pos) {} | |
252 | ||
253 | void reset() { | |
254 | *this = node_position_t{}; | |
255 | } | |
256 | ||
257 | auto get_iter() { | |
258 | assert(pos != INVALID); | |
259 | assert(pos < node->get_size()); | |
260 | return node->iter_idx(pos); | |
261 | } | |
262 | }; | |
263 | boost::container::static_vector< | |
264 | node_position_t<internal_node_t>, MAX_DEPTH> internal; | |
265 | node_position_t<leaf_node_t> leaf; | |
266 | ||
267 | bool at_boundary() const { | |
268 | assert(leaf.pos <= leaf.node->get_size()); | |
269 | return leaf.pos == leaf.node->get_size(); | |
270 | } | |
271 | ||
272 | using handle_boundary_ertr = base_iertr; | |
273 | using handle_boundary_ret = handle_boundary_ertr::future<>; | |
274 | handle_boundary_ret handle_boundary( | |
275 | op_context_t<node_key_t> c, | |
276 | mapped_space_visitor_t *visitor) | |
277 | { | |
278 | assert(at_boundary()); | |
279 | depth_t depth_with_space = 2; | |
280 | for (; depth_with_space <= get_depth(); ++depth_with_space) { | |
281 | if ((get_internal(depth_with_space).pos + 1) < | |
282 | get_internal(depth_with_space).node->get_size()) { | |
283 | break; | |
284 | } | |
285 | } | |
286 | ||
287 | if (depth_with_space <= get_depth()) { | |
288 | return seastar::do_with( | |
289 | [](const internal_node_t &internal) { return internal.begin(); }, | |
290 | [](const leaf_node_t &leaf) { return leaf.begin(); }, | |
291 | [this, c, depth_with_space, visitor](auto &li, auto &ll) { | |
292 | for (depth_t depth = 2; depth < depth_with_space; ++depth) { | |
293 | get_internal(depth).reset(); | |
294 | } | |
295 | leaf.reset(); | |
296 | get_internal(depth_with_space).pos++; | |
297 | // note, cannot result in at_boundary() by construction | |
298 | return lookup_depth_range( | |
299 | c, *this, depth_with_space - 1, 0, li, ll, visitor | |
300 | ); | |
301 | }); | |
302 | } else { | |
303 | // end | |
304 | return seastar::now(); | |
305 | } | |
306 | } | |
307 | ||
308 | depth_t check_split() const { | |
309 | if (!leaf.node->at_max_capacity()) { | |
310 | return 0; | |
311 | } | |
312 | for (depth_t split_from = 1; split_from < get_depth(); ++split_from) { | |
313 | if (!get_internal(split_from + 1).node->at_max_capacity()) | |
314 | return split_from; | |
315 | } | |
316 | return get_depth(); | |
317 | } | |
318 | ||
319 | depth_t check_merge() const { | |
320 | if (!leaf.node->below_min_capacity()) { | |
321 | return 0; | |
322 | } | |
323 | for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) { | |
324 | if (!get_internal(merge_from + 1).node->below_min_capacity()) | |
325 | return merge_from; | |
326 | } | |
327 | return get_depth(); | |
328 | } | |
329 | }; | |
330 | ||
331 | FixedKVBtree(RootBlockRef &root_block) : root_block(root_block) {} | |
332 | ||
333 | auto& get_root() { | |
334 | return get_phy_tree_root<self_type>(root_block->get_root()); | |
335 | } | |
336 | ||
337 | auto& get_root() const { | |
338 | return get_phy_tree_root<self_type>(root_block->get_root()); | |
339 | } | |
340 | ||
341 | template <typename T> | |
342 | void set_root_node(const TCachedExtentRef<T> &root_node) { | |
343 | static_assert(std::is_base_of_v<typename internal_node_t::base_t, T>); | |
344 | link_phy_tree_root_node(root_block, root_node.get()); | |
345 | } | |
346 | ||
347 | auto get_root_node(op_context_t<node_key_t> c) const { | |
348 | return get_phy_tree_root_node<self_type>(root_block, c); | |
349 | } | |
350 | ||
351 | /// mkfs | |
352 | using mkfs_ret = phy_tree_root_t; | |
353 | static mkfs_ret mkfs(RootBlockRef &root_block, op_context_t<node_key_t> c) { | |
354 | assert(root_block->is_mutation_pending()); | |
355 | auto root_leaf = c.cache.template alloc_new_extent<leaf_node_t>( | |
356 | c.trans, | |
357 | node_size, | |
358 | placement_hint_t::HOT, | |
359 | INIT_GENERATION); | |
360 | root_leaf->set_size(0); | |
361 | fixed_kv_node_meta_t<node_key_t> meta{min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, 1}; | |
362 | root_leaf->set_meta(meta); | |
363 | root_leaf->range = meta; | |
364 | get_tree_stats<self_type>(c.trans).depth = 1u; | |
365 | get_tree_stats<self_type>(c.trans).extents_num_delta++; | |
366 | link_phy_tree_root_node(root_block, root_leaf.get()); | |
367 | return phy_tree_root_t{root_leaf->get_paddr(), 1u}; | |
368 | } | |
369 | ||
370 | /** | |
371 | * lower_bound | |
372 | * | |
373 | * @param c [in] context | |
374 | * @param addr [in] ddr | |
375 | * @return least iterator >= key | |
376 | */ | |
377 | iterator_fut lower_bound( | |
378 | op_context_t<node_key_t> c, | |
379 | node_key_t addr, | |
380 | mapped_space_visitor_t *visitor=nullptr, | |
381 | depth_t min_depth = 1) const | |
382 | { | |
383 | LOG_PREFIX(FixedKVBtree::lower_bound); | |
384 | return lookup( | |
385 | c, | |
386 | [addr](const internal_node_t &internal) { | |
387 | assert(internal.get_size() > 0); | |
388 | auto iter = internal.upper_bound(addr); | |
389 | assert(iter != internal.begin()); | |
390 | --iter; | |
391 | return iter; | |
392 | }, | |
393 | [FNAME, c, addr](const leaf_node_t &leaf) { | |
394 | auto ret = leaf.lower_bound(addr); | |
395 | SUBTRACET( | |
396 | seastore_fixedkv_tree, | |
397 | "leaf addr {}, got ret offset {}, size {}, end {}", | |
398 | c.trans, | |
399 | addr, | |
400 | ret.get_offset(), | |
401 | leaf.get_size(), | |
402 | ret == leaf.end()); | |
403 | return ret; | |
404 | }, | |
405 | min_depth, | |
406 | visitor | |
407 | ).si_then([FNAME, c, min_depth](auto &&ret) { | |
408 | SUBTRACET( | |
409 | seastore_fixedkv_tree, | |
410 | "ret.leaf.pos {}", | |
411 | c.trans, | |
412 | ret.leaf.pos); | |
413 | if (min_depth == 1) { | |
414 | ret.assert_valid(); | |
415 | } | |
416 | return std::move(ret); | |
417 | }); | |
418 | } | |
419 | ||
420 | ||
421 | /** | |
422 | * upper_bound | |
423 | * | |
424 | * @param c [in] context | |
425 | * @param addr [in] ddr | |
426 | * @return least iterator > key | |
427 | */ | |
428 | iterator_fut upper_bound( | |
429 | op_context_t<node_key_t> c, | |
430 | node_key_t addr | |
431 | ) const { | |
432 | return lower_bound( | |
433 | c, addr | |
434 | ).si_then([c, addr](auto iter) { | |
435 | if (!iter.is_end() && iter.get_key() == addr) { | |
436 | return iter.next(c); | |
437 | } else { | |
438 | return iterator_fut( | |
439 | interruptible::ready_future_marker{}, | |
440 | iter); | |
441 | } | |
442 | }); | |
443 | } | |
444 | ||
445 | /** | |
446 | * upper_bound_right | |
447 | * | |
448 | * @param c [in] context | |
449 | * @param addr [in] addr | |
450 | * @return least iterator i s.t. i.get_key() + i.get_val().len > key | |
451 | */ | |
452 | iterator_fut upper_bound_right( | |
453 | op_context_t<node_key_t> c, | |
454 | node_key_t addr) const | |
455 | { | |
456 | return lower_bound( | |
457 | c, addr | |
458 | ).si_then([c, addr](auto iter) { | |
459 | if (iter.is_begin()) { | |
460 | return iterator_fut( | |
461 | interruptible::ready_future_marker{}, | |
462 | iter); | |
463 | } else { | |
464 | return iter.prev( | |
465 | c | |
466 | ).si_then([iter, addr](auto prev) { | |
467 | if ((prev.get_key() + prev.get_val().len) > addr) { | |
468 | return iterator_fut( | |
469 | interruptible::ready_future_marker{}, | |
470 | prev); | |
471 | } else { | |
472 | return iterator_fut( | |
473 | interruptible::ready_future_marker{}, | |
474 | iter); | |
475 | } | |
476 | }); | |
477 | } | |
478 | }); | |
479 | } | |
480 | ||
481 | iterator_fut begin(op_context_t<node_key_t> c) const { | |
482 | return lower_bound(c, 0); | |
483 | } | |
484 | iterator_fut end(op_context_t<node_key_t> c) const { | |
485 | return upper_bound(c, min_max_t<node_key_t>::max); | |
486 | } | |
487 | ||
488 | template <typename child_node_t, typename node_t> | |
489 | void check_node( | |
490 | op_context_t<node_key_t> c, | |
491 | TCachedExtentRef<node_t> node) | |
492 | { | |
493 | for (auto i : *node) { | |
494 | CachedExtentRef child_node; | |
495 | Transaction::get_extent_ret ret; | |
496 | ||
497 | if constexpr (std::is_base_of_v<typename internal_node_t::base_t, child_node_t>) { | |
498 | ret = c.trans.get_extent( | |
499 | i->get_val().maybe_relative_to(node->get_paddr()), | |
500 | &child_node); | |
501 | } else { | |
502 | if constexpr (leaf_has_children) { | |
503 | ret = c.trans.get_extent( | |
504 | i->get_val().paddr.maybe_relative_to(node->get_paddr()), | |
505 | &child_node); | |
506 | } | |
507 | } | |
508 | if (ret == Transaction::get_extent_ret::PRESENT) { | |
509 | if (child_node->is_mutation_pending()) { | |
510 | auto &prior = (child_node_t &)*child_node->prior_instance; | |
511 | assert(prior.is_valid()); | |
512 | assert(prior.is_parent_valid()); | |
513 | if (node->is_mutation_pending()) { | |
514 | auto &n = node->get_stable_for_key(i->get_key()); | |
515 | assert(prior.get_parent_node().get() == &n); | |
516 | auto pos = n.lower_bound_offset(i->get_key()); | |
517 | assert(pos < n.get_node_size()); | |
518 | assert(n.children[pos] == &prior); | |
519 | } else { | |
520 | assert(prior.get_parent_node().get() == node.get()); | |
521 | assert(node->children[i->get_offset()] == &prior); | |
522 | } | |
523 | } else if (child_node->is_initial_pending()) { | |
524 | auto cnode = child_node->template cast<child_node_t>(); | |
525 | auto pos = node->find(i->get_key()).get_offset(); | |
526 | auto child = node->children[pos]; | |
527 | assert(child); | |
528 | assert(child == cnode.get()); | |
529 | assert(cnode->is_parent_valid()); | |
530 | } else { | |
531 | assert(child_node->is_valid()); | |
532 | auto cnode = child_node->template cast<child_node_t>(); | |
533 | assert(cnode->has_parent_tracker()); | |
534 | if (node->is_pending()) { | |
535 | auto &n = node->get_stable_for_key(i->get_key()); | |
536 | assert(cnode->get_parent_node().get() == &n); | |
537 | auto pos = n.lower_bound_offset(i->get_key()); | |
538 | assert(pos < n.get_node_size()); | |
539 | assert(n.children[pos] == cnode.get()); | |
540 | } else { | |
541 | assert(cnode->get_parent_node().get() == node.get()); | |
542 | assert(node->children[i->get_offset()] == cnode.get()); | |
543 | } | |
544 | } | |
545 | } else if (ret == Transaction::get_extent_ret::ABSENT) { | |
546 | ChildableCachedExtent* child = nullptr; | |
547 | if (node->is_pending()) { | |
548 | auto &n = node->get_stable_for_key(i->get_key()); | |
549 | auto pos = n.lower_bound_offset(i->get_key()); | |
550 | assert(pos < n.get_node_size()); | |
551 | child = n.children[pos]; | |
552 | if (is_valid_child_ptr(child)) { | |
553 | auto c = (child_node_t*)child; | |
554 | assert(c->has_parent_tracker()); | |
555 | assert(c->get_parent_node().get() == &n); | |
556 | } | |
557 | } else { | |
558 | child = node->children[i->get_offset()]; | |
559 | if (is_valid_child_ptr(child)) { | |
560 | auto c = (child_node_t*)child; | |
561 | assert(c->has_parent_tracker()); | |
562 | assert(c->get_parent_node().get() == node.get()); | |
563 | } | |
564 | } | |
565 | ||
566 | if (!is_valid_child_ptr(child)) { | |
567 | if constexpr ( | |
568 | std::is_base_of_v<typename internal_node_t::base_t, child_node_t>) | |
569 | { | |
570 | assert(!c.cache.query_cache(i->get_val(), nullptr)); | |
571 | } else { | |
572 | if constexpr (leaf_has_children) { | |
573 | assert(!c.cache.query_cache(i->get_val().paddr, nullptr)); | |
574 | } | |
575 | } | |
576 | } | |
577 | } else { | |
578 | ceph_abort("impossible"); | |
579 | } | |
580 | } | |
581 | } | |
582 | ||
583 | using check_child_trackers_ret = base_iertr::future<>; | |
584 | check_child_trackers_ret check_child_trackers( | |
585 | op_context_t<node_key_t> c) { | |
586 | mapped_space_visitor_t checker = [c, this]( | |
587 | paddr_t, | |
588 | node_key_t, | |
589 | extent_len_t, | |
590 | depth_t depth, | |
591 | extent_types_t, | |
592 | iterator& iter) { | |
593 | if constexpr (!leaf_has_children) { | |
594 | if (depth == 1) { | |
595 | return seastar::now(); | |
596 | } | |
597 | } | |
598 | if (depth > 1) { | |
599 | auto &node = iter.get_internal(depth).node; | |
600 | assert(node->is_valid()); | |
601 | check_node<typename internal_node_t::base_t>(c, node); | |
602 | } else { | |
603 | assert(depth == 1); | |
604 | auto &node = iter.leaf.node; | |
605 | assert(node->is_valid()); | |
606 | check_node<LogicalCachedExtent>(c, node); | |
607 | } | |
608 | return seastar::now(); | |
609 | }; | |
610 | ||
611 | return seastar::do_with( | |
612 | std::move(checker), | |
613 | [this, c](auto &checker) { | |
614 | return iterate_repeat( | |
615 | c, | |
616 | lower_bound( | |
617 | c, | |
618 | min_max_t<node_key_t>::min, | |
619 | &checker), | |
620 | [](auto &pos) { | |
621 | if (pos.is_end()) { | |
622 | return base_iertr::make_ready_future< | |
623 | seastar::stop_iteration>( | |
624 | seastar::stop_iteration::yes); | |
625 | } | |
626 | return base_iertr::make_ready_future< | |
627 | seastar::stop_iteration>( | |
628 | seastar::stop_iteration::no); | |
629 | }, | |
630 | &checker); | |
631 | }); | |
632 | } | |
633 | ||
634 | using iterate_repeat_ret_inner = base_iertr::future< | |
635 | seastar::stop_iteration>; | |
636 | template <typename F> | |
637 | static base_iertr::future<> iterate_repeat( | |
638 | op_context_t<node_key_t> c, | |
639 | iterator_fut &&iter_fut, | |
640 | F &&f, | |
641 | mapped_space_visitor_t *visitor=nullptr) { | |
642 | return std::move( | |
643 | iter_fut | |
644 | ).si_then([c, visitor, f=std::forward<F>(f)](auto iter) { | |
645 | return seastar::do_with( | |
646 | iter, | |
647 | std::move(f), | |
648 | [c, visitor](auto &pos, auto &f) { | |
649 | return trans_intr::repeat( | |
650 | [c, visitor, &f, &pos] { | |
651 | return f( | |
652 | pos | |
653 | ).si_then([c, visitor, &pos](auto done) { | |
654 | if (done == seastar::stop_iteration::yes) { | |
655 | return iterate_repeat_ret_inner( | |
656 | interruptible::ready_future_marker{}, | |
657 | seastar::stop_iteration::yes); | |
658 | } else { | |
659 | ceph_assert(!pos.is_end()); | |
660 | return pos.next( | |
661 | c, visitor | |
662 | ).si_then([&pos](auto next) { | |
663 | pos = next; | |
664 | return iterate_repeat_ret_inner( | |
665 | interruptible::ready_future_marker{}, | |
666 | seastar::stop_iteration::no); | |
667 | }); | |
668 | } | |
669 | }); | |
670 | }); | |
671 | }); | |
672 | }); | |
673 | } | |
674 | ||
675 | /** | |
676 | * insert | |
677 | * | |
678 | * Inserts val at laddr with iter as a hint. If element at laddr already | |
679 | * exists returns iterator to that element unchanged and returns false. | |
680 | * | |
681 | * Invalidates all outstanding iterators for this tree on this transaction. | |
682 | * | |
683 | * @param c [in] op context | |
684 | * @param iter [in] hint, insertion constant if immediately prior to iter | |
685 | * @param laddr [in] addr at which to insert | |
686 | * @param val [in] val to insert | |
687 | * @return pair<iter, bool> where iter points to element at addr, bool true | |
688 | * iff element at laddr did not exist. | |
689 | */ | |
690 | using insert_iertr = base_iertr; | |
691 | using insert_ret = insert_iertr::future<std::pair<iterator, bool>>; | |
692 | insert_ret insert( | |
693 | op_context_t<node_key_t> c, | |
694 | iterator iter, | |
695 | node_key_t laddr, | |
696 | node_val_t val, | |
697 | LogicalCachedExtent* nextent | |
698 | ) { | |
699 | LOG_PREFIX(FixedKVBtree::insert); | |
700 | SUBTRACET( | |
701 | seastore_fixedkv_tree, | |
702 | "inserting laddr {} at iter {}", | |
703 | c.trans, | |
704 | laddr, | |
705 | iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key()); | |
706 | return seastar::do_with( | |
707 | iter, | |
708 | [this, c, laddr, val, nextent](auto &ret) { | |
709 | return find_insertion( | |
710 | c, laddr, ret | |
711 | ).si_then([this, c, laddr, val, &ret, nextent] { | |
712 | if (!ret.at_boundary() && ret.get_key() == laddr) { | |
713 | return insert_ret( | |
714 | interruptible::ready_future_marker{}, | |
715 | std::make_pair(ret, false)); | |
716 | } else { | |
717 | ++(get_tree_stats<self_type>(c.trans).num_inserts); | |
718 | return handle_split( | |
719 | c, ret | |
720 | ).si_then([c, laddr, val, &ret, nextent] { | |
721 | if (!ret.leaf.node->is_mutable()) { | |
722 | CachedExtentRef mut = c.cache.duplicate_for_write( | |
723 | c.trans, ret.leaf.node | |
724 | ); | |
725 | ret.leaf.node = mut->cast<leaf_node_t>(); | |
726 | } | |
727 | auto iter = typename leaf_node_t::const_iterator( | |
728 | ret.leaf.node.get(), ret.leaf.pos); | |
729 | assert(iter == ret.leaf.node->lower_bound(laddr)); | |
730 | assert(iter == ret.leaf.node->end() || iter->get_key() > laddr); | |
731 | assert(laddr >= ret.leaf.node->get_meta().begin && | |
732 | laddr < ret.leaf.node->get_meta().end); | |
733 | ret.leaf.node->insert(iter, laddr, val, nextent); | |
734 | return insert_ret( | |
735 | interruptible::ready_future_marker{}, | |
736 | std::make_pair(ret, true)); | |
737 | }); | |
738 | } | |
739 | }); | |
740 | }); | |
741 | } | |
742 | ||
743 | insert_ret insert( | |
744 | op_context_t<node_key_t> c, | |
745 | node_key_t laddr, | |
746 | node_val_t val, | |
747 | LogicalCachedExtent* nextent) { | |
748 | return lower_bound( | |
749 | c, laddr | |
750 | ).si_then([this, c, laddr, val, nextent](auto iter) { | |
751 | return this->insert(c, iter, laddr, val, nextent); | |
752 | }); | |
753 | } | |
754 | ||
755 | /** | |
756 | * update | |
757 | * | |
758 | * Invalidates all outstanding iterators for this tree on this transaction. | |
759 | * | |
760 | * @param c [in] op context | |
761 | * @param iter [in] iterator to element to update, must not be end | |
762 | * @param val [in] val with which to update | |
763 | * @return iterator to newly updated element | |
764 | */ | |
765 | using update_iertr = base_iertr; | |
766 | using update_ret = update_iertr::future<iterator>; | |
767 | update_ret update( | |
768 | op_context_t<node_key_t> c, | |
769 | iterator iter, | |
770 | node_val_t val, | |
771 | LogicalCachedExtent* nextent) | |
772 | { | |
773 | LOG_PREFIX(FixedKVBtree::update); | |
774 | SUBTRACET( | |
775 | seastore_fixedkv_tree, | |
776 | "update element at {}", | |
777 | c.trans, | |
778 | iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key()); | |
779 | if (!iter.leaf.node->is_mutable()) { | |
780 | CachedExtentRef mut = c.cache.duplicate_for_write( | |
781 | c.trans, iter.leaf.node | |
782 | ); | |
783 | iter.leaf.node = mut->cast<leaf_node_t>(); | |
784 | } | |
785 | ++(get_tree_stats<self_type>(c.trans).num_updates); | |
786 | iter.leaf.node->update( | |
787 | iter.leaf.node->iter_idx(iter.leaf.pos), | |
788 | val, | |
789 | nextent); | |
790 | return update_ret( | |
791 | interruptible::ready_future_marker{}, | |
792 | iter); | |
793 | } | |
794 | ||
795 | ||
796 | /** | |
797 | * remove | |
798 | * | |
799 | * Invalidates all outstanding iterators for this tree on this transaction. | |
800 | * | |
801 | * @param c [in] op context | |
802 | * @param iter [in] iterator to element to remove, must not be end | |
803 | */ | |
804 | using remove_iertr = base_iertr; | |
805 | using remove_ret = remove_iertr::future<>; | |
806 | remove_ret remove( | |
807 | op_context_t<node_key_t> c, | |
808 | iterator iter) | |
809 | { | |
810 | LOG_PREFIX(FixedKVBtree::remove); | |
811 | SUBTRACET( | |
812 | seastore_fixedkv_tree, | |
813 | "remove element at {}", | |
814 | c.trans, | |
815 | iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key()); | |
816 | assert(!iter.is_end()); | |
817 | ++(get_tree_stats<self_type>(c.trans).num_erases); | |
818 | return seastar::do_with( | |
819 | iter, | |
820 | [this, c](auto &ret) { | |
821 | if (!ret.leaf.node->is_mutable()) { | |
822 | CachedExtentRef mut = c.cache.duplicate_for_write( | |
823 | c.trans, ret.leaf.node | |
824 | ); | |
825 | ret.leaf.node = mut->cast<leaf_node_t>(); | |
826 | } | |
827 | ret.leaf.node->remove( | |
828 | ret.leaf.node->iter_idx(ret.leaf.pos)); | |
829 | ||
830 | return handle_merge( | |
831 | c, ret | |
832 | ); | |
833 | }); | |
834 | } | |
835 | ||
836 | /** | |
837 | * init_cached_extent | |
838 | * | |
839 | * Checks whether e is live (reachable from fixed kv tree) and drops or initializes | |
840 | * accordingly. | |
841 | * | |
842 | * Returns if e is live. | |
843 | */ | |
844 | using init_cached_extent_iertr = base_iertr; | |
845 | using init_cached_extent_ret = init_cached_extent_iertr::future<bool>; | |
846 | init_cached_extent_ret init_cached_extent( | |
847 | op_context_t<node_key_t> c, | |
848 | CachedExtentRef e) | |
849 | { | |
850 | assert(!e->is_logical()); | |
851 | LOG_PREFIX(FixedKVTree::init_cached_extent); | |
852 | SUBTRACET(seastore_fixedkv_tree, "extent {}", c.trans, *e); | |
853 | if (e->get_type() == internal_node_t::TYPE) { | |
854 | auto eint = e->cast<internal_node_t>(); | |
855 | return lower_bound( | |
856 | c, eint->get_node_meta().begin | |
857 | ).si_then([e, c, eint](auto iter) { | |
858 | // Note, this check is valid even if iter.is_end() | |
859 | LOG_PREFIX(FixedKVTree::init_cached_extent); | |
860 | depth_t cand_depth = eint->get_node_meta().depth; | |
861 | if (cand_depth <= iter.get_depth() && | |
862 | &*iter.get_internal(cand_depth).node == &*eint) { | |
863 | SUBTRACET( | |
864 | seastore_fixedkv_tree, | |
865 | "extent {} is live", | |
866 | c.trans, | |
867 | *eint); | |
868 | return true; | |
869 | } else { | |
870 | SUBTRACET( | |
871 | seastore_fixedkv_tree, | |
872 | "extent {} is not live", | |
873 | c.trans, | |
874 | *eint); | |
875 | return false; | |
876 | } | |
877 | }); | |
878 | } else if (e->get_type() == leaf_node_t::TYPE) { | |
879 | auto eleaf = e->cast<leaf_node_t>(); | |
880 | return lower_bound( | |
881 | c, eleaf->get_node_meta().begin | |
882 | ).si_then([c, e, eleaf](auto iter) { | |
883 | // Note, this check is valid even if iter.is_end() | |
884 | LOG_PREFIX(FixedKVTree::init_cached_extent); | |
885 | if (iter.leaf.node == &*eleaf) { | |
886 | SUBTRACET( | |
887 | seastore_fixedkv_tree, | |
888 | "extent {} is live", | |
889 | c.trans, | |
890 | *eleaf); | |
891 | return true; | |
892 | } else { | |
893 | SUBTRACET( | |
894 | seastore_fixedkv_tree, | |
895 | "extent {} is not live", | |
896 | c.trans, | |
897 | *eleaf); | |
898 | return false; | |
899 | } | |
900 | }); | |
901 | } else { | |
902 | SUBTRACET( | |
903 | seastore_fixedkv_tree, | |
904 | "found other extent {} type {}", | |
905 | c.trans, | |
906 | *e, | |
907 | e->get_type()); | |
908 | return init_cached_extent_ret( | |
909 | interruptible::ready_future_marker{}, | |
910 | true); | |
911 | } | |
912 | } | |
913 | ||
914 | /// get_leaf_if_live: get leaf node at laddr/addr if still live | |
915 | using get_leaf_if_live_iertr = base_iertr; | |
916 | using get_leaf_if_live_ret = get_leaf_if_live_iertr::future<CachedExtentRef>; | |
917 | get_leaf_if_live_ret get_leaf_if_live( | |
918 | op_context_t<node_key_t> c, | |
919 | paddr_t addr, | |
920 | node_key_t laddr, | |
921 | extent_len_t len) | |
922 | { | |
923 | LOG_PREFIX(FixedKVBtree::get_leaf_if_live); | |
924 | return lower_bound( | |
925 | c, laddr | |
926 | ).si_then([FNAME, c, addr, laddr, len](auto iter) { | |
927 | if (iter.leaf.node->get_paddr() == addr) { | |
928 | SUBTRACET( | |
929 | seastore_fixedkv_tree, | |
930 | "extent laddr {} addr {}~{} found: {}", | |
931 | c.trans, | |
932 | laddr, | |
933 | addr, | |
934 | len, | |
935 | *iter.leaf.node); | |
936 | return CachedExtentRef(iter.leaf.node); | |
937 | } else { | |
938 | SUBTRACET( | |
939 | seastore_fixedkv_tree, | |
940 | "extent laddr {} addr {}~{} is not live, does not match node {}", | |
941 | c.trans, | |
942 | laddr, | |
943 | addr, | |
944 | len, | |
945 | *iter.leaf.node); | |
946 | return CachedExtentRef(); | |
947 | } | |
948 | }); | |
949 | } | |
950 | ||
951 | ||
  /// get_internal_if_live: get internal node at laddr/addr if still live
  using get_internal_if_live_iertr = base_iertr;
  using get_internal_if_live_ret = get_internal_if_live_iertr::future<CachedExtentRef>;
  /**
   * get_internal_if_live
   *
   * Walks the lookup path for laddr from depth 2 upward and returns the
   * internal node whose physical address matches addr.  Returns a null
   * CachedExtentRef if no node on the path matches (extent is not live).
   */
  get_internal_if_live_ret get_internal_if_live(
    op_context_t<node_key_t> c,
    paddr_t addr,
    node_key_t laddr,
    extent_len_t len)
  {
    LOG_PREFIX(FixedKVBtree::get_internal_if_live);
    return lower_bound(
      c, laddr
    ).si_then([FNAME, c, addr, laddr, len](auto iter) {
      // depth 1 is the leaf level; internal nodes start at depth 2
      for (depth_t d = 2; d <= iter.get_depth(); ++d) {
        CachedExtent &node = *iter.get_internal(d).node;
        auto internal_node = node.cast<internal_node_t>();
        if (internal_node->get_paddr() == addr) {
          SUBTRACET(
            seastore_fixedkv_tree,
            "extent laddr {} addr {}~{} found: {}",
            c.trans,
            laddr,
            addr,
            len,
            *internal_node);
          // a live internal node is expected to begin exactly at laddr
          assert(internal_node->get_node_meta().begin == laddr);
          return CachedExtentRef(internal_node);
        }
      }
      SUBTRACET(
        seastore_fixedkv_tree,
        "extent laddr {} addr {}~{} is not live, no matching internal node",
        c.trans,
        laddr,
        addr,
        len);
      return CachedExtentRef();
    });
  }
991 | ||
992 | ||
993 | /** | |
994 | * rewrite_extent | |
995 | * | |
996 | * Rewrites a fresh copy of extent into transaction and updates internal | |
997 | * references. | |
998 | */ | |
999 | using rewrite_extent_iertr = base_iertr; | |
1000 | using rewrite_extent_ret = rewrite_extent_iertr::future<>; | |
1001 | rewrite_extent_ret rewrite_extent( | |
1002 | op_context_t<node_key_t> c, | |
1003 | CachedExtentRef e) { | |
1004 | LOG_PREFIX(FixedKVBtree::rewrite_extent); | |
1005 | assert(is_lba_backref_node(e->get_type())); | |
1006 | ||
1007 | auto do_rewrite = [&](auto &fixed_kv_extent) { | |
1008 | auto n_fixed_kv_extent = c.cache.template alloc_new_extent< | |
1009 | std::remove_reference_t<decltype(fixed_kv_extent)> | |
1010 | >( | |
1011 | c.trans, | |
1012 | fixed_kv_extent.get_length(), | |
1013 | fixed_kv_extent.get_user_hint(), | |
1014 | // get target rewrite generation | |
1015 | fixed_kv_extent.get_rewrite_generation()); | |
1016 | fixed_kv_extent.get_bptr().copy_out( | |
1017 | 0, | |
1018 | fixed_kv_extent.get_length(), | |
1019 | n_fixed_kv_extent->get_bptr().c_str()); | |
1020 | n_fixed_kv_extent->set_modify_time(fixed_kv_extent.get_modify_time()); | |
1021 | n_fixed_kv_extent->range = n_fixed_kv_extent->get_node_meta(); | |
1022 | ||
1023 | if (fixed_kv_extent.get_type() == internal_node_t::TYPE || | |
1024 | leaf_node_t::do_has_children) { | |
1025 | if (!fixed_kv_extent.is_pending()) { | |
1026 | n_fixed_kv_extent->copy_sources.emplace(&fixed_kv_extent); | |
1027 | n_fixed_kv_extent->prior_instance = &fixed_kv_extent; | |
1028 | } else { | |
1029 | ceph_assert(fixed_kv_extent.is_mutation_pending()); | |
1030 | n_fixed_kv_extent->copy_sources.emplace( | |
1031 | (typename internal_node_t::base_t* | |
1032 | )fixed_kv_extent.get_prior_instance().get()); | |
1033 | n_fixed_kv_extent->children = std::move(fixed_kv_extent.children); | |
1034 | n_fixed_kv_extent->prior_instance = fixed_kv_extent.get_prior_instance(); | |
1035 | n_fixed_kv_extent->adjust_ptracker_for_children(); | |
1036 | } | |
1037 | } | |
1038 | ||
1039 | /* This is a bit underhanded. Any relative addrs here must necessarily | |
1040 | * be record relative as we are rewriting a dirty extent. Thus, we | |
1041 | * are using resolve_relative_addrs with a (likely negative) block | |
1042 | * relative offset to correct them to block-relative offsets adjusted | |
1043 | * for our new transaction location. | |
1044 | * | |
1045 | * Upon commit, these now block relative addresses will be interpretted | |
1046 | * against the real final address. | |
1047 | */ | |
1048 | if (!n_fixed_kv_extent->get_paddr().is_absolute()) { | |
1049 | // backend_type_t::SEGMENTED | |
1050 | assert(n_fixed_kv_extent->get_paddr().is_record_relative()); | |
1051 | n_fixed_kv_extent->resolve_relative_addrs( | |
1052 | make_record_relative_paddr(0).block_relative_to( | |
1053 | n_fixed_kv_extent->get_paddr())); | |
1054 | } // else: backend_type_t::RANDOM_BLOCK | |
1055 | ||
1056 | SUBTRACET( | |
1057 | seastore_fixedkv_tree, | |
1058 | "rewriting {} into {}", | |
1059 | c.trans, | |
1060 | fixed_kv_extent, | |
1061 | *n_fixed_kv_extent); | |
1062 | ||
1063 | return update_internal_mapping( | |
1064 | c, | |
1065 | n_fixed_kv_extent->get_node_meta().depth, | |
1066 | n_fixed_kv_extent->get_node_meta().begin, | |
1067 | e->get_paddr(), | |
1068 | n_fixed_kv_extent->get_paddr(), | |
1069 | n_fixed_kv_extent | |
1070 | ).si_then([c, e] { | |
1071 | c.cache.retire_extent(c.trans, e); | |
1072 | }); | |
1073 | }; | |
1074 | ||
1075 | CachedExtentRef n_fixed_kv_extent; | |
1076 | if (e->get_type() == internal_node_t::TYPE) { | |
1077 | auto lint = e->cast<internal_node_t>(); | |
1078 | return do_rewrite(*lint); | |
1079 | } else { | |
1080 | assert(e->get_type() == leaf_node_t::TYPE); | |
1081 | auto lleaf = e->cast<leaf_node_t>(); | |
1082 | return do_rewrite(*lleaf); | |
1083 | } | |
1084 | } | |
1085 | ||
1086 | using update_internal_mapping_iertr = base_iertr; | |
1087 | using update_internal_mapping_ret = update_internal_mapping_iertr::future<>; | |
1088 | update_internal_mapping_ret update_internal_mapping( | |
1089 | op_context_t<node_key_t> c, | |
1090 | depth_t depth, | |
1091 | node_key_t laddr, | |
1092 | paddr_t old_addr, | |
1093 | paddr_t new_addr, | |
1094 | typename internal_node_t::base_ref nextent) | |
1095 | { | |
1096 | LOG_PREFIX(FixedKVBtree::update_internal_mapping); | |
1097 | SUBTRACET( | |
1098 | seastore_fixedkv_tree, | |
1099 | "updating laddr {} at depth {} from {} to {}, nextent {}", | |
1100 | c.trans, | |
1101 | laddr, | |
1102 | depth, | |
1103 | old_addr, | |
1104 | new_addr, | |
1105 | *nextent); | |
1106 | ||
1107 | return lower_bound( | |
1108 | c, laddr, nullptr, depth + 1 | |
1109 | ).si_then([=, this](auto iter) { | |
1110 | assert(iter.get_depth() >= depth); | |
1111 | if (depth == iter.get_depth()) { | |
1112 | SUBTRACET(seastore_fixedkv_tree, "update at root", c.trans); | |
1113 | ||
1114 | if (laddr != min_max_t<node_key_t>::min) { | |
1115 | SUBERRORT( | |
1116 | seastore_fixedkv_tree, | |
1117 | "updating root laddr {} at depth {} from {} to {}," | |
1118 | "laddr is not 0", | |
1119 | c.trans, | |
1120 | laddr, | |
1121 | depth, | |
1122 | old_addr, | |
1123 | new_addr, | |
1124 | get_root().get_location()); | |
1125 | ceph_assert(0 == "impossible"); | |
1126 | } | |
1127 | ||
1128 | if (get_root().get_location() != old_addr) { | |
1129 | SUBERRORT( | |
1130 | seastore_fixedkv_tree, | |
1131 | "updating root laddr {} at depth {} from {} to {}," | |
1132 | "root addr {} does not match", | |
1133 | c.trans, | |
1134 | laddr, | |
1135 | depth, | |
1136 | old_addr, | |
1137 | new_addr, | |
1138 | get_root().get_location()); | |
1139 | ceph_assert(0 == "impossible"); | |
1140 | } | |
1141 | ||
1142 | root_block = c.cache.duplicate_for_write( | |
1143 | c.trans, root_block)->template cast<RootBlock>(); | |
1144 | get_root().set_location(new_addr); | |
1145 | set_root_node(nextent); | |
1146 | } else { | |
1147 | auto &parent = iter.get_internal(depth + 1); | |
1148 | assert(parent.node); | |
1149 | assert(parent.pos < parent.node->get_size()); | |
1150 | auto piter = parent.node->iter_idx(parent.pos); | |
1151 | ||
1152 | if (piter->get_key() != laddr) { | |
1153 | SUBERRORT( | |
1154 | seastore_fixedkv_tree, | |
1155 | "updating laddr {} at depth {} from {} to {}," | |
1156 | "node {} pos {} val pivot addr {} does not match", | |
1157 | c.trans, | |
1158 | laddr, | |
1159 | depth, | |
1160 | old_addr, | |
1161 | new_addr, | |
1162 | *(parent.node), | |
1163 | parent.pos, | |
1164 | piter->get_key()); | |
1165 | ceph_assert(0 == "impossible"); | |
1166 | } | |
1167 | ||
1168 | ||
1169 | if (piter->get_val() != old_addr) { | |
1170 | SUBERRORT( | |
1171 | seastore_fixedkv_tree, | |
1172 | "updating laddr {} at depth {} from {} to {}," | |
1173 | "node {} pos {} val addr {} does not match", | |
1174 | c.trans, | |
1175 | laddr, | |
1176 | depth, | |
1177 | old_addr, | |
1178 | new_addr, | |
1179 | *(parent.node), | |
1180 | parent.pos, | |
1181 | piter->get_val()); | |
1182 | ceph_assert(0 == "impossible"); | |
1183 | } | |
1184 | ||
1185 | CachedExtentRef mut = c.cache.duplicate_for_write( | |
1186 | c.trans, | |
1187 | parent.node | |
1188 | ); | |
1189 | typename internal_node_t::Ref mparent = mut->cast<internal_node_t>(); | |
1190 | mparent->update(piter, new_addr, nextent.get()); | |
1191 | ||
1192 | /* Note, iter is now invalid as we didn't udpate either the parent | |
1193 | * node reference to the new mutable instance nor did we update the | |
1194 | * child pointer to the new node. Not a problem as we'll now just | |
1195 | * destruct it. | |
1196 | */ | |
1197 | } | |
1198 | return seastar::now(); | |
1199 | }); | |
1200 | } | |
1201 | ||
1202 | ||
private:
  /// currently tracked root block; replaced with the mutable copy whenever
  /// the root is duplicated for write (see update_internal_mapping/handle_split)
  RootBlockRef root_block;

  // per-level position record type, taken from the iterator definition
  template <typename T>
  using node_position_t = typename iterator::template node_position_t<T>;
1208 | ||
  using get_internal_node_iertr = base_iertr;
  using get_internal_node_ret = get_internal_node_iertr::future<InternalNodeRef>;
  /**
   * get_internal_node
   *
   * Reads the internal node at offset covering [begin, end) at the given
   * depth.  If the extent is absent from the cache, init_internal fills in
   * its range metadata and links it to its parent (or to the root block
   * when parent_pos is nullopt).
   */
  static get_internal_node_ret get_internal_node(
    op_context_t<node_key_t> c,
    depth_t depth,
    paddr_t offset,
    node_key_t begin,
    node_key_t end,
    typename std::optional<node_position_t<internal_node_t>> parent_pos)
  {
    LOG_PREFIX(FixedKVBtree::get_internal_node);
    SUBTRACET(
      seastore_fixedkv_tree,
      "reading internal at offset {}, depth {}, begin {}, end {}",
      c.trans,
      offset,
      depth,
      begin,
      end);
    assert(depth > 1);
    // invoked for extents not yet initialized in the cache: set range
    // metadata and link the node into the in-memory tree
    auto init_internal = [c, depth, begin, end,
                          parent_pos=std::move(parent_pos)]
                         (internal_node_t &node) {
      assert(!node.is_pending());
      assert(!node.is_linked());
      node.range = fixed_kv_node_meta_t<node_key_t>{begin, end, depth};
      if (parent_pos) {
        auto &parent = parent_pos->node;
        parent->link_child(&node, parent_pos->pos);
      } else {
        // no parent: this is the root node -- link to the stable root block
        assert(node.range.is_root());
        auto root_block = c.cache.get_root_fast(c.trans);
        if (root_block->is_mutation_pending()) {
          auto &stable_root = (RootBlockRef&)*root_block->get_prior_instance();
          link_phy_tree_root_node(stable_root, &node);
        } else {
          assert(!root_block->is_pending());
          link_phy_tree_root_node(root_block, &node);
        }
      }
    };
    return c.cache.template get_absent_extent<internal_node_t>(
      c.trans,
      offset,
      node_size,
      init_internal
    ).si_then([FNAME, c, offset, init_internal, depth, begin, end](
              typename internal_node_t::Ref ret) {
      SUBTRACET(
        seastore_fixedkv_tree,
        "read internal at offset {} {}",
        c.trans,
        offset,
        *ret);
      // This can only happen during init_cached_extent
      // or when backref extent being rewritten by gc space reclaiming
      if (!ret->is_pending() && !ret->is_linked()) {
        assert(ret->is_dirty()
          || (is_backref_node(ret->get_type())
              && ret->is_clean()));
        init_internal(*ret);
      }
      // sanity: keys must fall inside the recorded range, and the recorded
      // metadata must match what the caller expected
      auto meta = ret->get_meta();
      if (ret->get_size()) {
        ceph_assert(meta.begin <= ret->begin()->get_key());
        ceph_assert(meta.end > (ret->end() - 1)->get_key());
      }
      ceph_assert(depth == meta.depth);
      ceph_assert(begin == meta.begin);
      ceph_assert(end == meta.end);
      return get_internal_node_ret(
        interruptible::ready_future_marker{},
        ret);
    });
  }
1284 | ||
1285 | ||
  using get_leaf_node_iertr = base_iertr;
  using get_leaf_node_ret = get_leaf_node_iertr::future<LeafNodeRef>;
  /**
   * get_leaf_node
   *
   * Reads the leaf node at offset covering [begin, end) (depth is always 1).
   * If the extent is absent from the cache, init_leaf fills in its range
   * metadata and links it to its parent (or to the root block when
   * parent_pos is nullopt).
   */
  static get_leaf_node_ret get_leaf_node(
    op_context_t<node_key_t> c,
    paddr_t offset,
    node_key_t begin,
    node_key_t end,
    typename std::optional<node_position_t<leaf_node_t>> parent_pos)
  {
    LOG_PREFIX(FixedKVBtree::get_leaf_node);
    SUBTRACET(
      seastore_fixedkv_tree,
      "reading leaf at offset {}, begin {}, end {}",
      c.trans,
      offset,
      begin,
      end);
    // invoked for extents not yet initialized in the cache: set range
    // metadata (leaf depth is 1) and link into the in-memory tree
    auto init_leaf = [c, begin, end,
                      parent_pos=std::move(parent_pos)]
                     (leaf_node_t &node) {
      assert(!node.is_pending());
      assert(!node.is_linked());
      node.range = fixed_kv_node_meta_t<node_key_t>{begin, end, 1};
      if (parent_pos) {
        auto &parent = parent_pos->node;
        parent->link_child(&node, parent_pos->pos);
      } else {
        // no parent: the tree is a single leaf -- link to the root block
        assert(node.range.is_root());
        auto root_block = c.cache.get_root_fast(c.trans);
        if (root_block->is_mutation_pending()) {
          auto &stable_root = (RootBlockRef&)*root_block->get_prior_instance();
          link_phy_tree_root_node(stable_root, &node);
        } else {
          assert(!root_block->is_pending());
          link_phy_tree_root_node(root_block, &node);
        }
      }
    };
    return c.cache.template get_absent_extent<leaf_node_t>(
      c.trans,
      offset,
      node_size,
      init_leaf
    ).si_then([FNAME, c, offset, init_leaf, begin, end]
              (typename leaf_node_t::Ref ret) {
      SUBTRACET(
        seastore_fixedkv_tree,
        "read leaf at offset {} {}",
        c.trans,
        offset,
        *ret);
      // This can only happen during init_cached_extent
      // or when backref extent being rewritten by gc space reclaiming
      if (!ret->is_pending() && !ret->is_linked()) {
        assert(ret->is_dirty()
          || (is_backref_node(ret->get_type())
              && ret->is_clean()));
        init_leaf(*ret);
      }
      // sanity: keys must fall inside the recorded range, and the recorded
      // metadata must match what the caller expected
      auto meta = ret->get_meta();
      if (ret->get_size()) {
        ceph_assert(meta.begin <= ret->begin()->get_key());
        ceph_assert(meta.end > (ret->end() - 1)->get_key());
      }
      ceph_assert(1 == meta.depth);
      ceph_assert(begin == meta.begin);
      ceph_assert(end == meta.end);
      return get_leaf_node_ret(
        interruptible::ready_future_marker{},
        ret);
    });
  }
1358 | ||
  using lookup_root_iertr = base_iertr;
  using lookup_root_ret = lookup_root_iertr::future<>;
  /**
   * lookup_root
   *
   * Populates the root entry of iter: the internal entry at root depth when
   * the tree has depth > 1, otherwise iter.leaf.  Uses the already-linked
   * in-memory root node when available; otherwise reads it via
   * get_internal_node/get_leaf_node over the full key range.
   */
  lookup_root_ret lookup_root(
    op_context_t<node_key_t> c,
    iterator &iter,
    mapped_space_visitor_t *visitor) const {
    LOG_PREFIX(FixedKVBtree::lookup_root);
    SUBTRACET(seastore_fixedkv_tree,
              "looking up root on {}",
              c.trans,
              *root_block);
    // found == true when a root node is already linked in memory
    auto [found, fut] = get_root_node(c);

    auto on_found_internal =
      [this, visitor, &iter](InternalNodeRef &root_node) {
      iter.get_internal(get_root().get_depth()).node = root_node;
      if (visitor) (*visitor)(
        root_node->get_paddr(),
        root_node->get_node_meta().begin,
        root_node->get_length(),
        get_root().get_depth(),
        internal_node_t::TYPE,
        iter);
      return lookup_root_iertr::now();
    };
    auto on_found_leaf =
      [visitor, &iter, this](LeafNodeRef root_node) {
      iter.leaf.node = root_node;
      if (visitor) (*visitor)(
        root_node->get_paddr(),
        root_node->get_node_meta().begin,
        root_node->get_length(),
        get_root().get_depth(),
        leaf_node_t::TYPE,
        iter);
      return lookup_root_iertr::now();
    };

    if (found) {
      return fut.then_interruptible(
        [this, c, on_found_internal=std::move(on_found_internal),
        on_found_leaf=std::move(on_found_leaf)](auto root) {
        LOG_PREFIX(FixedKVBtree::lookup_root);
        ceph_assert(root);
        SUBTRACET(seastore_fixedkv_tree,
                  "got root node on {}, res: {}",
                  c.trans,
                  *root_block,
                  *root);

        // depth > 1 means the root is an internal node, else a leaf
        if (get_root().get_depth() > 1) {
          auto root_node = root->template cast<internal_node_t>();
          return on_found_internal(root_node);
        } else {
          auto root_node = root->template cast<leaf_node_t>();
          return on_found_leaf(root_node);
        }
      });
    } else {
      // root not linked: read it; the root always covers [min, max)
      if (get_root().get_depth() > 1) {
        return get_internal_node(
          c,
          get_root().get_depth(),
          get_root().get_location(),
          min_max_t<node_key_t>::min,
          min_max_t<node_key_t>::max,
          std::nullopt
        ).si_then([on_found=std::move(on_found_internal)](InternalNodeRef root_node) {
          return on_found(root_node);
        });
      } else {
        return get_leaf_node(
          c,
          get_root().get_location(),
          min_max_t<node_key_t>::min,
          min_max_t<node_key_t>::max,
          std::nullopt
        ).si_then([on_found=std::move(on_found_leaf)](LeafNodeRef root_node) {
          return on_found(root_node);
        });
      }
    }
  }
1442 | ||
  using lookup_internal_level_iertr = base_iertr;
  using lookup_internal_level_ret = lookup_internal_level_iertr::future<>;
  /**
   * lookup_internal_level
   *
   * Descends one internal level: using the parent entry at depth + 1
   * already stored in iter, resolves the child node at the given depth
   * (from the in-memory child pointer when present, otherwise by reading
   * it), then positions iter at f(*node) within that child.
   *
   * @param f        node -> node iterator, selects the position in the child
   * @param visitor  optional mapped-space visitor invoked on the child
   */
  template <typename F>
  static lookup_internal_level_ret lookup_internal_level(
    op_context_t<node_key_t> c,
    depth_t depth,
    iterator &iter,
    F &f,
    mapped_space_visitor_t *visitor
  ) {
    assert(depth > 1);
    auto &parent_entry = iter.get_internal(depth + 1);
    auto parent = parent_entry.node;
    auto node_iter = parent->iter_idx(parent_entry.pos);

    // records the resolved child in iter and selects the position via f
    auto on_found = [depth, visitor, &iter, &f](InternalNodeRef node) {
      auto &entry = iter.get_internal(depth);
      entry.node = node;
      auto node_iter = f(*node);
      assert(node_iter != node->end());
      entry.pos = node_iter->get_offset();
      if (visitor)
        (*visitor)(
          node->get_paddr(),
          node->get_node_meta().begin,
          node->get_length(),
          depth,
          node->get_type(),
          iter);
      return seastar::now();
    };

    // fast path: parent already holds an in-memory pointer to the child
    auto v = parent->template get_child<internal_node_t>(c, node_iter);
    if (v.has_child()) {
      return v.get_child_fut().then(
        [on_found=std::move(on_found), node_iter, c,
        parent_entry](auto child) mutable {
        LOG_PREFIX(FixedKVBtree::lookup_internal_level);
        SUBTRACET(seastore_fixedkv_tree,
                  "got child on {}, pos: {}, res: {}",
                  c.trans,
                  *parent_entry.node,
                  parent_entry.pos,
                  *child);
        auto &cnode = (typename internal_node_t::base_t &)*child;
        assert(cnode.get_node_meta().begin == node_iter.get_key());
        assert(cnode.get_node_meta().end > node_iter.get_key());
        return on_found(child->template cast<internal_node_t>());
      });
    }

    // slow path: read the child; its range ends at the next pivot key,
    // or at the parent's end for the last slot
    auto child_pos = v.get_child_pos();
    auto next_iter = node_iter + 1;
    auto begin = node_iter->get_key();
    auto end = next_iter == parent->end()
      ? parent->get_node_meta().end
      : next_iter->get_key();
    return get_internal_node(
      c,
      depth,
      node_iter->get_val().maybe_relative_to(parent->get_paddr()),
      begin,
      end,
      std::make_optional<node_position_t<internal_node_t>>(
        child_pos.template get_parent<internal_node_t>(),
        child_pos.get_pos())
    ).si_then([on_found=std::move(on_found)](InternalNodeRef node) {
      return on_found(node);
    });
  }
1513 | ||
  using lookup_leaf_iertr = base_iertr;
  using lookup_leaf_ret = lookup_leaf_iertr::future<>;
  /**
   * lookup_leaf
   *
   * Descends from the depth-2 internal entry in iter to the leaf: resolves
   * the leaf node (in-memory child pointer when present, otherwise read),
   * then positions iter.leaf at f(*node).
   *
   * @param f        node -> node iterator, selects the position in the leaf
   * @param visitor  optional mapped-space visitor invoked on the leaf
   */
  template <typename F>
  static lookup_internal_level_ret lookup_leaf(
    op_context_t<node_key_t> c,
    iterator &iter,
    F &f,
    mapped_space_visitor_t *visitor
  ) {
    auto &parent_entry = iter.get_internal(2);
    auto parent = parent_entry.node;
    assert(parent);
    auto node_iter = parent->iter_idx(parent_entry.pos);

    // records the resolved leaf in iter and selects the position via f
    auto on_found = [visitor, &iter, &f](LeafNodeRef node) {
      iter.leaf.node = node;
      auto node_iter = f(*node);
      iter.leaf.pos = node_iter->get_offset();
      if (visitor)
        (*visitor)(
          node->get_paddr(),
          node->get_node_meta().begin,
          node->get_length(),
          1,
          node->get_type(),
          iter);
      return seastar::now();
    };

    // fast path: parent already holds an in-memory pointer to the leaf
    auto v = parent->template get_child<leaf_node_t>(c, node_iter);
    if (v.has_child()) {
      return v.get_child_fut().then(
        [on_found=std::move(on_found), node_iter, c,
        parent_entry](auto child) mutable {
        LOG_PREFIX(FixedKVBtree::lookup_leaf);
        SUBTRACET(seastore_fixedkv_tree,
                  "got child on {}, pos: {}, res: {}",
                  c.trans,
                  *parent_entry.node,
                  parent_entry.pos,
                  *child);
        auto &cnode = (typename internal_node_t::base_t &)*child;
        assert(cnode.get_node_meta().begin == node_iter.get_key());
        assert(cnode.get_node_meta().end > node_iter.get_key());
        return on_found(child->template cast<leaf_node_t>());
      });
    }

    // slow path: read the leaf; its range ends at the next pivot key,
    // or at the parent's end for the last slot
    auto child_pos = v.get_child_pos();
    auto next_iter = node_iter + 1;
    auto begin = node_iter->get_key();
    auto end = next_iter == parent->end()
      ? parent->get_node_meta().end
      : next_iter->get_key();

    return get_leaf_node(
      c,
      node_iter->get_val().maybe_relative_to(parent->get_paddr()),
      begin,
      end,
      std::make_optional<node_position_t<leaf_node_t>>(
        child_pos.template get_parent<leaf_node_t>(),
        child_pos.get_pos())
    ).si_then([on_found=std::move(on_found)](LeafNodeRef node) {
      return on_found(node);
    });
  }
1581 | ||
  /**
   * lookup_depth_range
   *
   * Performs node lookups on depths [from, to) using li and ll to
   * specific target at each level.  Note, may leave the iterator
   * at_boundary(), call handle_boundary() prior to returning out
   * of FixedKVBtree.
   */
  using lookup_depth_range_iertr = base_iertr;
  using lookup_depth_range_ret = lookup_depth_range_iertr::future<>;
  template <typename LI, typename LL>
  static lookup_depth_range_ret lookup_depth_range(
    op_context_t<node_key_t> c, ///< [in] context
    iterator &iter, ///< [in,out] iterator to populate
    depth_t from, ///< [in] from inclusive
    depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop)
    LI &li, ///< [in] internal->iterator
    LL &ll, ///< [in] leaf->iterator
    mapped_space_visitor_t *visitor ///< [in] mapped space visitor
  ) {
    LOG_PREFIX(FixedKVBtree::lookup_depth_range);
    SUBTRACET(seastore_fixedkv_tree, "{} -> {}", c.trans, from, to);
    // descend one level per repeat iteration, decrementing d until it
    // reaches to (exclusive)
    return seastar::do_with(
      from,
      [c, to, visitor, &iter, &li, &ll](auto &d) {
        return trans_intr::repeat(
          [c, to, visitor, &iter, &li, &ll, &d] {
            if (d > to) {
              return [&] {
                if (d > 1) {
                  // still above the leaf level: internal lookup
                  return lookup_internal_level(
                    c,
                    d,
                    iter,
                    li,
                    visitor);
                } else {
                  assert(d == 1);
                  // leaf level
                  return lookup_leaf(
                    c,
                    iter,
                    ll,
                    visitor);
                }
              }().si_then([&d] {
                --d;
                return lookup_depth_range_iertr::make_ready_future<
                  seastar::stop_iteration
                  >(seastar::stop_iteration::no);
              });
            } else {
              return lookup_depth_range_iertr::make_ready_future<
                seastar::stop_iteration
                >(seastar::stop_iteration::yes);
            }
          });
      });
  }
1640 | ||
  using lookup_iertr = base_iertr;
  using lookup_ret = lookup_iertr::future<iterator>;
  /**
   * lookup
   *
   * Full top-down lookup: resolves the root, positions the root entry via
   * lookup_internal/lookup_leaf, then descends to min_depth with
   * lookup_depth_range.  Handles a boundary-positioned result when
   * min_depth == 1.
   *
   * @param lookup_internal  internal node -> node iterator selector
   * @param lookup_leaf      leaf node -> node iterator selector
   * @param min_depth        lowest depth to descend to (> 0)
   * @param visitor          optional mapped-space visitor
   * @return iterator positioned per the selectors
   */
  template <typename LI, typename LL>
  lookup_ret lookup(
    op_context_t<node_key_t> c,
    LI &&lookup_internal,
    LL &&lookup_leaf,
    depth_t min_depth,
    mapped_space_visitor_t *visitor
  ) const {
    LOG_PREFIX(FixedKVBtree::lookup);
    assert(min_depth > 0);
    return seastar::do_with(
      iterator{get_root().get_depth()},
      std::forward<LI>(lookup_internal),
      std::forward<LL>(lookup_leaf),
      [FNAME, this, visitor, c, min_depth](auto &iter, auto &li, auto &ll) {
        return lookup_root(
          c, iter, visitor
        ).si_then([FNAME, this, visitor, c, &iter, &li, &ll, min_depth] {
          // select the position within the root node itself
          if (iter.get_depth() > 1) {
            auto &root_entry = *(iter.internal.rbegin());
            root_entry.pos = li(*(root_entry.node)).get_offset();
          } else {
            auto &root_entry = iter.leaf;
            auto riter = ll(*(root_entry.node));
            root_entry.pos = riter->get_offset();
          }
          SUBTRACET(seastore_fixedkv_tree, "got root, depth {}",
                    c.trans, get_root().get_depth());
          return lookup_depth_range(
            c,
            iter,
            get_root().get_depth() - 1,
            min_depth - 1,
            li,
            ll,
            visitor
          ).si_then([c, visitor, &iter, min_depth] {
            // It's only when the lookup is triggered by
            // update_internal_mapping() that min_depth is
            // NOT 1
            if (min_depth == 1 && iter.at_boundary()) {
              return iter.handle_boundary(c, visitor);
            } else {
              return lookup_iertr::now();
            }
          });
        }).si_then([&iter] {
          return std::move(iter);
        });
      });
  }
1694 | ||
1695 | /** | |
1696 | * find_insertion | |
1697 | * | |
1698 | * Prepare iter for insertion. iter should begin pointing at | |
1699 | * the valid insertion point (lower_bound(laddr)). | |
1700 | * | |
1701 | * Upon completion, iter will point at the | |
1702 | * position at which laddr should be inserted. iter may, upon completion, | |
1703 | * point at the end of a leaf other than the end leaf if that's the correct | |
1704 | * insertion point. | |
1705 | */ | |
1706 | using find_insertion_iertr = base_iertr; | |
1707 | using find_insertion_ret = find_insertion_iertr::future<>; | |
1708 | static find_insertion_ret find_insertion( | |
1709 | op_context_t<node_key_t> c, | |
1710 | node_key_t laddr, | |
1711 | iterator &iter) | |
1712 | { | |
1713 | assert(iter.is_end() || iter.get_key() >= laddr); | |
1714 | if (!iter.is_end() && iter.get_key() == laddr) { | |
1715 | return seastar::now(); | |
1716 | } else if (iter.leaf.node->get_node_meta().begin <= laddr) { | |
1717 | #ifndef NDEBUG | |
1718 | auto p = iter; | |
1719 | if (p.leaf.pos > 0) { | |
1720 | --p.leaf.pos; | |
1721 | assert(p.get_key() < laddr); | |
1722 | } | |
1723 | #endif | |
1724 | return seastar::now(); | |
1725 | } else { | |
1726 | assert(iter.leaf.pos == 0); | |
1727 | return iter.prev( | |
1728 | c | |
1729 | ).si_then([laddr, &iter](auto p) { | |
1730 | boost::ignore_unused(laddr); // avoid clang warning; | |
1731 | assert(p.leaf.node->get_node_meta().begin <= laddr); | |
1732 | assert(p.get_key() < laddr); | |
1733 | // Note, this is specifically allowed to violate the iterator | |
1734 | // invariant that pos is a valid index for the node in the event | |
1735 | // that the insertion point is at the end of a node. | |
1736 | p.leaf.pos++; | |
1737 | assert(p.at_boundary()); | |
1738 | iter = p; | |
1739 | return seastar::now(); | |
1740 | }); | |
1741 | } | |
1742 | } | |
1743 | ||
1744 | /** | |
1745 | * handle_split | |
1746 | * | |
1747 | * Split nodes in iter as needed for insertion. First, scan iter from leaf | |
1748 | * to find first non-full level. Then, split from there towards leaf. | |
1749 | * | |
1750 | * Upon completion, iter will point at the newly split insertion point. As | |
1751 | * with find_insertion, iter's leaf pointer may be end without iter being | |
1752 | * end. | |
1753 | */ | |
1754 | using handle_split_iertr = base_iertr; | |
1755 | using handle_split_ret = handle_split_iertr::future<>; | |
1756 | handle_split_ret handle_split( | |
1757 | op_context_t<node_key_t> c, | |
1758 | iterator &iter) | |
1759 | { | |
1760 | LOG_PREFIX(FixedKVBtree::handle_split); | |
1761 | ||
1762 | depth_t split_from = iter.check_split(); | |
1763 | ||
1764 | SUBTRACET(seastore_fixedkv_tree, "split_from {}, depth {}", c.trans, split_from, iter.get_depth()); | |
1765 | ||
1766 | if (split_from == iter.get_depth()) { | |
1767 | auto nroot = c.cache.template alloc_new_extent<internal_node_t>( | |
1768 | c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); | |
1769 | fixed_kv_node_meta_t<node_key_t> meta{ | |
1770 | min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, iter.get_depth() + 1}; | |
1771 | nroot->set_meta(meta); | |
1772 | nroot->range = meta; | |
1773 | nroot->journal_insert( | |
1774 | nroot->begin(), | |
1775 | min_max_t<node_key_t>::min, | |
1776 | get_root().get_location(), | |
1777 | nullptr); | |
1778 | iter.internal.push_back({nroot, 0}); | |
1779 | ||
1780 | get_tree_stats<self_type>(c.trans).depth = iter.get_depth(); | |
1781 | get_tree_stats<self_type>(c.trans).extents_num_delta++; | |
1782 | ||
1783 | root_block = c.cache.duplicate_for_write( | |
1784 | c.trans, root_block)->template cast<RootBlock>(); | |
1785 | get_root().set_location(nroot->get_paddr()); | |
1786 | get_root().set_depth(iter.get_depth()); | |
1787 | ceph_assert(get_root().get_depth() <= MAX_FIXEDKVBTREE_DEPTH); | |
1788 | set_root_node(nroot); | |
1789 | } | |
1790 | ||
1791 | /* pos may be either node_position_t<leaf_node_t> or | |
1792 | * node_position_t<internal_node_t> */ | |
1793 | auto split_level = [&](auto &parent_pos, auto &pos) { | |
1794 | LOG_PREFIX(FixedKVBtree::handle_split); | |
1795 | auto [left, right, pivot] = pos.node->make_split_children(c); | |
1796 | ||
1797 | auto parent_node = parent_pos.node; | |
1798 | auto parent_iter = parent_pos.get_iter(); | |
1799 | ||
1800 | parent_node->update( | |
1801 | parent_iter, | |
1802 | left->get_paddr(), | |
1803 | left.get()); | |
1804 | parent_node->insert( | |
1805 | parent_iter + 1, | |
1806 | pivot, | |
1807 | right->get_paddr(), | |
1808 | right.get()); | |
1809 | ||
1810 | SUBTRACET( | |
1811 | seastore_fixedkv_tree, | |
1812 | "splitted {} into left: {}, right: {}", | |
1813 | c.trans, | |
1814 | *pos.node, | |
1815 | *left, | |
1816 | *right); | |
1817 | c.cache.retire_extent(c.trans, pos.node); | |
1818 | ||
1819 | get_tree_stats<self_type>(c.trans).extents_num_delta++; | |
1820 | return std::make_pair(left, right); | |
1821 | }; | |
1822 | ||
1823 | for (; split_from > 0; --split_from) { | |
1824 | auto &parent_pos = iter.get_internal(split_from + 1); | |
1825 | if (!parent_pos.node->is_mutable()) { | |
1826 | parent_pos.node = c.cache.duplicate_for_write( | |
1827 | c.trans, parent_pos.node | |
1828 | )->template cast<internal_node_t>(); | |
1829 | } | |
1830 | ||
1831 | if (split_from > 1) { | |
1832 | auto &pos = iter.get_internal(split_from); | |
1833 | SUBTRACET( | |
1834 | seastore_fixedkv_tree, | |
1835 | "splitting internal {} at depth {}, parent: {} at pos: {}", | |
1836 | c.trans, | |
1837 | *pos.node, | |
1838 | split_from, | |
1839 | *parent_pos.node, | |
1840 | parent_pos.pos); | |
1841 | auto [left, right] = split_level(parent_pos, pos); | |
1842 | ||
1843 | if (pos.pos < left->get_size()) { | |
1844 | pos.node = left; | |
1845 | } else { | |
1846 | pos.node = right; | |
1847 | pos.pos -= left->get_size(); | |
1848 | ||
1849 | parent_pos.pos += 1; | |
1850 | } | |
1851 | } else { | |
1852 | auto &pos = iter.leaf; | |
1853 | SUBTRACET( | |
1854 | seastore_fixedkv_tree, | |
1855 | "splitting leaf {}, parent: {} at pos: {}", | |
1856 | c.trans, | |
1857 | *pos.node, | |
1858 | *parent_pos.node, | |
1859 | parent_pos.pos); | |
1860 | auto [left, right] = split_level(parent_pos, pos); | |
1861 | ||
1862 | /* right->get_node_meta().begin == pivot == right->begin()->get_key() | |
1863 | * Thus, if pos.pos == left->get_size(), we want iter to point to | |
1864 | * left with pos.pos at the end rather than right with pos.pos = 0 | |
1865 | * since the insertion would be to the left of the first element | |
1866 | * of right and thus necessarily less than right->get_node_meta().begin. | |
1867 | */ | |
1868 | if (pos.pos <= left->get_size()) { | |
1869 | pos.node = left; | |
1870 | } else { | |
1871 | pos.node = right; | |
1872 | pos.pos -= left->get_size(); | |
1873 | ||
1874 | parent_pos.pos += 1; | |
1875 | } | |
1876 | } | |
1877 | } | |
1878 | ||
1879 | return seastar::now(); | |
1880 | } | |
1881 | ||
1882 | ||
  using handle_merge_iertr = base_iertr;
  using handle_merge_ret = handle_merge_iertr::future<>;

  /**
   * handle_merge
   *
   * Rebalances the tree bottom-up after iter's leaf has dropped below
   * minimum capacity.  Starting at depth 1 (the leaf), each underfull
   * level is merged or rebalanced with a sibling via merge_level();
   * the walk continues upward while the next parent is itself below
   * minimum capacity.  If the walk reaches the root and the root holds
   * exactly one entry, the root is collapsed: the old root extent is
   * retired, the recorded depth shrinks by one, and the root block /
   * root-node pointers are redirected to the sole child.
   *
   * @param c    transaction + cache context
   * @param iter cursor positioned at the possibly-underfull leaf;
   *             its internal/leaf positions are updated in place
   */
  handle_merge_ret handle_merge(
    op_context_t<node_key_t> c,
    iterator &iter)
  {
    LOG_PREFIX(FixedKVBtree::handle_merge);
    // Nothing to do for a single-level tree, or if the leaf still
    // holds at least the minimum number of entries.
    if (iter.get_depth() == 1 ||
        !iter.leaf.node->below_min_capacity()) {
      SUBTRACET(
        seastore_fixedkv_tree,
        "no need to merge leaf, leaf size {}, depth {}",
        c.trans,
        iter.leaf.node->get_size(),
        iter.get_depth());
      return seastar::now();
    }

    return seastar::do_with(
      depth_t{1},                      // to_merge: depth currently being rebalanced
      [FNAME, this, c, &iter](auto &to_merge) {
        return trans_intr::repeat(
          [FNAME, this, c, &iter, &to_merge] {
            SUBTRACET(
              seastore_fixedkv_tree,
              "merging depth {}",
              c.trans,
              to_merge);
            auto &parent_pos = iter.get_internal(to_merge + 1);
            auto merge_fut = handle_merge_iertr::now();
            // depth 1 is the leaf level; anything deeper is internal
            if (to_merge > 1) {
              auto &pos = iter.get_internal(to_merge);
              merge_fut = merge_level(c, to_merge, parent_pos, pos);
            } else {
              auto &pos = iter.leaf;
              merge_fut = merge_level(c, to_merge, parent_pos, pos);
            }

            return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] {
              ++to_merge;
              auto &pos = iter.get_internal(to_merge);
              if (to_merge == iter.get_depth()) {
                if (pos.node->get_size() == 1) {
                  // The root has a single child: retire the root and
                  // promote that child, reducing the tree depth.
                  SUBTRACET(seastore_fixedkv_tree, "collapsing root", c.trans);
                  c.cache.retire_extent(c.trans, pos.node);
                  assert(pos.pos == 0);
                  auto node_iter = pos.get_iter();
                  iter.internal.pop_back();
                  get_tree_stats<self_type>(c.trans).depth = iter.get_depth();
                  get_tree_stats<self_type>(c.trans).extents_num_delta--;

                  // Shadow the root block for write before mutating
                  // the persisted root location/depth.
                  root_block = c.cache.duplicate_for_write(
                    c.trans, root_block
                  )->template cast<RootBlock>();
                  get_root().set_location(
                    node_iter->get_val().maybe_relative_to(pos.node->get_paddr()));
                  get_root().set_depth(iter.get_depth());
                  if (iter.get_depth() > 1) {
                    auto root_node = iter.get_internal(iter.get_depth()).node;
                    set_root_node(root_node);
                  } else {
                    set_root_node(iter.leaf.node);
                  }
                } else {
                  SUBTRACET(seastore_fixedkv_tree, "no need to collapse root", c.trans);
                }
                return seastar::stop_iteration::yes;
              } else if (pos.node->below_min_capacity()) {
                // Parent is now underfull too: keep merging upward.
                SUBTRACET(
                  seastore_fixedkv_tree,
                  "continuing, next node {} depth {} at min",
                  c.trans,
                  *pos.node,
                  to_merge);
                return seastar::stop_iteration::no;
              } else {
                SUBTRACET(
                  seastore_fixedkv_tree,
                  "complete, next node {} depth {} not min",
                  c.trans,
                  *pos.node,
                  to_merge);
                return seastar::stop_iteration::yes;
              }
            });
          });
      });
  }
1971 | ||
1972 | template <typename NodeType, | |
1973 | std::enable_if_t<std::is_same_v<NodeType, leaf_node_t>, int> = 0> | |
1974 | base_iertr::future<typename NodeType::Ref> get_node( | |
1975 | op_context_t<node_key_t> c, | |
1976 | depth_t depth, | |
1977 | paddr_t addr, | |
1978 | node_key_t begin, | |
1979 | node_key_t end, | |
1980 | typename std::optional<node_position_t<leaf_node_t>> parent_pos) { | |
1981 | assert(depth == 1); | |
1982 | return get_leaf_node(c, addr, begin, end, std::move(parent_pos)); | |
1983 | } | |
1984 | ||
1985 | template <typename NodeType, | |
1986 | std::enable_if_t<std::is_same_v<NodeType, internal_node_t>, int> = 0> | |
1987 | base_iertr::future<typename NodeType::Ref> get_node( | |
1988 | op_context_t<node_key_t> c, | |
1989 | depth_t depth, | |
1990 | paddr_t addr, | |
1991 | node_key_t begin, | |
1992 | node_key_t end, | |
1993 | typename std::optional<node_position_t<internal_node_t>> parent_pos) { | |
1994 | return get_internal_node(c, depth, addr, begin, end, std::move(parent_pos)); | |
1995 | } | |
1996 | ||
  /**
   * merge_level
   *
   * Merges or rebalances the underfull node at pos with an adjacent
   * sibling (the "donor") under parent_pos.  The donor is the left
   * neighbor when pos is the last entry of its parent, otherwise the
   * right neighbor.  If the donor is at minimum capacity, the two
   * nodes are fully merged into a single replacement (the parent
   * loses one entry); otherwise entries are redistributed across two
   * balanced replacements.  parent_pos and pos are updated in place
   * to keep tracking the same logical entry; replaced extents are
   * retired.  The donor is taken from the parent's cached child
   * pointer when present, otherwise read via get_node().
   *
   * @param c          transaction + cache context
   * @param depth      depth of pos.node (1 == leaf)
   * @param parent_pos position of pos.node's entry in its parent
   * @param pos        position within the underfull node
   */
  template <typename NodeType>
  handle_merge_ret merge_level(
    op_context_t<node_key_t> c,
    depth_t depth,
    node_position_t<internal_node_t> &parent_pos,
    node_position_t<NodeType> &pos)
  {
    LOG_PREFIX(FixedKVBtree::merge_level);
    // Shadow the parent for write before mutating its entries.
    if (!parent_pos.node->is_mutable()) {
      parent_pos.node = c.cache.duplicate_for_write(
        c.trans, parent_pos.node
      )->template cast<internal_node_t>();
    }

    auto iter = parent_pos.get_iter();
    assert(iter.get_offset() < parent_pos.node->get_size());
    // Use the left sibling only when pos is the parent's last entry.
    bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size());
    auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1);
    auto next_iter = donor_iter + 1;
    // [begin, end) key range covered by the donor, needed if it has
    // to be read from disk below.
    auto begin = donor_iter->get_key();
    auto end = next_iter == parent_pos.node->end()
      ? parent_pos.node->get_node_meta().end
      : next_iter->get_key();

    SUBTRACET(seastore_fixedkv_tree, "parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node);
    // Runs once the donor node is available (cached or read).
    auto do_merge = [c, iter, donor_iter, donor_is_left, &parent_pos, &pos](
      typename NodeType::Ref donor) {
      LOG_PREFIX(FixedKVBtree::merge_level);
      // Order the pair and the matching parent iterators left/right.
      auto [l, r] = donor_is_left ?
        std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor);

      auto [liter, riter] = donor_is_left ?
        std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);

      if (donor->at_min_capacity()) {
        // Donor can't spare entries: merge both nodes into one.
        auto replacement = l->make_full_merge(c, r);

        parent_pos.node->update(
          liter,
          replacement->get_paddr(),
          replacement.get());
        parent_pos.node->remove(riter);

        pos.node = replacement;
        if (donor_is_left) {
          // Entries from pos.node now sit after the donor's entries.
          pos.pos += r->get_size();
          parent_pos.pos--;
        }

        SUBTRACET(seastore_fixedkv_tree, "l: {}, r: {}, replacement: {}", c.trans, *l, *r, *replacement);
        c.cache.retire_extent(c.trans, l);
        c.cache.retire_extent(c.trans, r);
        get_tree_stats<self_type>(c.trans).extents_num_delta--;
      } else {
        // Donor has spare entries: rebalance across two replacements.
        LOG_PREFIX(FixedKVBtree::merge_level);
        auto [replacement_l, replacement_r, pivot] =
          l->make_balanced(
            c,
            r,
            !donor_is_left);

        parent_pos.node->update(
          liter,
          replacement_l->get_paddr(),
          replacement_l.get());
        parent_pos.node->replace(
          riter,
          pivot,
          replacement_r->get_paddr(),
          replacement_r.get());

        if (donor_is_left) {
          assert(parent_pos.pos > 0);
          parent_pos.pos--;
        }

        // Index of the tracked entry in the combined (l ++ r) order,
        // used to decide which replacement now holds it.
        auto orig_position = donor_is_left ?
          l->get_size() + pos.pos :
          pos.pos;
        if (orig_position < replacement_l->get_size()) {
          pos.node = replacement_l;
          pos.pos = orig_position;
        } else {
          parent_pos.pos++;
          pos.node = replacement_r;
          pos.pos = orig_position - replacement_l->get_size();
        }

        SUBTRACET(
          seastore_fixedkv_tree,
          "l: {}, r: {}, replacement_l: {}, replacement_r: {}",
          c.trans, *l, *r, *replacement_l, *replacement_r);
        c.cache.retire_extent(c.trans, l);
        c.cache.retire_extent(c.trans, r);
      }

      return seastar::now();
    };

    // Fast path: the parent already holds a pointer/future for the
    // donor child.
    auto v = parent_pos.node->template get_child<NodeType>(c, donor_iter);
    if (v.has_child()) {
      return v.get_child_fut().then(
        [do_merge=std::move(do_merge), &pos,
        donor_iter, donor_is_left, c, parent_pos](auto child) mutable {
        LOG_PREFIX(FixedKVBtree::merge_level);
        SUBTRACET(seastore_fixedkv_tree,
          "got child on {}, pos: {}, res: {}",
          c.trans,
          *parent_pos.node,
          donor_iter.get_offset(),
          *child);
        auto &node = (typename internal_node_t::base_t&)*child;
        // The cached donor must be the immediate neighbor of pos.node
        // and must match the key recorded in the parent.
        assert(donor_is_left ?
          node.get_node_meta().end == pos.node->get_node_meta().begin :
          node.get_node_meta().begin == pos.node->get_node_meta().end);
        assert(node.get_node_meta().begin == donor_iter.get_key());
        assert(node.get_node_meta().end > donor_iter.get_key());
        return do_merge(child->template cast<NodeType>());
      });
    }

    // Slow path: read the donor from its on-disk address.
    auto child_pos = v.get_child_pos();
    return get_node<NodeType>(
      c,
      depth,
      donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()),
      begin,
      end,
      std::make_optional<node_position_t<NodeType>>(
        child_pos.template get_parent<NodeType>(),
        child_pos.get_pos())
    ).si_then([do_merge=std::move(do_merge)](typename NodeType::Ref donor) {
      return do_merge(donor);
    });
  }
2132 | }; | |
2133 | ||
/// Trait: true iff T is an instantiation of FixedKVBtree.
/// Primary template -- any other type yields false.
template <typename T>
struct is_fixed_kv_tree : std::false_type {};
2136 | ||
/// Specialization matching any FixedKVBtree instantiation; used by
/// the with_btree* helpers below to constrain tree_type_t.
template <
  typename node_key_t,
  typename node_val_t,
  typename internal_node_t,
  typename leaf_node_t,
  typename pin_t,
  size_t node_size,
  bool leaf_has_children>
struct is_fixed_kv_tree<
  FixedKVBtree<
    node_key_t,
    node_val_t,
    internal_node_t,
    leaf_node_t,
    pin_t,
    node_size,
    leaf_has_children>> : std::true_type {};
2154 | ||
2155 | template < | |
2156 | typename tree_type_t, | |
2157 | typename node_key_t, | |
2158 | typename F, | |
2159 | std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0> | |
2160 | auto with_btree( | |
2161 | Cache &cache, | |
2162 | op_context_t<node_key_t> c, | |
2163 | F &&f) { | |
2164 | return cache.get_root( | |
2165 | c.trans | |
2166 | ).si_then([f=std::forward<F>(f)](RootBlockRef croot) mutable { | |
2167 | return seastar::do_with( | |
2168 | tree_type_t(croot), | |
2169 | [f=std::move(f)](auto &btree) mutable { | |
2170 | return f(btree); | |
2171 | }); | |
2172 | }); | |
2173 | } | |
2174 | ||
2175 | template < | |
2176 | typename tree_type_t, | |
2177 | typename State, | |
2178 | typename node_key_t, | |
2179 | typename F, | |
2180 | std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0> | |
2181 | auto with_btree_state( | |
2182 | Cache &cache, | |
2183 | op_context_t<node_key_t> c, | |
2184 | State &&init, | |
2185 | F &&f) { | |
2186 | return seastar::do_with( | |
2187 | std::forward<State>(init), | |
2188 | [&cache, c, f=std::forward<F>(f)](auto &state) mutable { | |
2189 | return with_btree<tree_type_t>( | |
2190 | cache, | |
2191 | c, | |
2192 | [&state, f=std::move(f)](auto &btree) mutable { | |
2193 | return f(btree, state); | |
2194 | }).si_then([&state] { | |
2195 | return seastar::make_ready_future<State>(std::move(state)); | |
2196 | }); | |
2197 | }); | |
2198 | } | |
2199 | ||
2200 | template < | |
2201 | typename tree_type_t, | |
2202 | typename State, | |
2203 | typename node_key_t, | |
2204 | typename F, | |
2205 | std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0> | |
2206 | auto with_btree_state( | |
2207 | Cache &cache, | |
2208 | op_context_t<node_key_t> c, | |
2209 | F &&f) { | |
2210 | return crimson::os::seastore::with_btree_state<tree_type_t, State>( | |
2211 | cache, c, State{}, std::forward<F>(f)); | |
2212 | } | |
2213 | ||
2214 | template < | |
2215 | typename tree_type_t, | |
2216 | typename Ret, | |
2217 | typename node_key_t, | |
2218 | typename F> | |
2219 | auto with_btree_ret( | |
2220 | Cache &cache, | |
2221 | op_context_t<node_key_t> c, | |
2222 | F &&f) { | |
2223 | return with_btree_state<tree_type_t, Ret>( | |
2224 | cache, | |
2225 | c, | |
2226 | [f=std::forward<F>(f)](auto &btree, auto &ret) mutable { | |
2227 | return f( | |
2228 | btree | |
2229 | ).si_then([&ret](auto &&_ret) { | |
2230 | ret = std::move(_ret); | |
2231 | }); | |
2232 | }); | |
2233 | } | |
2234 | ||
2235 | } | |
2236 |