// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "crimson/common/log.h"
#include "crimson/os/seastore/logging.h"

#include "crimson/os/seastore/lba_manager/btree/lba_btree.h"

SET_SUBSYS(seastore_lba);

namespace crimson::os::seastore::lba_manager::btree {
12 | ||
13 | LBABtree::mkfs_ret LBABtree::mkfs(op_context_t c) | |
14 | { | |
15 | auto root_leaf = c.cache.alloc_new_extent<LBALeafNode>( | |
16 | c.trans, | |
17 | LBA_BLOCK_SIZE); | |
18 | root_leaf->set_size(0); | |
19 | lba_node_meta_t meta{0, L_ADDR_MAX, 1}; | |
20 | root_leaf->set_meta(meta); | |
21 | root_leaf->pin.set_range(meta); | |
22 | c.trans.get_lba_tree_stats().depth = 1u; | |
23 | return lba_root_t{root_leaf->get_paddr(), 1u}; | |
24 | } | |
25 | ||
/**
 * handle_boundary
 *
 * Called when the iterator's leaf position has run off the end of its
 * leaf (at_boundary()).  Finds the shallowest ancestor level with a
 * sibling entry to the right, advances its position, and re-descends
 * to the leftmost leaf of that subtree.  If no ancestor has room, the
 * iterator becomes end().
 */
LBABtree::iterator::handle_boundary_ret LBABtree::iterator::handle_boundary(
  op_context_t c,
  mapped_space_visitor_t *visitor)
{
  assert(at_boundary());
  // Find the lowest internal level whose position can still advance.
  depth_t depth_with_space = 2;
  for (; depth_with_space <= get_depth(); ++depth_with_space) {
    if ((get_internal(depth_with_space).pos + 1) <
        get_internal(depth_with_space).node->get_size()) {
      break;
    }
  }

  if (depth_with_space <= get_depth()) {
    return seastar::do_with(
      // Descent callbacks: pick the first entry at every level so we
      // land on the leftmost leaf of the advanced subtree.
      [](const LBAInternalNode &internal) { return internal.begin(); },
      [](const LBALeafNode &leaf) { return leaf.begin(); },
      [this, c, depth_with_space, visitor](auto &li, auto &ll) {
        // Invalidate the levels the descent will repopulate.
        for (depth_t depth = 2; depth < depth_with_space; ++depth) {
          get_internal(depth).reset();
        }
        leaf.reset();
        get_internal(depth_with_space).pos++;
        // note, cannot result in at_boundary() by construction
        return lookup_depth_range(
          c, *this, depth_with_space - 1, 0, li, ll, visitor
        );
      });
  } else {
    // Every ancestor was at its last entry -- iterator is now end().
    return seastar::now();
  }
}
59 | ||
60 | LBABtree::iterator_fut LBABtree::iterator::next( | |
61 | op_context_t c, | |
62 | mapped_space_visitor_t *visitor) const | |
63 | { | |
64 | assert_valid(); | |
65 | assert(!is_end()); | |
66 | ||
67 | auto ret = *this; | |
68 | ret.leaf.pos++; | |
69 | if (ret.at_boundary()) { | |
70 | return seastar::do_with( | |
71 | ret, | |
72 | [c, visitor](auto &ret) mutable { | |
73 | return ret.handle_boundary( | |
74 | c, visitor | |
75 | ).si_then([&ret] { | |
76 | return std::move(ret); | |
77 | }); | |
78 | }); | |
79 | } else { | |
80 | return iterator_fut( | |
81 | interruptible::ready_future_marker{}, | |
82 | ret); | |
83 | } | |
84 | ||
85 | } | |
86 | ||
/**
 * prev
 *
 * Returns an iterator pointing at the entry immediately preceding
 * this one.  Precondition: !is_begin().
 */
LBABtree::iterator_fut LBABtree::iterator::prev(op_context_t c) const
{
  assert_valid();
  assert(!is_begin());

  auto ret = *this;

  if (ret.leaf.pos > 0) {
    // Fast path: the predecessor lives in the same leaf.
    ret.leaf.pos--;
    return iterator_fut(
      interruptible::ready_future_marker{},
      ret);
  }

  // Find the shallowest ancestor level whose position can move left.
  depth_t depth_with_space = 2;
  for (; depth_with_space <= get_depth(); ++depth_with_space) {
    if (ret.get_internal(depth_with_space).pos > 0) {
      break;
    }
  }

  assert(depth_with_space <= ret.get_depth()); // must not be begin()
  return seastar::do_with(
    std::move(ret),
    // Descent callbacks: pick the last entry at every level so we land
    // on the rightmost leaf of the retreated subtree.
    [](const LBAInternalNode &internal) { return --internal.end(); },
    [](const LBALeafNode &leaf) { return --leaf.end(); },
    [c, depth_with_space](auto &ret, auto &li, auto &ll) {
      // Invalidate the levels the descent will repopulate.
      for (depth_t depth = 2; depth < depth_with_space; ++depth) {
        ret.get_internal(depth).reset();
      }
      ret.leaf.reset();
      ret.get_internal(depth_with_space).pos--;
      // note, cannot result in at_boundary() by construction
      return lookup_depth_range(
        c, ret, depth_with_space - 1, 0, li, ll, nullptr
      ).si_then([&ret] {
        assert(!ret.at_boundary());
        return std::move(ret);
      });
    });
}
128 | ||
/**
 * lower_bound
 *
 * Returns an iterator pointing at the first entry with key >= addr,
 * or end() if no such entry exists.  Internal levels descend via the
 * predecessor of upper_bound(addr), i.e. the child whose range
 * contains addr.
 */
LBABtree::iterator_fut LBABtree::lower_bound(
  op_context_t c,
  laddr_t addr,
  mapped_space_visitor_t *visitor) const
{
  LOG_PREFIX(LBATree::lower_bound);
  return lookup(
    c,
    [addr](const LBAInternalNode &internal) {
      // Pick the child covering addr: last pivot <= addr.
      assert(internal.get_size() > 0);
      auto iter = internal.upper_bound(addr);
      assert(iter != internal.begin());
      --iter;
      return iter;
    },
    [FNAME, c, addr](const LBALeafNode &leaf) {
      auto ret = leaf.lower_bound(addr);
      DEBUGT(
        "leaf addr {}, got ret offset {}, size {}, end {}",
        c.trans,
        addr,
        ret.get_offset(),
        leaf.get_size(),
        ret == leaf.end());
      return ret;
    },
    visitor
  ).si_then([FNAME, c](auto &&ret) {
    DEBUGT(
      "ret.leaf.pos {}",
      c.trans,
      ret.leaf.pos);
    ret.assert_valid();
    return std::move(ret);
  });
}
165 | ||
/**
 * insert
 *
 * Inserts (laddr, val) into the tree.  iter must be a
 * lower_bound-style hint at or after laddr.  Returns a pair of the
 * iterator pointing at the entry and a bool which is false if the key
 * was already present (in which case nothing is inserted).
 */
LBABtree::insert_ret LBABtree::insert(
  op_context_t c,
  iterator iter,
  laddr_t laddr,
  lba_map_val_t val)
{
  LOG_PREFIX(LBATree::insert);
  DEBUGT(
    "inserting laddr {} at iter {}",
    c.trans,
    laddr,
    iter.is_end() ? L_ADDR_MAX : iter.get_key());
  return seastar::do_with(
    iter,
    [this, c, laddr, val](auto &ret) {
      // Adjust ret to denote the leaf position which should actually
      // receive laddr (may step back to the previous leaf).
      return find_insertion(
        c, laddr, ret
      ).si_then([this, c, laddr, val, &ret] {
        if (!ret.at_boundary() && ret.get_key() == laddr) {
          // Key already present; report without inserting.
          return insert_ret(
            interruptible::ready_future_marker{},
            std::make_pair(ret, false));
        } else {
          ++(c.trans.get_lba_tree_stats().num_inserts);
          // Split any full nodes along the path first so the leaf
          // insert below cannot overflow.
          return handle_split(
            c, ret
          ).si_then([c, laddr, val, &ret] {
            // Ensure the leaf is mutable in this transaction.
            if (!ret.leaf.node->is_pending()) {
              CachedExtentRef mut = c.cache.duplicate_for_write(
                c.trans, ret.leaf.node
              );
              ret.leaf.node = mut->cast<LBALeafNode>();
            }
            auto iter = LBALeafNode::const_iterator(
              ret.leaf.node.get(), ret.leaf.pos);
            assert(iter == ret.leaf.node->lower_bound(laddr));
            assert(iter == ret.leaf.node->end() || iter->get_key() > laddr);
            assert(laddr >= ret.leaf.node->get_meta().begin &&
                   laddr < ret.leaf.node->get_meta().end);
            ret.leaf.node->insert(iter, laddr, val);
            return insert_ret(
              interruptible::ready_future_marker{},
              std::make_pair(ret, true));
          });
        }
      });
    });
}
214 | ||
215 | LBABtree::update_ret LBABtree::update( | |
216 | op_context_t c, | |
217 | iterator iter, | |
218 | lba_map_val_t val) | |
219 | { | |
220 | LOG_PREFIX(LBATree::update); | |
221 | DEBUGT( | |
222 | "update element at {}", | |
223 | c.trans, | |
224 | iter.is_end() ? L_ADDR_MAX : iter.get_key()); | |
225 | if (!iter.leaf.node->is_pending()) { | |
226 | CachedExtentRef mut = c.cache.duplicate_for_write( | |
227 | c.trans, iter.leaf.node | |
228 | ); | |
229 | iter.leaf.node = mut->cast<LBALeafNode>(); | |
230 | } | |
231 | iter.leaf.node->update( | |
232 | iter.leaf.node->iter_idx(iter.leaf.pos), | |
233 | val); | |
234 | return update_ret( | |
235 | interruptible::ready_future_marker{}, | |
236 | iter); | |
237 | } | |
238 | ||
239 | LBABtree::remove_ret LBABtree::remove( | |
240 | op_context_t c, | |
241 | iterator iter) | |
242 | { | |
243 | LOG_PREFIX(LBATree::remove); | |
244 | DEBUGT( | |
245 | "remove element at {}", | |
246 | c.trans, | |
247 | iter.is_end() ? L_ADDR_MAX : iter.get_key()); | |
248 | assert(!iter.is_end()); | |
249 | ++(c.trans.get_lba_tree_stats().num_erases); | |
250 | return seastar::do_with( | |
251 | iter, | |
252 | [this, c](auto &ret) { | |
253 | if (!ret.leaf.node->is_pending()) { | |
254 | CachedExtentRef mut = c.cache.duplicate_for_write( | |
255 | c.trans, ret.leaf.node | |
256 | ); | |
257 | ret.leaf.node = mut->cast<LBALeafNode>(); | |
258 | } | |
259 | ret.leaf.node->remove( | |
260 | ret.leaf.node->iter_idx(ret.leaf.pos)); | |
261 | ||
262 | return handle_merge( | |
263 | c, ret | |
264 | ); | |
265 | }); | |
266 | } | |
267 | ||
/**
 * init_cached_extent
 *
 * Determines whether extent e is still referenced by the tree.  Live
 * logical extents get their pin initialized (and registered with
 * c.pins when present); dead extents are dropped from the cache and a
 * null ref is returned.  Non-LBA extent types pass through unchanged.
 */
LBABtree::init_cached_extent_ret LBABtree::init_cached_extent(
  op_context_t c,
  CachedExtentRef e)
{
  LOG_PREFIX(LBATree::init_cached_extent);
  DEBUGT("extent {}", c.trans, *e);
  if (e->is_logical()) {
    auto logn = e->cast<LogicalCachedExtent>();
    return lower_bound(
      c,
      logn->get_laddr()
    ).si_then([FNAME, e, c, logn](auto iter) {
      // Live iff the mapping at laddr still points at this paddr.
      if (!iter.is_end() &&
          iter.get_key() == logn->get_laddr() &&
          iter.get_val().paddr == logn->get_paddr()) {
        logn->set_pin(iter.get_pin());
        ceph_assert(iter.get_val().len == e->get_length());
        if (c.pins) {
          c.pins->add_pin(
            static_cast<BtreeLBAPin&>(logn->get_pin()).pin);
        }
        DEBUGT("logical extent {} live, initialized", c.trans, *logn);
        return e;
      } else {
        DEBUGT("logical extent {} not live, dropping", c.trans, *logn);
        c.cache.drop_from_cache(logn);
        return CachedExtentRef();
      }
    });
  } else if (e->get_type() == extent_types_t::LADDR_INTERNAL) {
    auto eint = e->cast<LBAInternalNode>();
    return lower_bound(
      c, eint->get_node_meta().begin
    ).si_then([FNAME, e, c, eint](auto iter) {
      // Note, this check is valid even if iter.is_end()
      // Live iff the lookup path at this node's depth holds this
      // exact extent.
      depth_t cand_depth = eint->get_node_meta().depth;
      if (cand_depth <= iter.get_depth() &&
          &*iter.get_internal(cand_depth).node == &*eint) {
        DEBUGT("extent {} is live", c.trans, *eint);
        return e;
      } else {
        DEBUGT("extent {} is not live", c.trans, *eint);
        c.cache.drop_from_cache(eint);
        return CachedExtentRef();
      }
    });
  } else if (e->get_type() == extent_types_t::LADDR_LEAF) {
    auto eleaf = e->cast<LBALeafNode>();
    return lower_bound(
      c, eleaf->get_node_meta().begin
    ).si_then([FNAME, c, e, eleaf](auto iter) {
      // Note, this check is valid even if iter.is_end()
      if (iter.leaf.node == &*eleaf) {
        DEBUGT("extent {} is live", c.trans, *eleaf);
        return e;
      } else {
        DEBUGT("extent {} is not live", c.trans, *eleaf);
        c.cache.drop_from_cache(eleaf);
        return CachedExtentRef();
      }
    });
  } else {
    // Not an LBA-tree-managed type; leave it untouched.
    DEBUGT(
      "found other extent {} type {}",
      c.trans,
      *e,
      e->get_type());
    return init_cached_extent_ret(
      interruptible::ready_future_marker{},
      e);
  }
}
340 | ||
341 | LBABtree::get_internal_if_live_ret | |
342 | LBABtree::get_internal_if_live( | |
343 | op_context_t c, | |
344 | paddr_t addr, | |
345 | laddr_t laddr, | |
346 | segment_off_t len) | |
347 | { | |
348 | LOG_PREFIX(BtreeLBAManager::get_leaf_if_live); | |
349 | return lower_bound( | |
350 | c, laddr | |
351 | ).si_then([FNAME, c, addr, laddr, len](auto iter) { | |
352 | for (depth_t d = 2; d <= iter.get_depth(); ++d) { | |
353 | CachedExtent &node = *iter.get_internal(d).node; | |
354 | auto internal_node = node.cast<LBAInternalNode>(); | |
355 | if (internal_node->get_paddr() == addr) { | |
356 | DEBUGT( | |
357 | "extent laddr {} addr {}~{} found: {}", | |
358 | c.trans, | |
359 | laddr, | |
360 | addr, | |
361 | len, | |
362 | *internal_node); | |
363 | assert(internal_node->get_node_meta().begin == laddr); | |
364 | return CachedExtentRef(internal_node); | |
365 | } | |
366 | } | |
367 | DEBUGT( | |
368 | "extent laddr {} addr {}~{} is not live, no matching internal node", | |
369 | c.trans, | |
370 | laddr, | |
371 | addr, | |
372 | len); | |
373 | return CachedExtentRef(); | |
374 | }); | |
375 | } | |
376 | ||
377 | LBABtree::get_leaf_if_live_ret | |
378 | LBABtree::get_leaf_if_live( | |
379 | op_context_t c, | |
380 | paddr_t addr, | |
381 | laddr_t laddr, | |
382 | segment_off_t len) | |
383 | { | |
384 | LOG_PREFIX(BtreeLBAManager::get_leaf_if_live); | |
385 | return lower_bound( | |
386 | c, laddr | |
387 | ).si_then([FNAME, c, addr, laddr, len](auto iter) { | |
388 | if (iter.leaf.node->get_paddr() == addr) { | |
389 | DEBUGT( | |
390 | "extent laddr {} addr {}~{} found: {}", | |
391 | c.trans, | |
392 | laddr, | |
393 | addr, | |
394 | len, | |
395 | *iter.leaf.node); | |
396 | return CachedExtentRef(iter.leaf.node); | |
397 | } else { | |
398 | DEBUGT( | |
399 | "extent laddr {} addr {}~{} is not live, does not match node {}", | |
400 | c.trans, | |
401 | laddr, | |
402 | addr, | |
403 | len, | |
404 | *iter.leaf.node); | |
405 | return CachedExtentRef(); | |
406 | } | |
407 | }); | |
408 | } | |
409 | ||
410 | ||
411 | LBABtree::rewrite_lba_extent_ret LBABtree::rewrite_lba_extent( | |
412 | op_context_t c, | |
413 | CachedExtentRef e) | |
414 | { | |
415 | LOG_PREFIX(LBABtree::rewrite_lba_extent); | |
416 | assert(e->get_type() == extent_types_t::LADDR_INTERNAL || | |
417 | e->get_type() == extent_types_t::LADDR_LEAF); | |
418 | ||
419 | auto do_rewrite = [&](auto &lba_extent) { | |
420 | auto nlba_extent = c.cache.alloc_new_extent< | |
421 | std::remove_reference_t<decltype(lba_extent)> | |
422 | >( | |
423 | c.trans, | |
424 | lba_extent.get_length()); | |
425 | lba_extent.get_bptr().copy_out( | |
426 | 0, | |
427 | lba_extent.get_length(), | |
428 | nlba_extent->get_bptr().c_str()); | |
429 | nlba_extent->pin.set_range(nlba_extent->get_node_meta()); | |
430 | ||
431 | /* This is a bit underhanded. Any relative addrs here must necessarily | |
432 | * be record relative as we are rewriting a dirty extent. Thus, we | |
433 | * are using resolve_relative_addrs with a (likely negative) block | |
434 | * relative offset to correct them to block-relative offsets adjusted | |
435 | * for our new transaction location. | |
436 | * | |
437 | * Upon commit, these now block relative addresses will be interpretted | |
438 | * against the real final address. | |
439 | */ | |
440 | nlba_extent->resolve_relative_addrs( | |
441 | make_record_relative_paddr(0) - nlba_extent->get_paddr()); | |
442 | ||
443 | DEBUGT( | |
444 | "rewriting {} into {}", | |
445 | c.trans, | |
446 | lba_extent, | |
447 | *nlba_extent); | |
448 | ||
449 | return update_internal_mapping( | |
450 | c, | |
451 | nlba_extent->get_node_meta().depth, | |
452 | nlba_extent->get_node_meta().begin, | |
453 | e->get_paddr(), | |
454 | nlba_extent->get_paddr() | |
455 | ).si_then([c, e] { | |
456 | c.cache.retire_extent(c.trans, e); | |
457 | }); | |
458 | }; | |
459 | ||
460 | CachedExtentRef nlba_extent; | |
461 | if (e->get_type() == extent_types_t::LADDR_INTERNAL) { | |
462 | auto lint = e->cast<LBAInternalNode>(); | |
463 | return do_rewrite(*lint); | |
464 | } else { | |
465 | assert(e->get_type() == extent_types_t::LADDR_LEAF); | |
466 | auto lleaf = e->cast<LBALeafNode>(); | |
467 | return do_rewrite(*lleaf); | |
468 | } | |
469 | } | |
470 | ||
/**
 * get_internal_node
 *
 * Reads the internal node at offset for the given depth and laddr
 * range [begin, end), initializing its pin on first (non-pending)
 * load and sanity-checking its metadata against the caller's view.
 */
LBABtree::get_internal_node_ret LBABtree::get_internal_node(
  op_context_t c,
  depth_t depth,
  paddr_t offset,
  laddr_t begin,
  laddr_t end)
{
  LOG_PREFIX(LBATree::get_internal_node);
  DEBUGT(
    "reading internal at offset {}, depth {}, begin {}, end {}",
    c.trans,
    offset,
    depth,
    begin,
    end);
  assert(depth > 1);
  // Runs when the cache loads the extent fresh: claim the pin range.
  auto init_internal = [c, depth, begin, end](LBAInternalNode &node) {
    assert(!node.is_pending());
    assert(!node.pin.is_linked());
    node.pin.set_range(lba_node_meta_t{begin, end, depth});
    if (c.pins) {
      c.pins->add_pin(node.pin);
    }
  };
  return c.cache.get_extent<LBAInternalNode>(
    c.trans,
    offset,
    LBA_BLOCK_SIZE,
    init_internal
  ).si_then([FNAME, c, offset, init_internal, depth, begin, end](
              LBAInternalNodeRef ret) {
    DEBUGT(
      "read internal at offset {} {}",
      c.trans,
      offset,
      *ret);
    // This can only happen during init_cached_extent
    if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) {
      assert(ret->is_dirty());
      init_internal(*ret);
    }
    // Node metadata must agree with the parent's view of this child.
    auto meta = ret->get_meta();
    if (ret->get_size()) {
      ceph_assert(meta.begin <= ret->begin()->get_key());
      ceph_assert(meta.end > (ret->end() - 1)->get_key());
    }
    ceph_assert(depth == meta.depth);
    ceph_assert(begin == meta.begin);
    ceph_assert(end == meta.end);
    return get_internal_node_ret(
      interruptible::ready_future_marker{},
      ret);
  });
}
525 | ||
/**
 * get_leaf_node
 *
 * Reads the leaf node at offset covering laddr range [begin, end),
 * initializing its pin on first (non-pending) load and
 * sanity-checking its metadata against the caller's view.
 */
LBABtree::get_leaf_node_ret LBABtree::get_leaf_node(
  op_context_t c,
  paddr_t offset,
  laddr_t begin,
  laddr_t end)
{
  LOG_PREFIX(LBATree::get_leaf_node);
  DEBUGT(
    "reading leaf at offset {}, begin {}, end {}",
    c.trans,
    offset,
    begin,
    end);
  // Runs when the cache loads the extent fresh: claim the pin range.
  auto init_leaf = [c, begin, end](LBALeafNode &node) {
    assert(!node.is_pending());
    assert(!node.pin.is_linked());
    node.pin.set_range(lba_node_meta_t{begin, end, 1});
    if (c.pins) {
      c.pins->add_pin(node.pin);
    }
  };
  return c.cache.get_extent<LBALeafNode>(
    c.trans,
    offset,
    LBA_BLOCK_SIZE,
    init_leaf
  ).si_then([FNAME, c, offset, init_leaf, begin, end](LBALeafNodeRef ret) {
    DEBUGT(
      "read leaf at offset {} {}",
      c.trans,
      offset,
      *ret);
    // This can only happen during init_cached_extent
    if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) {
      assert(ret->is_dirty());
      init_leaf(*ret);
    }
    // Node metadata must agree with the parent's view of this child.
    auto meta = ret->get_meta();
    if (ret->get_size()) {
      ceph_assert(meta.begin <= ret->begin()->get_key());
      ceph_assert(meta.end > (ret->end() - 1)->get_key());
    }
    ceph_assert(1 == meta.depth);
    ceph_assert(begin == meta.begin);
    ceph_assert(end == meta.end);
    return get_leaf_node_ret(
      interruptible::ready_future_marker{},
      ret);
  });
}
576 | ||
/**
 * find_insertion
 *
 * Adjusts iter (a lower_bound result for laddr) so that it denotes
 * the leaf position that should actually receive laddr.  If laddr
 * precedes the current leaf's range, steps back to the previous leaf,
 * leaving iter one past its last entry -- a deliberate at_boundary()
 * state the caller's insert path understands.
 */
LBABtree::find_insertion_ret LBABtree::find_insertion(
  op_context_t c,
  laddr_t laddr,
  iterator &iter)
{
  assert(iter.is_end() || iter.get_key() >= laddr);
  if (!iter.is_end() && iter.get_key() == laddr) {
    // Exact match; caller treats this as "already present".
    return seastar::now();
  } else if (iter.leaf.node->get_node_meta().begin <= laddr) {
    // laddr falls within the current leaf's range; iter is already
    // the right position.
#ifndef NDEBUG
    auto p = iter;
    if (p.leaf.pos > 0) {
      --p.leaf.pos;
      assert(p.get_key() < laddr);
    }
#endif
    return seastar::now();
  } else {
    // laddr belongs at the tail of the previous leaf.
    assert(iter.leaf.pos == 0);
    return iter.prev(
      c
    ).si_then([laddr, &iter](auto p) {
      assert(p.leaf.node->get_node_meta().begin <= laddr);
      assert(p.get_key() < laddr);
      // Note, this is specifically allowed to violate the iterator
      // invariant that pos is a valid index for the node in the event
      // that the insertion point is at the end of a node.
      p.leaf.pos++;
      assert(p.at_boundary());
      iter = p;
      return seastar::now();
    });
  }
}
611 | ||
/**
 * handle_split
 *
 * Walks iter's path top-down, splitting every node that cannot accept
 * an additional entry (per iter.check_split()), growing the root with
 * a new level first when the root itself must split.  iter is updated
 * throughout so it keeps denoting the same logical position.
 */
LBABtree::handle_split_ret LBABtree::handle_split(
  op_context_t c,
  iterator &iter)
{
  LOG_PREFIX(LBATree::handle_split);

  depth_t split_from = iter.check_split();

  DEBUGT("split_from {}, depth {}", c.trans, split_from, iter.get_depth());

  if (split_from == iter.get_depth()) {
    // The root itself must split: allocate a new root one level up
    // pointing at the old root, then fall through to the normal path.
    auto nroot = c.cache.alloc_new_extent<LBAInternalNode>(
      c.trans, LBA_BLOCK_SIZE);
    lba_node_meta_t meta{0, L_ADDR_MAX, iter.get_depth() + 1};
    nroot->set_meta(meta);
    nroot->pin.set_range(meta);
    nroot->journal_insert(
      nroot->begin(),
      L_ADDR_MIN,
      root.get_location(),
      nullptr);
    iter.internal.push_back({nroot, 0});

    root.set_location(nroot->get_paddr());
    root.set_depth(iter.get_depth());
    c.trans.get_lba_tree_stats().depth = iter.get_depth();
    root_dirty = true;
  }

  /* pos may be either node_position_t<LBALeafNode> or
   * node_position_t<LBAInternalNode> */
  auto split_level = [&](auto &parent_pos, auto &pos) {
    LOG_PREFIX(LBATree::handle_split);
    auto [left, right, pivot] = pos.node->make_split_children(c);

    auto parent_node = parent_pos.node;
    auto parent_iter = parent_pos.get_iter();

    // In the parent: the old child entry now points at left; insert
    // right immediately after with the split pivot key.
    parent_node->update(
      parent_iter,
      left->get_paddr());
    parent_node->insert(
      parent_iter + 1,
      pivot,
      right->get_paddr());

    DEBUGT("splitted {} into left: {}, right: {}",
           c.trans,
           *pos.node,
           *left,
           *right);
    c.cache.retire_extent(c.trans, pos.node);

    return std::make_pair(left, right);
  };

  // Split top-down so each parent is mutable before its child splits.
  for (; split_from > 0; --split_from) {
    auto &parent_pos = iter.get_internal(split_from + 1);
    if (!parent_pos.node->is_pending()) {
      parent_pos.node = c.cache.duplicate_for_write(
        c.trans, parent_pos.node
      )->cast<LBAInternalNode>();
    }

    if (split_from > 1) {
      auto &pos = iter.get_internal(split_from);
      DEBUGT("splitting internal {} at depth {}, parent: {} at pos: {}",
             c.trans,
             *pos.node,
             split_from,
             *parent_pos.node,
             parent_pos.pos);
      auto [left, right] = split_level(parent_pos, pos);

      // Repoint iter at whichever child now holds its position.
      if (pos.pos < left->get_size()) {
        pos.node = left;
      } else {
        pos.node = right;
        pos.pos -= left->get_size();

        parent_pos.pos += 1;
      }
    } else {
      auto &pos = iter.leaf;
      DEBUGT("splitting leaf {}, parent: {} at pos: {}",
             c.trans,
             *pos.node,
             *parent_pos.node,
             parent_pos.pos);
      auto [left, right] = split_level(parent_pos, pos);

      /* right->get_node_meta().begin == pivot == right->begin()->get_key()
       * Thus, if pos.pos == left->get_size(), we want iter to point to
       * left with pos.pos at the end rather than right with pos.pos = 0
       * since the insertion would be to the left of the first element
       * of right and thus necessarily less than right->get_node_meta().begin.
       */
      if (pos.pos <= left->get_size()) {
        pos.node = left;
      } else {
        pos.node = right;
        pos.pos -= left->get_size();

        parent_pos.pos += 1;
      }
    }
  }

  return seastar::now();
}
722 | ||
/**
 * get_node
 *
 * Type-dispatched node read used by merge_level: the LBALeafNode
 * specialization loads a leaf (depth must be 1), the LBAInternalNode
 * specialization loads an internal node at the given depth.
 */
template <typename NodeType>
LBABtree::base_iertr::future<typename NodeType::Ref> get_node(
  op_context_t c,
  depth_t depth,
  paddr_t addr,
  laddr_t begin,
  laddr_t end);

template <>
LBABtree::base_iertr::future<LBALeafNodeRef> get_node<LBALeafNode>(
  op_context_t c,
  depth_t depth,
  paddr_t addr,
  laddr_t begin,
  laddr_t end) {
  // Leaves only exist at depth 1; depth is asserted, not forwarded.
  assert(depth == 1);
  return LBABtree::get_leaf_node(c, addr, begin, end);
}

template <>
LBABtree::base_iertr::future<LBAInternalNodeRef> get_node<LBAInternalNode>(
  op_context_t c,
  depth_t depth,
  paddr_t addr,
  laddr_t begin,
  laddr_t end) {
  return LBABtree::get_internal_node(c, depth, addr, begin, end);
}
751 | ||
/**
 * merge_level
 *
 * Fixes up pos.node (assumed below minimum capacity) at the given
 * depth by either fully merging with, or rebalancing against, an
 * adjacent sibling (the "donor").  parent_pos and pos are updated so
 * the iterator continues to reference the same logical entry.
 */
template <typename NodeType>
LBABtree::handle_merge_ret merge_level(
  op_context_t c,
  depth_t depth,
  LBABtree::node_position_t<LBAInternalNode> &parent_pos,
  LBABtree::node_position_t<NodeType> &pos)
{
  LOG_PREFIX(LBABtree::merge_level);
  if (!parent_pos.node->is_pending()) {
    parent_pos.node = c.cache.duplicate_for_write(
      c.trans, parent_pos.node
    )->cast<LBAInternalNode>();
  }

  // Donate from the right sibling unless pos is the parent's last
  // child, in which case donate from the left.
  auto iter = parent_pos.get_iter();
  assert(iter.get_offset() < parent_pos.node->get_size());
  bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size());
  auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1);
  auto next_iter = donor_iter + 1;
  // Donor's laddr range: from its pivot to the next pivot (or the
  // parent's range end if it is the last child).
  auto begin = donor_iter->get_key();
  auto end = next_iter == parent_pos.node->end()
    ? parent_pos.node->get_node_meta().end
    : next_iter->get_key();

  DEBUGT("parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node);
  return get_node<NodeType>(
    c,
    depth,
    donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()),
    begin,
    end
  ).si_then([c, iter, donor_iter, donor_is_left, &parent_pos, &pos](
              typename NodeType::Ref donor) {
    LOG_PREFIX(LBABtree::merge_level);
    // Normalize to (left, right) order regardless of donor side.
    auto [l, r] = donor_is_left ?
      std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor);

    auto [liter, riter] = donor_is_left ?
      std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter);

    if (donor->at_min_capacity()) {
      // Donor has nothing to spare: merge both nodes into one.
      auto replacement = l->make_full_merge(c, r);

      parent_pos.node->update(
        liter,
        replacement->get_paddr());
      parent_pos.node->remove(riter);

      pos.node = replacement;
      if (donor_is_left) {
        // Entries from the left donor now precede ours.
        pos.pos += r->get_size();
        parent_pos.pos--;
      }

      DEBUGT("l: {}, r: {}, replacement: {}", c.trans, *l, *r, *replacement);
      c.cache.retire_extent(c.trans, l);
      c.cache.retire_extent(c.trans, r);
    } else {
      // Donor can spare entries: rebalance across two new nodes.
      LOG_PREFIX(LBABtree::merge_level);
      auto [replacement_l, replacement_r, pivot] =
        l->make_balanced(
          c,
          r,
          !donor_is_left);

      parent_pos.node->update(
        liter,
        replacement_l->get_paddr());
      parent_pos.node->replace(
        riter,
        pivot,
        replacement_r->get_paddr());

      if (donor_is_left) {
        assert(parent_pos.pos > 0);
        parent_pos.pos--;
      }

      // Work out which replacement now holds our logical position.
      auto orig_position = donor_is_left ?
        l->get_size() + pos.pos :
        pos.pos;
      if (orig_position < replacement_l->get_size()) {
        pos.node = replacement_l;
        pos.pos = orig_position;
      } else {
        parent_pos.pos++;
        pos.node = replacement_r;
        pos.pos = orig_position - replacement_l->get_size();
      }

      DEBUGT("l: {}, r: {}, replacement_l: {}, replacement_r: {}",
             c.trans, *l, *r, *replacement_l, *replacement_r);
      c.cache.retire_extent(c.trans, l);
      c.cache.retire_extent(c.trans, r);
    }

    return seastar::now();
  });
}
851 | ||
/**
 * handle_merge
 *
 * After an erase, walks up from the leaf merging/rebalancing each
 * node found below minimum capacity, and collapses the root when it
 * is left with a single child.
 */
LBABtree::handle_merge_ret LBABtree::handle_merge(
  op_context_t c,
  iterator &iter)
{
  LOG_PREFIX(LBATree::handle_merge);
  if (iter.get_depth() == 1 ||
      !iter.leaf.node->below_min_capacity()) {
    // Single-level tree, or the leaf is still healthy: nothing to do.
    DEBUGT(
      "no need to merge leaf, leaf size {}, depth {}",
      c.trans,
      iter.leaf.node->get_size(),
      iter.get_depth());
    return seastar::now();
  }

  return seastar::do_with(
    depth_t{1},
    [FNAME, this, c, &iter](auto &to_merge) {
      return trans_intr::repeat(
        [FNAME, this, c, &iter, &to_merge] {
          DEBUGT(
            "merging depth {}",
            c.trans,
            to_merge);
          auto &parent_pos = iter.get_internal(to_merge + 1);
          auto merge_fut = handle_merge_iertr::now();
          if (to_merge > 1) {
            auto &pos = iter.get_internal(to_merge);
            merge_fut = merge_level(c, to_merge, parent_pos, pos);
          } else {
            auto &pos = iter.leaf;
            merge_fut = merge_level(c, to_merge, parent_pos, pos);
          }

          return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] {
            // Move up one level and decide whether to continue.
            ++to_merge;
            auto &pos = iter.get_internal(to_merge);
            if (to_merge == iter.get_depth()) {
              if (pos.node->get_size() == 1) {
                // Root has a single child: make that child the root.
                DEBUGT("collapsing root", c.trans);
                c.cache.retire_extent(c.trans, pos.node);
                assert(pos.pos == 0);
                auto node_iter = pos.get_iter();
                root.set_location(
                  node_iter->get_val().maybe_relative_to(pos.node->get_paddr()));
                iter.internal.pop_back();
                root.set_depth(iter.get_depth());
                c.trans.get_lba_tree_stats().depth = iter.get_depth();
                root_dirty = true;
              } else {
                DEBUGT("no need to collapse root", c.trans);
              }
              return seastar::stop_iteration::yes;
            } else if (pos.node->below_min_capacity()) {
              DEBUGT(
                "continuing, next node {} depth {} at min",
                c.trans,
                *pos.node,
                to_merge);
              return seastar::stop_iteration::no;
            } else {
              DEBUGT(
                "complete, next node {} depth {} not min",
                c.trans,
                *pos.node,
                to_merge);
              return seastar::stop_iteration::yes;
            }
          });
        });
    });
}
924 | ||
/**
 * update_internal_mapping
 *
 * Repoints the parent mapping for the btree node at the given depth
 * whose range begins at laddr from old_addr to new_addr.  When depth
 * equals the tree depth the root pointer itself is updated.  Used
 * when rewriting nodes (see rewrite_lba_extent).
 */
LBABtree::update_internal_mapping_ret LBABtree::update_internal_mapping(
  op_context_t c,
  depth_t depth,
  laddr_t laddr,
  paddr_t old_addr,
  paddr_t new_addr)
{
  LOG_PREFIX(LBATree::update_internal_mapping);
  DEBUGT(
    "updating laddr {} at depth {} from {} to {}",
    c.trans,
    laddr,
    depth,
    old_addr,
    new_addr);

  return lower_bound(
    c, laddr
  ).si_then([=](auto iter) {
    assert(iter.get_depth() >= depth);
    if (depth == iter.get_depth()) {
      // The node being remapped is the root itself.
      DEBUGT("update at root", c.trans);

      if (laddr != 0) {
        ERRORT(
          "updating root laddr {} at depth {} from {} to {},"
          "laddr is not 0",
          c.trans,
          laddr,
          depth,
          old_addr,
          new_addr,
          root.get_location());
        ceph_assert(0 == "impossible");
      }

      if (root.get_location() != old_addr) {
        ERRORT(
          "updating root laddr {} at depth {} from {} to {},"
          "root addr {} does not match",
          c.trans,
          laddr,
          depth,
          old_addr,
          new_addr,
          root.get_location());
        ceph_assert(0 == "impossible");
      }

      root.set_location(new_addr);
      root_dirty = true;
    } else {
      // Update the child entry in the parent node one level up.
      auto &parent = iter.get_internal(depth + 1);
      assert(parent.node);
      assert(parent.pos < parent.node->get_size());
      auto piter = parent.node->iter_idx(parent.pos);

      if (piter->get_key() != laddr) {
        ERRORT(
          "updating laddr {} at depth {} from {} to {},"
          "node {} pos {} val pivot addr {} does not match",
          c.trans,
          laddr,
          depth,
          old_addr,
          new_addr,
          *(parent.node),
          parent.pos,
          piter->get_key());
        ceph_assert(0 == "impossible");
      }


      if (piter->get_val() != old_addr) {
        ERRORT(
          "updating laddr {} at depth {} from {} to {},"
          "node {} pos {} val addr {} does not match",
          c.trans,
          laddr,
          depth,
          old_addr,
          new_addr,
          *(parent.node),
          parent.pos,
          piter->get_val());
        ceph_assert(0 == "impossible");
      }

      CachedExtentRef mut = c.cache.duplicate_for_write(
        c.trans,
        parent.node
      );
      LBAInternalNodeRef mparent = mut->cast<LBAInternalNode>();
      mparent->update(piter, new_addr);

      /* Note, iter is now invalid as we didn't update either the parent
       * node reference to the new mutable instance nor did we update the
       * child pointer to the new node.  Not a problem as we'll now just
       * destruct it.
       */
    }
    return seastar::now();
  });
}
} // namespace crimson::os::seastore::lba_manager::btree