// NOTE: this file was recovered from a git-blame table export (commit f67539c2);
// the export's table markup has been removed.
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include <ostream> | |
7 | #include <sstream> | |
8 | ||
9 | #include "common/likely.h" | |
10 | #include "crimson/common/log.h" | |
11 | #include "node_extent_accessor.h" | |
12 | #include "node_impl.h" | |
13 | #include "stages/node_stage_layout.h" | |
14 | ||
15 | namespace crimson::os::seastore::onode { | |
16 | ||
17 | template <node_type_t NODE_TYPE> struct insert_key_type; | |
18 | template <> struct insert_key_type<node_type_t::INTERNAL> { | |
19 | static constexpr auto type = KeyT::VIEW; }; | |
20 | template <> struct insert_key_type<node_type_t::LEAF> { | |
21 | static constexpr auto type = KeyT::HOBJ; }; | |
22 | ||
23 | template <node_type_t NODE_TYPE> struct node_impl_type; | |
24 | template <> struct node_impl_type<node_type_t::INTERNAL> { | |
25 | using type = InternalNodeImpl; }; | |
26 | template <> struct node_impl_type<node_type_t::LEAF> { | |
27 | using type = LeafNodeImpl; }; | |
28 | ||
29 | template <node_type_t NODE_TYPE> struct node_marker_type; | |
30 | template <> struct node_marker_type<node_type_t::INTERNAL> { | |
31 | using type = InternalNodeImpl::internal_marker_t; }; | |
32 | template <> struct node_marker_type<node_type_t::LEAF> { | |
33 | using type = LeafNodeImpl::leaf_marker_t; }; | |
34 | ||
/**
 * NodeLayoutT
 *
 * Contains templated and concrete implementations for both InternalNodeImpl
 * and LeafNodeImpl under a specific node layout (FieldType) and node type
 * (NODE_TYPE). On-disk access is delegated to the wrapped extent accessor;
 * key/value layout logic is delegated to STAGE_T.
 */
template <typename FieldType, node_type_t NODE_TYPE>
class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl {
 public:
  using URef = std::unique_ptr<NodeLayoutT>;
  using extent_t = NodeExtentAccessorT<FieldType, NODE_TYPE>;
  using parent_t = typename node_impl_type<NODE_TYPE>::type;
  using marker_t = typename node_marker_type<NODE_TYPE>::type;
  using node_stage_t = typename extent_t::node_stage_t;
  using position_t = typename extent_t::position_t;
  using value_t = typename extent_t::value_t;
  static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE;
  static constexpr auto KEY_TYPE = insert_key_type<NODE_TYPE>::type;
  static constexpr auto STAGE = STAGE_T::STAGE;

  // Non-copyable and non-movable; instances are always owned through URef.
  NodeLayoutT(const NodeLayoutT&) = delete;
  NodeLayoutT(NodeLayoutT&&) = delete;
  NodeLayoutT& operator=(const NodeLayoutT&) = delete;
  NodeLayoutT& operator=(NodeLayoutT&&) = delete;
  ~NodeLayoutT() override = default;

  /// Wrap an already-loaded extent; asserts (debug builds) that the
  /// persisted level-tail flag matches the caller's expectation.
  static URef load(NodeExtentRef extent, bool expect_is_level_tail) {
    std::unique_ptr<NodeLayoutT> ret(new NodeLayoutT(extent));
    assert(ret->is_level_tail() == expect_is_level_tail);
    return ret;
  }

  using alloc_ertr = NodeExtentManager::tm_ertr;
  /// Allocate a fresh extent and bootstrap it into an empty node of this
  /// layout; returns the impl together with its mutable view.
  static alloc_ertr::future<typename parent_t::fresh_impl_t> allocate(
      context_t c, bool is_level_tail, level_t level) {
    // NOTE: Currently, all the node types have the same size for simplicity.
    // But depending on the requirement, we may need to make node size
    // configurable by field_type_t and node_type_t, or totally flexible.
    return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE
    ).safe_then([is_level_tail, level](auto extent) {
      assert(extent->is_initial_pending());
      auto mut = extent->get_mutable();
      node_stage_t::bootstrap_extent(
          mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level);
      return typename parent_t::fresh_impl_t{
          std::unique_ptr<parent_t>(new NodeLayoutT(extent)), mut};
    });
  }

 protected:
  /*
   * NodeImpl
   */
  field_type_t field_type() const override { return FIELD_TYPE; }
  laddr_t laddr() const override { return extent.get_laddr(); }
  void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); }
  bool is_level_tail() const override { return extent.read().is_level_tail(); }
  bool is_empty() const override { return extent.read().keys() == 0; }
  level_t level() const override { return extent.read().level(); }
  node_offset_t free_size() const override { return extent.read().free_size(); }

  // Read the full key view stored at the given search position.
  key_view_t get_key_view(const search_position_t& position) const override {
    key_view_t ret;
    STAGE_T::get_key_view(extent.read(), cast_down<STAGE>(position), ret);
    return ret;
  }

  // Key view of the right-most slot in this node.
  key_view_t get_largest_key_view() const override {
    key_view_t index_key;
    STAGE_T::template lookup_largest_slot<false, true, false>(
        extent.read(), nullptr, &index_key, nullptr);
    return index_key;
  }

  // Advance pos to the next slot; pos becomes end() once the last slot
  // has been passed.
  void next_position(search_position_t& pos) const override {
    assert(!pos.is_end());
    bool find_next = STAGE_T::next_position(extent.read(), cast_down<STAGE>(pos));
    if (find_next) {
      pos = search_position_t::end();
    }
  }

  // Aggregate per-node statistics (kv count, logical/filled/persistent sizes).
  node_stats_t get_stats() const override {
    node_stats_t stats;
    auto& node_stage = extent.read();
    key_view_t index_key;
    if (node_stage.keys()) {
      STAGE_T::get_stats(node_stage, stats, index_key);
    }
    stats.size_persistent = node_stage_t::EXTENT_SIZE;
    stats.size_filled = filled_size();
    if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
      if (is_level_tail()) {
        // account for the tail value, which is not one of the indexed slots
        // (it is read via get_end_p_laddr() rather than by position)
        stats.size_logical += sizeof(value_t);
        stats.size_value += sizeof(value_t);
        stats.num_kvs += 1;
      }
    }
    return stats;
  }

  // Verbose dump: brief header, stats, then every staged entry; also
  // cross-checks (debug builds) the accumulated size against filled_size().
  std::ostream& dump(std::ostream& os) const override {
    auto& node_stage = extent.read();
    auto p_start = node_stage.p_start();
    dump_brief(os);
    auto stats = get_stats();
    os << " num_kvs=" << stats.num_kvs
       << ", logical=" << stats.size_logical
       << "B, overhead=" << stats.size_overhead
       << "B, value=" << stats.size_value << "B";
    os << ":\n header: " << node_stage_t::header_size() << "B";
    size_t size = 0u;
    if (node_stage.keys()) {
      STAGE_T::dump(node_stage, os, " ", size, p_start);
    } else {
      size += node_stage_t::header_size();
      if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) {
        os << " empty!";
      }
    }
    if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
      if (node_stage.is_level_tail()) {
        // the tail child address lives past the last indexed slot
        size += sizeof(laddr_t);
        auto value_ptr = node_stage.get_end_p_laddr();
        int offset = reinterpret_cast<const char*>(value_ptr) - p_start;
        os << "\n tail value: 0x"
           << std::hex << value_ptr->value << std::dec
           << " " << size << "B"
           << " @" << offset << "B";
      }
    }
    assert(size == filled_size());
    return os;
  }

  // One-line summary: node type, laddr, level and space usage
  // ('$' marks a level-tail node).
  std::ostream& dump_brief(std::ostream& os) const override {
    auto& node_stage = extent.read();
    os << "Node" << NODE_TYPE << FIELD_TYPE
       << "@0x" << std::hex << extent.get_laddr()
       << "+" << node_stage_t::EXTENT_SIZE << std::dec
       << (node_stage.is_level_tail() ? "$" : "")
       << "(level=" << (unsigned)node_stage.level()
       << ", filled=" << filled_size() << "B"
       << ", free=" << node_stage.free_size() << "B"
       << ")";
    return os;
  }

  // Structural validation of the staged layout; debug builds only.
  void validate_layout() const override {
#ifndef NDEBUG
    STAGE_T::validate(extent.read());
#endif
  }

  // Test hook: copy this node's content into another mutable extent.
  void test_copy_to(NodeExtentMutable& to) const override {
    extent.test_copy_to(to);
  }

  // Test hook: force the level-tail flag on.
  void test_set_tail(NodeExtentMutable& mut) override {
    node_stage_t::update_is_level_tail(mut, extent.read(), true);
  }

  /*
   * Common
   */
  // Resolve the value pointer at a position. For internal nodes, end()
  // resolves to the tail child address; index_key (leaf-only here, asserted)
  // optionally receives the key at that position.
  const value_t* get_p_value(const search_position_t& position,
                             key_view_t* index_key=nullptr, marker_t={}) const override {
    auto& node_stage = extent.read();
    if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
      assert(!index_key);
      if (position.is_end()) {
        assert(is_level_tail());
        return node_stage.get_end_p_laddr();
      }
    } else {
      assert(!position.is_end());
    }
    if (index_key) {
      return STAGE_T::template get_p_value<true>(
          node_stage, cast_down<STAGE>(position), index_key);
    } else {
      return STAGE_T::get_p_value(node_stage, cast_down<STAGE>(position));
    }
  }

  // Stage-aware lower-bound lookup of key within this node; records the
  // comparison history, optionally returns the matched index key, and
  // normalizes an end() result to the tail child for internal nodes.
  lookup_result_t<NODE_TYPE> lower_bound(
      const key_hobj_t& key, MatchHistory& history,
      key_view_t* index_key=nullptr, marker_t={}) const override {
    auto& node_stage = extent.read();
    if constexpr (NODE_TYPE == node_type_t::LEAF) {
      // an empty leaf can only answer "everything is greater than key"
      if (unlikely(node_stage.keys() == 0)) {
        history.set<STAGE_LEFT>(MatchKindCMP::LT);
        return lookup_result_t<NODE_TYPE>::end();
      }
    }

    typename STAGE_T::result_t result_raw;
    if (index_key) {
      result_raw = STAGE_T::template lower_bound<true>(
          node_stage, key, history, index_key);
#ifndef NDEBUG
      // the returned index_key must agree with the key stored at the
      // located position
      if (!result_raw.is_end()) {
        full_key_t<KeyT::VIEW> index;
        STAGE_T::get_key_view(node_stage, result_raw.position, index);
        assert(index == *index_key);
      }
#endif
    } else {
      result_raw = STAGE_T::lower_bound(node_stage, key, history);
    }
#ifndef NDEBUG
    if (result_raw.is_end()) {
      assert(result_raw.mstat == MSTAT_END);
    } else {
      full_key_t<KeyT::VIEW> index;
      STAGE_T::get_key_view(node_stage, result_raw.position, index);
      assert_mstat(key, index, result_raw.mstat);
    }
#endif

    // calculate MSTAT_LT3
    if constexpr (FIELD_TYPE == field_type_t::N0) {
      // currently only internal node checks mstat
      if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
        if (result_raw.mstat == MSTAT_LT2) {
          auto cmp = compare_to<KeyT::HOBJ>(
              key, node_stage[result_raw.position.index].shard_pool);
          assert(cmp != MatchKindCMP::GT);
          if (cmp != MatchKindCMP::EQ) {
            result_raw.mstat = MSTAT_LT3;
          }
        }
      }
    }

    auto result = normalize(std::move(result_raw));
    if (result.is_end()) {
      assert(node_stage.is_level_tail());
      assert(result.p_value == nullptr);
      if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
        // end() in an internal node points at the tail child address
        result.p_value = node_stage.get_end_p_laddr();
      }
    } else {
      assert(result.p_value != nullptr);
    }
    return result;
  }

  // Insert (key, value) at insert_pos/insert_stage with the pre-computed
  // insert_size; the mutation is recorded replayably through the extent.
  const value_t* insert(
      const full_key_t<KEY_TYPE>& key, const value_t& value,
      search_position_t& insert_pos, match_stage_t& insert_stage,
      node_offset_t& insert_size) override {
    logger().debug("OTree::Layout::Insert: begin at "
                   "insert_pos({}), insert_stage={}, insert_size={}B ...",
                   insert_pos, insert_stage, insert_size);
    if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
      std::ostringstream sos;
      dump(sos);
      logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str());
    }
    auto ret = extent.template insert_replayable<KEY_TYPE>(
        key, value, cast_down<STAGE>(insert_pos), insert_stage, insert_size);
    logger().debug("OTree::Layout::Insert: done at "
                   "insert_pos({}), insert_stage={}, insert_size={}B",
                   insert_pos, insert_stage, insert_size);
    if (unlikely(logger().is_enabled(seastar::log_level::trace))) {
      std::ostringstream sos;
      dump(sos);
      logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str());
    }
    validate_layout();
    assert(get_key_view(insert_pos) == key);
    return ret;
  }

  // Split this node into right_impl at a computed split position, then
  // insert (key, value) into whichever side the position falls in.
  // Returns {normalized split position, is_insert_left, value pointer}.
  std::tuple<search_position_t, bool, const value_t*> split_insert(
      NodeExtentMutable& right_mut, NodeImpl& right_impl,
      const full_key_t<KEY_TYPE>& key, const value_t& value,
      search_position_t& _insert_pos, match_stage_t& insert_stage,
      node_offset_t& insert_size) override {
    logger().info("OTree::Layout::Split: begin at "
                  "insert_pos({}), insert_stage={}, insert_size={}B, "
                  "{:#x}=>{:#x} ...",
                  _insert_pos, insert_stage, insert_size,
                  laddr(), right_impl.laddr());
    if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
      std::ostringstream sos;
      dump(sos);
      logger().debug("OTree::Layout::Split: -- dump\n{}", sos.str());
    }
#ifdef UNIT_TESTS_BUILT
    auto insert_stage_pre = insert_stage;
#endif

    auto& insert_pos = cast_down<STAGE>(_insert_pos);
    auto& node_stage = extent.read();
    typename STAGE_T::StagedIterator split_at;
    bool is_insert_left;
    size_t split_size;
    size_t target_split_size;
    {
      size_t empty_size = node_stage.size_before(0);
      size_t filled_kv_size = filled_size() - empty_size;
      /** NODE_BLOCK_SIZE considerations
       *
       * Generally,
       * target_split_size = (filled_size + insert_size) / 2
       * We can have two locate_split() strategies:
       * A. the simpler one is to locate the largest split position where
       *    the estimated left_node_size <= target_split_size;
       * B. the fair one takes a further step to calculate the next slot of
       *    P KiB, and if left_node_size + P/2 < target_split_size, compensate
       *    the split position to include the next slot; (TODO)
       *
       * Say that the node_block_size = N KiB, the largest allowed
       * insert_size = 1/I * N KiB (I > 1). We want to identify the minimal 'I'
       * that won't lead to "double split" effect, meaning after a split,
       * the right node size is still larger than N KiB and need to split
       * again. I think "double split" makes split much more complicated and
       * we can no longer identify whether the node is safe under concurrent
       * operations.
       *
       * We need to evaluate the worst case in order to identify 'I'. This means:
       * - filled_size ~= N KiB
       * - insert_size == N/I KiB
       * - target_split_size ~= (I+1)/2I * N KiB
       * To simplify the below calculations, node_block_size is normalized to 1.
       *
       * With strategy A, the worst case is when left_node_size cannot include
       * the next slot that will just overflow the target_split_size:
       * - left_node_size + 1/I ~= (I+1)/2I
       * - left_node_size ~= (I-1)/2I
       * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I
       * The right_node_size cannot larger than the node_block_size in the
       * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest
       * possible insert_size must be smaller than 1/3 of the node_block_size.
       *
       * With strategy B, the worst case is when left_node_size cannot include
       * the next slot that will just overflow the threshold
       * target_split_size - 1/2I, thus:
       * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2
       * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1)
       * - I > 2
       * This means the largest possible insert_size must be smaller than 1/2 of
       * the node_block_size, which is better than strategy A.
       *
       * In order to avoid "double split", there is another side-effect we need
       * to take into consideration: if split happens with snap-gen indexes, the
       * according ns-oid string needs to be copied to the right node. That is
       * to say: right_node_size + string_size < node_block_size.
       *
       * Say that the largest allowed string size is 1/S of the largest allowed
       * insert_size N/I KiB. If we go with strategy B, the equation should be
       * changed to:
       * - right_node_size ~= (I+2)/2I + 1/(I*S) < 1
       * - I > 2 + 2/S (S > 1)
       *
       * Now back to NODE_BLOCK_SIZE calculation, if we have limits of at most
       * X KiB ns-oid string and Y KiB of onode_t to store in this BTree, then:
       * - largest_insert_size ~= X+Y KiB
       * - 1/S == X/(X+Y)
       * - I > (4X+2Y)/(X+Y)
       * - node_block_size(N) == I * insert_size > 4X+2Y KiB
       *
       * In conclusion,
       * (TODO) the current node block size (4 KiB) is too small to
       * store entire 2 KiB ns-oid string. We need to consider a larger
       * node_block_size.
       *
       * We are setting X = Y = 640 B in order not to break the current
       * implementations with 4KiB node.
       *
       * (TODO) Implement smarter logics to check when "double split" happens.
       */
      target_split_size = empty_size + (filled_kv_size + insert_size) / 2;
      assert(insert_size < (node_stage.total_size() - empty_size) / 2);

      std::optional<bool> _is_insert_left;
      split_at.set(node_stage);
      split_size = 0;
      bool locate_nxt = STAGE_T::recursively_locate_split_inserted(
          split_size, 0, target_split_size, insert_pos,
          insert_stage, insert_size, _is_insert_left, split_at);
      is_insert_left = *_is_insert_left;
      logger().debug("OTree::Layout::Split: -- located "
                     "split_at({}), insert_pos({}), is_insert_left={}, "
                     "split_size={}B(target={}B, current={}B)",
                     split_at, insert_pos, is_insert_left,
                     split_size, target_split_size, filled_size());
      // split_size can be larger than target_split_size in strategy B
      // assert(split_size <= target_split_size);
      if (locate_nxt) {
        assert(insert_stage == STAGE);
        assert(split_at.get().is_last());
        split_at.set_end();
        assert(insert_pos.index == split_at.index());
      }
    }

    auto append_at = split_at;
    // TODO(cross-node string dedup)
    typename STAGE_T::template StagedAppender<KEY_TYPE> right_appender;
    right_appender.init(&right_mut, right_mut.get_write());
    const value_t* p_value = nullptr;
    if (!is_insert_left) {
      // right node: append [start(append_at), insert_pos)
      STAGE_T::template append_until<KEY_TYPE>(
          append_at, right_appender, insert_pos, insert_stage);
      logger().debug("OTree::Layout::Split: -- right appended until "
                     "insert_pos({}), insert_stage={}, insert/append the rest ...",
                     insert_pos, insert_stage);
      // right node: append [insert_pos(key, value)]
      bool is_front_insert = (insert_pos == position_t::begin());
      [[maybe_unused]] bool is_end = STAGE_T::template append_insert<KEY_TYPE>(
          key, value, append_at, right_appender,
          is_front_insert, insert_stage, p_value);
      assert(append_at.is_end() == is_end);
    } else {
      logger().debug("OTree::Layout::Split: -- right appending ...");
    }

    // right node: append (insert_pos, end)
    auto pos_end = position_t::end();
    STAGE_T::template append_until<KEY_TYPE>(
        append_at, right_appender, pos_end, STAGE);
    assert(append_at.is_end());
    right_appender.wrap();
    if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
      std::ostringstream sos;
      right_impl.dump(sos);
      logger().debug("OTree::Layout::Split: -- right node dump\n{}", sos.str());
    }
    right_impl.validate_layout();

    // mutate left node
    if (is_insert_left) {
      logger().debug("OTree::Layout::Split: -- left trim/insert at "
                     "insert_pos({}), insert_stage={} ...",
                     insert_pos, insert_stage);
      p_value = extent.template split_insert_replayable<KEY_TYPE>(
          split_at, key, value, insert_pos, insert_stage, insert_size);
      assert(get_key_view(_insert_pos) == key);
    } else {
      logger().debug("OTree::Layout::Split: -- left trim ...");
      assert(right_impl.get_key_view(_insert_pos) == key);
      extent.split_replayable(split_at);
    }
    if (unlikely(logger().is_enabled(seastar::log_level::debug))) {
      std::ostringstream sos;
      dump(sos);
      logger().debug("OTree::Layout::Split: -- left node dump\n{}", sos.str());
    }
    validate_layout();
    assert(p_value);

    auto split_pos = normalize(split_at.get_pos());
    logger().info("OTree::Layout::Split: done at "
                  "insert_pos({}), insert_stage={}, insert_size={}B, split_at({}), "
                  "is_insert_left={}, split_size={}B(target={}B)",
                  _insert_pos, insert_stage, insert_size, split_pos,
                  is_insert_left, split_size, target_split_size);
    assert(split_size == filled_size());

#ifdef UNIT_TESTS_BUILT
    // classify this split/insert (BEGIN/LAST/MID of the resulting node)
    // for test observation via last_split
    InsertType insert_type;
    search_position_t last_pos;
    if (is_insert_left) {
      STAGE_T::template lookup_largest_slot<true, false, false>(
          extent.read(), &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
    } else {
      node_stage_t right_stage{reinterpret_cast<FieldType*>(right_mut.get_write())};
      STAGE_T::template lookup_largest_slot<true, false, false>(
          right_stage, &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr);
    }
    if (_insert_pos == search_position_t::begin()) {
      insert_type = InsertType::BEGIN;
    } else if (_insert_pos == last_pos) {
      insert_type = InsertType::LAST;
    } else {
      insert_type = InsertType::MID;
    }
    last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type};
#endif
    return {split_pos, is_insert_left, p_value};
  }

  /*
   * InternalNodeImpl
   */
  // Replace the child address src with dst at pos; INTERNAL-only
  // (aborts on a leaf).
  void replace_child_addr(
      const search_position_t& pos, laddr_t dst, laddr_t src) override {
    if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
      const laddr_packed_t* p_value = get_p_value(pos);
      assert(p_value->value == src);
      extent.update_child_addr_replayable(dst, const_cast<laddr_packed_t*>(p_value));
    } else {
      ceph_abort("impossible path");
    }
  }

  // Compute the stage and byte size an internal-node insert would take;
  // INTERNAL-only (aborts on a leaf).
  std::tuple<match_stage_t, node_offset_t> evaluate_insert(
      const key_view_t& key, const laddr_t& value,
      search_position_t& insert_pos) const override {
    if constexpr (NODE_TYPE == node_type_t::INTERNAL) {
      auto packed_value = laddr_packed_t{value};
      auto& node_stage = extent.read();
      match_stage_t insert_stage;
      node_offset_t insert_size;
      if (unlikely(!node_stage.keys())) {
        // empty node: insert becomes the sole, top-stage entry
        assert(insert_pos.is_end());
        insert_stage = STAGE;
        insert_size = STAGE_T::template insert_size<KeyT::VIEW>(key, packed_value);
      } else {
        std::tie(insert_stage, insert_size) = STAGE_T::evaluate_insert(
            node_stage, key, packed_value, cast_down<STAGE>(insert_pos), false);
      }
      return {insert_stage, insert_size};
    } else {
      ceph_abort("impossible path");
    }
  }

  /*
   * LeafNodeImpl
   */
  // Locate the right-most slot: its position, key view and value pointer;
  // LEAF-only (aborts on an internal node).
  void get_largest_slot(search_position_t& pos,
                        key_view_t& index_key, const onode_t** pp_value) const override {
    if constexpr (NODE_TYPE == node_type_t::LEAF) {
      STAGE_T::template lookup_largest_slot<true, true, true>(
          extent.read(), &cast_down_fill_0<STAGE>(pos), &index_key, pp_value);
    } else {
      ceph_abort("impossible path");
    }
  }

  // Compute the stage and byte size a leaf-node insert would take;
  // LEAF-only (aborts on an internal node).
  std::tuple<match_stage_t, node_offset_t> evaluate_insert(
      const key_hobj_t& key, const onode_t& value,
      const MatchHistory& history, match_stat_t mstat,
      search_position_t& insert_pos) const override {
    if constexpr (NODE_TYPE == node_type_t::LEAF) {
      if (unlikely(is_empty())) {
        assert(insert_pos.is_end());
        return {STAGE, STAGE_T::template insert_size<KeyT::HOBJ>(key, value)};
      } else {
        return STAGE_T::evaluate_insert(
            key, value, history, mstat, cast_down<STAGE>(insert_pos));
      }
    } else {
      ceph_abort("impossible path");
    }
  }

 private:
  NodeLayoutT(NodeExtentRef extent) : extent{extent} {}

  // Bytes currently occupied (header + all slots); cross-checked (debug
  // builds) against total_size - free_size.
  node_offset_t filled_size() const {
    auto& node_stage = extent.read();
    auto ret = node_stage.size_before(node_stage.keys());
    assert(ret == node_stage.total_size() - node_stage.free_size());
    return ret;
  }

  static seastar::logger& logger() {
    return crimson::get_logger(ceph_subsys_filestore);
  }

  extent_t extent;
};
603 | ||
// Concrete NodeLayoutT instantiations: one per supported field type for
// each node type.
using InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>;
using InternalNode1 = NodeLayoutT<node_fields_1_t, node_type_t::INTERNAL>;
using InternalNode2 = NodeLayoutT<node_fields_2_t, node_type_t::INTERNAL>;
using InternalNode3 = NodeLayoutT<internal_fields_3_t, node_type_t::INTERNAL>;
using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>;
using LeafNode1 = NodeLayoutT<node_fields_1_t, node_type_t::LEAF>;
using LeafNode2 = NodeLayoutT<node_fields_2_t, node_type_t::LEAF>;
using LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>;
612 | ||
613 | } |