1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "crimson/os/seastore/cache.h"
5
6 #include <sstream>
7 #include <string_view>
8
9 #include <seastar/core/metrics.hh>
10
11 #include "crimson/os/seastore/logging.h"
12 #include "crimson/common/config_proxy.h"
13 #include "crimson/os/seastore/async_cleaner.h"
14
15 // included for get_extent_by_type
16 #include "crimson/os/seastore/collection_manager/collection_flat_node.h"
17 #include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
18 #include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
19 #include "crimson/os/seastore/object_data_handler.h"
20 #include "crimson/os/seastore/collection_manager/collection_flat_node.h"
21 #include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h"
22 #include "crimson/os/seastore/backref/backref_tree_node.h"
23 #include "test/crimson/seastore/test_block.h"
24
25 using std::string_view;
26
27 SET_SUBSYS(seastore_cache);
28
29 namespace crimson::os::seastore {
30
31 std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) {
32 return out << "backref_entry_t{"
33 << ent.paddr << "~" << ent.len << ", "
34 << "laddr: " << ent.laddr << ", "
35 << "type: " << ent.type << ", "
36 << "seq: " << ent.seq << ", "
37 << "}";
38 }
39
40 Cache::Cache(
41 ExtentPlacementManager &epm)
42 : epm(epm),
43 lru(crimson::common::get_conf<Option::size_t>(
44 "seastore_cache_lru_size"))
45 {
46 LOG_PREFIX(Cache::Cache);
47 INFO("created, lru_size={}", lru.get_capacity());
48 register_metrics();
49 segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr);
50 }
51
52 Cache::~Cache()
53 {
54 LOG_PREFIX(Cache::~Cache);
55 for (auto &i: extents) {
56 ERROR("extent is still alive -- {}", i);
57 }
58 ceph_assert(extents.empty());
59 }
60
61 Cache::retire_extent_ret Cache::retire_extent_addr(
62 Transaction &t, paddr_t addr, extent_len_t length)
63 {
64 LOG_PREFIX(Cache::retire_extent_addr);
65 TRACET("retire {}~{}", t, addr, length);
66
67 assert(addr.is_real() && !addr.is_block_relative());
68
69 CachedExtentRef ext;
70 auto result = t.get_extent(addr, &ext);
71 if (result == Transaction::get_extent_ret::PRESENT) {
72 DEBUGT("retire {}~{} on t -- {}", t, addr, length, *ext);
73 t.add_to_retired_set(CachedExtentRef(&*ext));
74 return retire_extent_iertr::now();
75 } else if (result == Transaction::get_extent_ret::RETIRED) {
76 ERRORT("retire {}~{} failed, already retired -- {}", t, addr, length, *ext);
77 ceph_abort();
78 }
79
80 // any relative addr must have been on the transaction
81 assert(!addr.is_relative());
82
83 // absent from transaction
84 // retiring is not counted in the cache hit metrics
85 ext = query_cache(addr, nullptr);
86 if (ext) {
87 DEBUGT("retire {}~{} in cache -- {}", t, addr, length, *ext);
88 } else {
89 // add a new placeholder to Cache
90 ext = CachedExtent::make_cached_extent_ref<
91 RetiredExtentPlaceholder>(length);
92 ext->init(CachedExtent::extent_state_t::CLEAN,
93 addr,
94 PLACEMENT_HINT_NULL,
95 NULL_GENERATION,
96 TRANS_ID_NULL);
97 DEBUGT("retire {}~{} as placeholder, add extent -- {}",
98 t, addr, length, *ext);
99 const auto t_src = t.get_src();
100 add_extent(ext, &t_src);
101 }
102 t.add_to_read_set(ext);
103 t.add_to_retired_set(ext);
104 return retire_extent_iertr::now();
105 }
106
107 void Cache::dump_contents()
108 {
109 LOG_PREFIX(Cache::dump_contents);
110 DEBUG("enter");
111 for (auto &&i: extents) {
112 DEBUG("live {}", i);
113 }
114 DEBUG("exit");
115 }
116
117 void Cache::register_metrics()
118 {
119 LOG_PREFIX(Cache::register_metrics);
120 DEBUG("");
121
122 stats = {};
123
124 namespace sm = seastar::metrics;
125 using src_t = Transaction::src_t;
126
127 std::map<src_t, sm::label_instance> labels_by_src {
128 {src_t::MUTATE, sm::label_instance("src", "MUTATE")},
129 {src_t::READ, sm::label_instance("src", "READ")},
130 {src_t::TRIM_DIRTY, sm::label_instance("src", "TRIM_DIRTY")},
131 {src_t::TRIM_ALLOC, sm::label_instance("src", "TRIM_ALLOC")},
132 {src_t::CLEANER_MAIN, sm::label_instance("src", "CLEANER_MAIN")},
133 {src_t::CLEANER_COLD, sm::label_instance("src", "CLEANER_COLD")},
134 };
135 assert(labels_by_src.size() == (std::size_t)src_t::MAX);
136
137 std::map<extent_types_t, sm::label_instance> labels_by_ext {
138 {extent_types_t::ROOT, sm::label_instance("ext", "ROOT")},
139 {extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")},
140 {extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")},
141 {extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")},
142 {extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")},
143 {extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")},
144 {extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")},
145 {extent_types_t::COLL_BLOCK, sm::label_instance("ext", "COLL_BLOCK")},
146 {extent_types_t::OBJECT_DATA_BLOCK, sm::label_instance("ext", "OBJECT_DATA_BLOCK")},
147 {extent_types_t::RETIRED_PLACEHOLDER, sm::label_instance("ext", "RETIRED_PLACEHOLDER")},
148 {extent_types_t::ALLOC_INFO, sm::label_instance("ext", "ALLOC_INFO")},
149 {extent_types_t::JOURNAL_TAIL, sm::label_instance("ext", "JOURNAL_TAIL")},
150 {extent_types_t::TEST_BLOCK, sm::label_instance("ext", "TEST_BLOCK")},
151 {extent_types_t::TEST_BLOCK_PHYSICAL, sm::label_instance("ext", "TEST_BLOCK_PHYSICAL")},
152 {extent_types_t::BACKREF_INTERNAL, sm::label_instance("ext", "BACKREF_INTERNAL")},
153 {extent_types_t::BACKREF_LEAF, sm::label_instance("ext", "BACKREF_LEAF")}
154 };
155 assert(labels_by_ext.size() == (std::size_t)extent_types_t::NONE);
156
157 /*
158 * trans_created
159 */
160 for (auto& [src, src_label] : labels_by_src) {
161 metrics.add_group(
162 "cache",
163 {
164 sm::make_counter(
165 "trans_created",
166 get_by_src(stats.trans_created_by_src, src),
167 sm::description("total number of transactions created"),
168 {src_label}
169 ),
170 }
171 );
172 }
173
174 /*
175 * cache_query: cache_access and cache_hit
176 */
177 for (auto& [src, src_label] : labels_by_src) {
178 metrics.add_group(
179 "cache",
180 {
181 sm::make_counter(
182 "cache_access",
183 get_by_src(stats.cache_query_by_src, src).access,
184 sm::description("total number of cache accesses"),
185 {src_label}
186 ),
187 sm::make_counter(
188 "cache_hit",
189 get_by_src(stats.cache_query_by_src, src).hit,
190 sm::description("total number of cache hits"),
191 {src_label}
192 ),
193 }
194 );
195 }
196
197 {
198 /*
199 * efforts discarded/committed
200 */
201 auto effort_label = sm::label("effort");
202
203 // invalidated efforts
204 using namespace std::literals::string_view_literals;
205 const string_view invalidated_effort_names[] = {
206 "READ"sv,
207 "MUTATE"sv,
208 "RETIRE"sv,
209 "FRESH"sv,
210 "FRESH_OOL_WRITTEN"sv,
211 };
212 for (auto& [src, src_label] : labels_by_src) {
213 auto& efforts = get_by_src(stats.invalidated_efforts_by_src, src);
214 for (auto& [ext, ext_label] : labels_by_ext) {
215 auto& counter = get_by_ext(efforts.num_trans_invalidated, ext);
216 metrics.add_group(
217 "cache",
218 {
219 sm::make_counter(
220 "trans_invalidated_by_extent",
221 counter,
222 sm::description("total number of transactions invalidated by extents"),
223 {src_label, ext_label}
224 ),
225 }
226 );
227 }
228
229 if (src == src_t::READ) {
230 // read transaction won't have non-read efforts
231 auto read_effort_label = effort_label("READ");
232 metrics.add_group(
233 "cache",
234 {
235 sm::make_counter(
236 "invalidated_extents",
237 efforts.read.num,
238 sm::description("extents of invalidated transactions"),
239 {src_label, read_effort_label}
240 ),
241 sm::make_counter(
242 "invalidated_extent_bytes",
243 efforts.read.bytes,
244 sm::description("extent bytes of invalidated transactions"),
245 {src_label, read_effort_label}
246 ),
247 }
248 );
249 continue;
250 }
251
252 // non READ invalidated efforts
253 for (auto& effort_name : invalidated_effort_names) {
254 auto& effort = [&effort_name, &efforts]() -> io_stat_t& {
255 if (effort_name == "READ") {
256 return efforts.read;
257 } else if (effort_name == "MUTATE") {
258 return efforts.mutate;
259 } else if (effort_name == "RETIRE") {
260 return efforts.retire;
261 } else if (effort_name == "FRESH") {
262 return efforts.fresh;
263 } else {
264 assert(effort_name == "FRESH_OOL_WRITTEN");
265 return efforts.fresh_ool_written;
266 }
267 }();
268 metrics.add_group(
269 "cache",
270 {
271 sm::make_counter(
272 "invalidated_extents",
273 effort.num,
274 sm::description("extents of invalidated transactions"),
275 {src_label, effort_label(effort_name)}
276 ),
277 sm::make_counter(
278 "invalidated_extent_bytes",
279 effort.bytes,
280 sm::description("extent bytes of invalidated transactions"),
281 {src_label, effort_label(effort_name)}
282 ),
283 }
284 );
285 } // effort_name
286
287 metrics.add_group(
288 "cache",
289 {
290 sm::make_counter(
291 "trans_invalidated",
292 efforts.total_trans_invalidated,
293 sm::description("total number of transactions invalidated"),
294 {src_label}
295 ),
296 sm::make_counter(
297 "invalidated_delta_bytes",
298 efforts.mutate_delta_bytes,
299 sm::description("delta bytes of invalidated transactions"),
300 {src_label}
301 ),
302 sm::make_counter(
303 "invalidated_ool_records",
304 efforts.num_ool_records,
305 sm::description("number of ool-records from invalidated transactions"),
306 {src_label}
307 ),
308 sm::make_counter(
309 "invalidated_ool_record_bytes",
310 efforts.ool_record_bytes,
311 sm::description("bytes of ool-record from invalidated transactions"),
312 {src_label}
313 ),
314 }
315 );
316 } // src
317
318 // committed efforts
319 const string_view committed_effort_names[] = {
320 "READ"sv,
321 "MUTATE"sv,
322 "RETIRE"sv,
323 "FRESH_INVALID"sv,
324 "FRESH_INLINE"sv,
325 "FRESH_OOL"sv,
326 };
327 for (auto& [src, src_label] : labels_by_src) {
328 if (src == src_t::READ) {
329 // READ transaction won't commit
330 continue;
331 }
332 auto& efforts = get_by_src(stats.committed_efforts_by_src, src);
333 metrics.add_group(
334 "cache",
335 {
336 sm::make_counter(
337 "trans_committed",
338 efforts.num_trans,
339 sm::description("total number of transactions committed"),
340 {src_label}
341 ),
342 sm::make_counter(
343 "committed_ool_records",
344 efforts.num_ool_records,
345 sm::description("number of ool-records from committed transactions"),
346 {src_label}
347 ),
348 sm::make_counter(
349 "committed_ool_record_metadata_bytes",
350 efforts.ool_record_metadata_bytes,
351 sm::description("bytes of ool-record metadata from committed transactions"),
352 {src_label}
353 ),
354 sm::make_counter(
355 "committed_ool_record_data_bytes",
356 efforts.ool_record_data_bytes,
357 sm::description("bytes of ool-record data from committed transactions"),
358 {src_label}
359 ),
360 sm::make_counter(
361 "committed_inline_record_metadata_bytes",
362 efforts.inline_record_metadata_bytes,
363 sm::description("bytes of inline-record metadata from committed transactions"
364 "(excludes delta buffer)"),
365 {src_label}
366 ),
367 }
368 );
369 for (auto& effort_name : committed_effort_names) {
370 auto& effort_by_ext = [&efforts, &effort_name]()
371 -> counter_by_extent_t<io_stat_t>& {
372 if (effort_name == "READ") {
373 return efforts.read_by_ext;
374 } else if (effort_name == "MUTATE") {
375 return efforts.mutate_by_ext;
376 } else if (effort_name == "RETIRE") {
377 return efforts.retire_by_ext;
378 } else if (effort_name == "FRESH_INVALID") {
379 return efforts.fresh_invalid_by_ext;
380 } else if (effort_name == "FRESH_INLINE") {
381 return efforts.fresh_inline_by_ext;
382 } else {
383 assert(effort_name == "FRESH_OOL");
384 return efforts.fresh_ool_by_ext;
385 }
386 }();
387 for (auto& [ext, ext_label] : labels_by_ext) {
388 auto& effort = get_by_ext(effort_by_ext, ext);
389 metrics.add_group(
390 "cache",
391 {
392 sm::make_counter(
393 "committed_extents",
394 effort.num,
395 sm::description("extents of committed transactions"),
396 {src_label, effort_label(effort_name), ext_label}
397 ),
398 sm::make_counter(
399 "committed_extent_bytes",
400 effort.bytes,
401 sm::description("extent bytes of committed transactions"),
402 {src_label, effort_label(effort_name), ext_label}
403 ),
404 }
405 );
406 } // ext
407 } // effort_name
408
409 auto& delta_by_ext = efforts.delta_bytes_by_ext;
410 for (auto& [ext, ext_label] : labels_by_ext) {
411 auto& value = get_by_ext(delta_by_ext, ext);
412 metrics.add_group(
413 "cache",
414 {
415 sm::make_counter(
416 "committed_delta_bytes",
417 value,
418 sm::description("delta bytes of committed transactions"),
419 {src_label, ext_label}
420 ),
421 }
422 );
423 } // ext
424 } // src
425
426 // successful read efforts
427 metrics.add_group(
428 "cache",
429 {
430 sm::make_counter(
431 "trans_read_successful",
432 stats.success_read_efforts.num_trans,
433 sm::description("total number of successful read transactions")
434 ),
435 sm::make_counter(
436 "successful_read_extents",
437 stats.success_read_efforts.read.num,
438 sm::description("extents of successful read transactions")
439 ),
440 sm::make_counter(
441 "successful_read_extent_bytes",
442 stats.success_read_efforts.read.bytes,
443 sm::description("extent bytes of successful read transactions")
444 ),
445 }
446 );
447 }
448
449 /**
450 * Cached extents (including placeholders)
451 *
452 * Dirty extents
453 */
454 metrics.add_group(
455 "cache",
456 {
457 sm::make_counter(
458 "cached_extents",
459 [this] {
460 return extents.size();
461 },
462 sm::description("total number of cached extents")
463 ),
464 sm::make_counter(
465 "cached_extent_bytes",
466 [this] {
467 return extents.get_bytes();
468 },
469 sm::description("total bytes of cached extents")
470 ),
471 sm::make_counter(
472 "dirty_extents",
473 [this] {
474 return dirty.size();
475 },
476 sm::description("total number of dirty extents")
477 ),
478 sm::make_counter(
479 "dirty_extent_bytes",
480 stats.dirty_bytes,
481 sm::description("total bytes of dirty extents")
482 ),
483 sm::make_counter(
484 "cache_lru_size_bytes",
485 [this] {
486 return lru.get_current_contents_bytes();
487 },
488 sm::description("total bytes pinned by the lru")
489 ),
490 sm::make_counter(
491 "cache_lru_size_extents",
492 [this] {
493 return lru.get_current_contents_extents();
494 },
495 sm::description("total extents pinned by the lru")
496 ),
497 }
498 );
499
500 /**
501 * tree stats
502 */
503 auto tree_label = sm::label("tree");
504 auto onode_label = tree_label("ONODE");
505 auto omap_label = tree_label("OMAP");
506 auto lba_label = tree_label("LBA");
507 auto backref_label = tree_label("BACKREF");
508 auto register_tree_metrics = [&labels_by_src, &onode_label, &omap_label, this](
509 const sm::label_instance& tree_label,
510 uint64_t& tree_depth,
511 int64_t& tree_extents_num,
512 counter_by_src_t<tree_efforts_t>& committed_tree_efforts,
513 counter_by_src_t<tree_efforts_t>& invalidated_tree_efforts) {
514 metrics.add_group(
515 "cache",
516 {
517 sm::make_counter(
518 "tree_depth",
519 tree_depth,
520 sm::description("the depth of the tree"),
521 {tree_label}
522 ),
523 sm::make_counter(
524 "tree_extents_num",
525 tree_extents_num,
526 sm::description("number of extents in the tree"),
527 {tree_label}
528 )
529 }
530 );
531 for (auto& [src, src_label] : labels_by_src) {
532 if (src == src_t::READ) {
533 // READ transaction won't contain any tree inserts and erases
534 continue;
535 }
536 if (is_background_transaction(src) &&
537 (tree_label == onode_label ||
538 tree_label == omap_label)) {
539 // CLEANER transaction won't contain any onode/omap tree operations
540 continue;
541 }
542 auto& committed_efforts = get_by_src(committed_tree_efforts, src);
543 auto& invalidated_efforts = get_by_src(invalidated_tree_efforts, src);
544 metrics.add_group(
545 "cache",
546 {
547 sm::make_counter(
548 "tree_inserts_committed",
549 committed_efforts.num_inserts,
550 sm::description("total number of committed insert operations"),
551 {tree_label, src_label}
552 ),
553 sm::make_counter(
554 "tree_erases_committed",
555 committed_efforts.num_erases,
556 sm::description("total number of committed erase operations"),
557 {tree_label, src_label}
558 ),
559 sm::make_counter(
560 "tree_updates_committed",
561 committed_efforts.num_updates,
562 sm::description("total number of committed update operations"),
563 {tree_label, src_label}
564 ),
565 sm::make_counter(
566 "tree_inserts_invalidated",
567 invalidated_efforts.num_inserts,
568 sm::description("total number of invalidated insert operations"),
569 {tree_label, src_label}
570 ),
571 sm::make_counter(
572 "tree_erases_invalidated",
573 invalidated_efforts.num_erases,
574 sm::description("total number of invalidated erase operations"),
575 {tree_label, src_label}
576 ),
577 sm::make_counter(
578 "tree_updates_invalidated",
579 invalidated_efforts.num_updates,
580 sm::description("total number of invalidated update operations"),
581 {tree_label, src_label}
582 ),
583 }
584 );
585 }
586 };
587 register_tree_metrics(
588 onode_label,
589 stats.onode_tree_depth,
590 stats.onode_tree_extents_num,
591 stats.committed_onode_tree_efforts,
592 stats.invalidated_onode_tree_efforts);
593 register_tree_metrics(
594 omap_label,
595 stats.omap_tree_depth,
596 stats.omap_tree_extents_num,
597 stats.committed_omap_tree_efforts,
598 stats.invalidated_omap_tree_efforts);
599 register_tree_metrics(
600 lba_label,
601 stats.lba_tree_depth,
602 stats.lba_tree_extents_num,
603 stats.committed_lba_tree_efforts,
604 stats.invalidated_lba_tree_efforts);
605 register_tree_metrics(
606 backref_label,
607 stats.backref_tree_depth,
608 stats.backref_tree_extents_num,
609 stats.committed_backref_tree_efforts,
610 stats.invalidated_backref_tree_efforts);
611
612 /**
613 * conflict combinations
614 */
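/*
 * A short summary of the two loops below: the first enumerates unordered
 * src pairs (src1 >= src2), skips self-pairs that can never conflict, and
 * registers a metric bound to stats.trans_conflicts_by_srcs[srcs_index - 1]
 * for each remaining pair; the second covers conflicts attributed to an
 * unknown src via stats.trans_conflicts_by_unknown.
 */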
615 auto srcs_label = sm::label("srcs");
616 auto num_srcs = static_cast<std::size_t>(Transaction::src_t::MAX);
617 std::size_t srcs_index = 0;
618 for (uint8_t src2_int = 0; src2_int < num_srcs; ++src2_int) {
619 auto src2 = static_cast<Transaction::src_t>(src2_int);
620 for (uint8_t src1_int = src2_int; src1_int < num_srcs; ++src1_int) {
621 ++srcs_index;
622 auto src1 = static_cast<Transaction::src_t>(src1_int);
623 // impossible combinations
624 // should be consistent with checks in account_conflict()
625 if ((src1 == Transaction::src_t::READ &&
626 src2 == Transaction::src_t::READ) ||
627 (src1 == Transaction::src_t::TRIM_DIRTY &&
628 src2 == Transaction::src_t::TRIM_DIRTY) ||
629 (src1 == Transaction::src_t::CLEANER_MAIN &&
630 src2 == Transaction::src_t::CLEANER_MAIN) ||
631 (src1 == Transaction::src_t::CLEANER_COLD &&
632 src2 == Transaction::src_t::CLEANER_COLD) ||
633 (src1 == Transaction::src_t::TRIM_ALLOC &&
634 src2 == Transaction::src_t::TRIM_ALLOC)) {
635 continue;
636 }
637 std::ostringstream oss;
638 oss << src1 << "," << src2;
639 metrics.add_group(
640 "cache",
641 {
642 sm::make_counter(
643 "trans_srcs_invalidated",
644 stats.trans_conflicts_by_srcs[srcs_index - 1],
645 sm::description("total number of conflicted transactions by src pair"),
646 {srcs_label(oss.str())}
647 ),
648 }
649 );
650 }
651 }
652 assert(srcs_index == NUM_SRC_COMB);
653 srcs_index = 0;
654 for (uint8_t src_int = 0; src_int < num_srcs; ++src_int) {
655 ++srcs_index;
656 auto src = static_cast<Transaction::src_t>(src_int);
657 std::ostringstream oss;
658 oss << "UNKNOWN," << src;
659 metrics.add_group(
660 "cache",
661 {
662 sm::make_counter(
663 "trans_srcs_invalidated",
664 stats.trans_conflicts_by_unknown[srcs_index - 1],
665 sm::description("total number of conflicted transactions by src pair"),
666 {srcs_label(oss.str())}
667 ),
668 }
669 );
670 }
671
672 /**
673 * rewrite version
674 */
675 metrics.add_group(
676 "cache",
677 {
678 sm::make_counter(
679 "version_count_dirty",
680 stats.committed_dirty_version.num,
681 sm::description("total number of rewrite-dirty extents")
682 ),
683 sm::make_counter(
684 "version_sum_dirty",
685 stats.committed_dirty_version.version,
686 sm::description("sum of the version from rewrite-dirty extents")
687 ),
688 sm::make_counter(
689 "version_count_reclaim",
690 stats.committed_reclaim_version.num,
691 sm::description("total number of rewrite-reclaim extents")
692 ),
693 sm::make_counter(
694 "version_sum_reclaim",
695 stats.committed_reclaim_version.version,
696 sm::description("sum of the version from rewrite-reclaim extents")
697 ),
698 }
699 );
700 }
701
702 void Cache::add_extent(
703 CachedExtentRef ref,
704 const Transaction::src_t* p_src=nullptr)
705 {
706 assert(ref->is_valid());
707 assert(ref->user_hint == PLACEMENT_HINT_NULL);
708 assert(ref->rewrite_generation == NULL_GENERATION);
709 extents.insert(*ref);
710 if (ref->is_dirty()) {
711 add_to_dirty(ref);
712 } else {
713 touch_extent(*ref, p_src);
714 }
715 }
716
717 void Cache::mark_dirty(CachedExtentRef ref)
718 {
719 if (ref->is_dirty()) {
720 assert(ref->primary_ref_list_hook.is_linked());
721 return;
722 }
723
724 lru.remove_from_lru(*ref);
725 ref->state = CachedExtent::extent_state_t::DIRTY;
726 add_to_dirty(ref);
727 }
728
729 void Cache::add_to_dirty(CachedExtentRef ref)
730 {
731 assert(ref->is_dirty());
732 assert(!ref->primary_ref_list_hook.is_linked());
733 ceph_assert(ref->get_modify_time() != NULL_TIME);
734 intrusive_ptr_add_ref(&*ref);
735 dirty.push_back(*ref);
736 stats.dirty_bytes += ref->get_length();
737 }
738
739 void Cache::remove_from_dirty(CachedExtentRef ref)
740 {
741 if (ref->is_dirty()) {
742 ceph_assert(ref->primary_ref_list_hook.is_linked());
743 stats.dirty_bytes -= ref->get_length();
744 dirty.erase(dirty.s_iterator_to(*ref));
745 intrusive_ptr_release(&*ref);
746 } else {
747 ceph_assert(!ref->primary_ref_list_hook.is_linked());
748 }
749 }
750
751 void Cache::remove_extent(CachedExtentRef ref)
752 {
753 assert(ref->is_valid());
754 if (ref->is_dirty()) {
755 remove_from_dirty(ref);
756 } else if (!ref->is_placeholder()) {
757 lru.remove_from_lru(*ref);
758 }
759 extents.erase(*ref);
760 }
761
762 void Cache::commit_retire_extent(
763 Transaction& t,
764 CachedExtentRef ref)
765 {
766 remove_extent(ref);
767
768 ref->dirty_from_or_retired_at = JOURNAL_SEQ_NULL;
769 invalidate_extent(t, *ref);
770 }
771
772 void Cache::commit_replace_extent(
773 Transaction& t,
774 CachedExtentRef next,
775 CachedExtentRef prev)
776 {
777 assert(next->is_dirty());
778 assert(next->get_paddr() == prev->get_paddr());
779 assert(next->version == prev->version + 1);
780 extents.replace(*next, *prev);
781
782 if (prev->get_type() == extent_types_t::ROOT) {
783 assert(prev->is_stable_clean()
784 || prev->primary_ref_list_hook.is_linked());
785 if (prev->is_dirty()) {
786 stats.dirty_bytes -= prev->get_length();
787 dirty.erase(dirty.s_iterator_to(*prev));
788 intrusive_ptr_release(&*prev);
789 }
790 add_to_dirty(next);
791 } else if (prev->is_dirty()) {
792 assert(prev->get_dirty_from() == next->get_dirty_from());
793 assert(prev->primary_ref_list_hook.is_linked());
794 auto prev_it = dirty.iterator_to(*prev);
795 dirty.insert(prev_it, *next);
796 dirty.erase(prev_it);
797 intrusive_ptr_release(&*prev);
798 intrusive_ptr_add_ref(&*next);
799 } else {
800 lru.remove_from_lru(*prev);
801 add_to_dirty(next);
802 }
803
804 next->on_replace_prior(t);
805 invalidate_extent(t, *prev);
806 }
807
808 void Cache::invalidate_extent(
809 Transaction& t,
810 CachedExtent& extent)
811 {
812 if (!extent.may_conflict()) {
813 assert(extent.transactions.empty());
814 extent.set_invalid(t);
815 return;
816 }
817
818 LOG_PREFIX(Cache::invalidate_extent);
819 bool do_conflict_log = true;
820 for (auto &&i: extent.transactions) {
821 if (!i.t->conflicted) {
822 if (do_conflict_log) {
823 SUBDEBUGT(seastore_t, "conflict begin -- {}", t, extent);
824 do_conflict_log = false;
825 }
826 assert(!i.t->is_weak());
827 account_conflict(t.get_src(), i.t->get_src());
828 mark_transaction_conflicted(*i.t, extent);
829 }
830 }
831 extent.set_invalid(t);
832 }
833
834 void Cache::mark_transaction_conflicted(
835 Transaction& t, CachedExtent& conflicting_extent)
836 {
837 LOG_PREFIX(Cache::mark_transaction_conflicted);
838 SUBTRACET(seastore_t, "", t);
839 assert(!t.conflicted);
840 t.conflicted = true;
841
842 auto& efforts = get_by_src(stats.invalidated_efforts_by_src,
843 t.get_src());
844 ++efforts.total_trans_invalidated;
845
846 auto& counter = get_by_ext(efforts.num_trans_invalidated,
847 conflicting_extent.get_type());
848 ++counter;
849
850 io_stat_t read_stat;
851 for (auto &i: t.read_set) {
852 read_stat.increment(i.ref->get_length());
853 }
854 efforts.read.increment_stat(read_stat);
855
856 if (t.get_src() != Transaction::src_t::READ) {
857 io_stat_t retire_stat;
858 for (auto &i: t.retired_set) {
859 retire_stat.increment(i->get_length());
860 }
861 efforts.retire.increment_stat(retire_stat);
862
863 auto& fresh_stat = t.get_fresh_block_stats();
864 efforts.fresh.increment_stat(fresh_stat);
865
866 io_stat_t delta_stat;
867 for (auto &i: t.mutated_block_list) {
868 if (!i->is_valid()) {
869 continue;
870 }
871 efforts.mutate.increment(i->get_length());
872 delta_stat.increment(i->get_delta().length());
873 }
874 efforts.mutate_delta_bytes += delta_stat.bytes;
875
876 for (auto &i: t.pre_alloc_list) {
877 epm.mark_space_free(i->get_paddr(), i->get_length());
878 }
879
880 auto& ool_stats = t.get_ool_write_stats();
881 efforts.fresh_ool_written.increment_stat(ool_stats.extents);
882 efforts.num_ool_records += ool_stats.num_records;
883 auto ool_record_bytes = (ool_stats.md_bytes + ool_stats.get_data_bytes());
884 efforts.ool_record_bytes += ool_record_bytes;
885
886 if (is_background_transaction(t.get_src())) {
887 // CLEANER transaction won't contain any onode/omap tree operations
888 assert(t.onode_tree_stats.is_clear());
889 assert(t.omap_tree_stats.is_clear());
890 } else {
891 get_by_src(stats.invalidated_onode_tree_efforts, t.get_src()
892 ).increment(t.onode_tree_stats);
893 get_by_src(stats.invalidated_omap_tree_efforts, t.get_src()
894 ).increment(t.omap_tree_stats);
895 }
896
897 get_by_src(stats.invalidated_lba_tree_efforts, t.get_src()
898 ).increment(t.lba_tree_stats);
899 get_by_src(stats.invalidated_backref_tree_efforts, t.get_src()
900 ).increment(t.backref_tree_stats);
901
902 SUBDEBUGT(seastore_t,
903 "discard {} read, {} fresh, {} delta, {} retire, {}({}B) ool-records",
904 t,
905 read_stat,
906 fresh_stat,
907 delta_stat,
908 retire_stat,
909 ool_stats.num_records,
910 ool_record_bytes);
911 } else {
912 // read transaction won't have non-read efforts
913 assert(t.retired_set.empty());
914 assert(t.get_fresh_block_stats().is_clear());
915 assert(t.mutated_block_list.empty());
916 assert(t.get_ool_write_stats().is_clear());
917 assert(t.onode_tree_stats.is_clear());
918 assert(t.omap_tree_stats.is_clear());
919 assert(t.lba_tree_stats.is_clear());
920 assert(t.backref_tree_stats.is_clear());
921 SUBDEBUGT(seastore_t, "discard {} read", t, read_stat);
922 }
923 }
924
925 void Cache::on_transaction_destruct(Transaction& t)
926 {
927 LOG_PREFIX(Cache::on_transaction_destruct);
928 SUBTRACET(seastore_t, "", t);
929 if (t.get_src() == Transaction::src_t::READ &&
930 t.conflicted == false) {
931 io_stat_t read_stat;
932 for (auto &i: t.read_set) {
933 read_stat.increment(i.ref->get_length());
934 }
935 SUBDEBUGT(seastore_t, "done {} read", t, read_stat);
936
937 if (!t.is_weak()) {
938 // exclude weak transaction as it is impossible to conflict
939 ++stats.success_read_efforts.num_trans;
940 stats.success_read_efforts.read.increment_stat(read_stat);
941 }
942
943 // read transaction won't have non-read efforts
944 assert(t.retired_set.empty());
945 assert(t.get_fresh_block_stats().is_clear());
946 assert(t.mutated_block_list.empty());
947 assert(t.onode_tree_stats.is_clear());
948 assert(t.omap_tree_stats.is_clear());
949 assert(t.lba_tree_stats.is_clear());
950 assert(t.backref_tree_stats.is_clear());
951 }
952 }
953
954 CachedExtentRef Cache::alloc_new_extent_by_type(
955 Transaction &t, ///< [in, out] current transaction
956 extent_types_t type, ///< [in] type tag
957 extent_len_t length, ///< [in] length
958 placement_hint_t hint, ///< [in] user hint
959 rewrite_gen_t gen ///< [in] rewrite generation
960 )
961 {
962 LOG_PREFIX(Cache::alloc_new_extent_by_type);
963 SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}",
964 t, type, length, hint, rewrite_gen_printer_t{gen});
965 switch (type) {
966 case extent_types_t::ROOT:
967 ceph_assert(0 == "ROOT is never directly alloc'd");
968 return CachedExtentRef();
969 case extent_types_t::LADDR_INTERNAL:
970 return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length, hint, gen);
971 case extent_types_t::LADDR_LEAF:
972 return alloc_new_extent<lba_manager::btree::LBALeafNode>(
973 t, length, hint, gen);
974 case extent_types_t::ONODE_BLOCK_STAGED:
975 return alloc_new_extent<onode::SeastoreNodeExtent>(t, length, hint, gen);
976 case extent_types_t::OMAP_INNER:
977 return alloc_new_extent<omap_manager::OMapInnerNode>(t, length, hint, gen);
978 case extent_types_t::OMAP_LEAF:
979 return alloc_new_extent<omap_manager::OMapLeafNode>(t, length, hint, gen);
980 case extent_types_t::COLL_BLOCK:
981 return alloc_new_extent<collection_manager::CollectionNode>(t, length, hint, gen);
982 case extent_types_t::OBJECT_DATA_BLOCK:
983 return alloc_new_extent<ObjectDataBlock>(t, length, hint, gen);
984 case extent_types_t::RETIRED_PLACEHOLDER:
985 ceph_assert(0 == "impossible");
986 return CachedExtentRef();
987 case extent_types_t::TEST_BLOCK:
988 return alloc_new_extent<TestBlock>(t, length, hint, gen);
989 case extent_types_t::TEST_BLOCK_PHYSICAL:
990 return alloc_new_extent<TestBlockPhysical>(t, length, hint, gen);
991 case extent_types_t::NONE: {
992 ceph_assert(0 == "NONE is an invalid extent type");
993 return CachedExtentRef();
994 }
995 default:
996 ceph_assert(0 == "impossible");
997 return CachedExtentRef();
998 }
999 }
1000
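/*
 * duplicate_for_write: returns the extent unchanged if it is already
 * mutable.  An EXIST_CLEAN extent is switched to EXIST_MUTATION_PENDING in
 * place with a deep copy of its buffer; any other extent gets a
 * MUTATION_PENDING copy linked to the original through prior_instance and
 * mutation_pendings and added to the transaction's mutated extents.
 */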
1001 CachedExtentRef Cache::duplicate_for_write(
1002 Transaction &t,
1003 CachedExtentRef i) {
1004 LOG_PREFIX(Cache::duplicate_for_write);
1005 assert(i->is_fully_loaded());
1006
1007 if (i->is_mutable())
1008 return i;
1009
1010 if (i->is_exist_clean()) {
1011 i->version++;
1012 i->state = CachedExtent::extent_state_t::EXIST_MUTATION_PENDING;
1013 i->last_committed_crc = i->get_crc32c();
1014 // deep-copy the buffer of the exist-clean extent because it shares
1015 // its buffer with the original clean extent.
1016 auto bp = i->get_bptr();
1017 auto nbp = ceph::bufferptr(bp.c_str(), bp.length());
1018 i->set_bptr(std::move(nbp));
1019
1020 t.add_mutated_extent(i);
1021 DEBUGT("duplicate existing extent {}", t, *i);
1022 return i;
1023 }
1024
1025 auto ret = i->duplicate_for_write(t);
1026 ret->pending_for_transaction = t.get_trans_id();
1027 ret->prior_instance = i;
1028 // duplicate_for_write won't occur after ool write finished
1029 assert(!i->prior_poffset);
1030 auto [iter, inserted] = i->mutation_pendings.insert(*ret);
1031 ceph_assert(inserted);
1032 t.add_mutated_extent(ret);
1033 if (ret->get_type() == extent_types_t::ROOT) {
1034 t.root = ret->cast<RootBlock>();
1035 } else {
1036 ret->last_committed_crc = i->last_committed_crc;
1037 }
1038
1039 ret->version++;
1040 ret->state = CachedExtent::extent_state_t::MUTATION_PENDING;
1041 DEBUGT("{} -> {}", t, *i, *ret);
1042 return ret;
1043 }
1044
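/*
 * prepare_record assembles the journal record for a committing transaction:
 * it accounts the read_set, serializes one delta per surviving mutated
 * block (the ROOT delta is special-cased), turns the retired_set and the
 * fresh inline/ool/existing extents into ALLOC_INFO deltas, appends the
 * fresh inline extent data, and, for background transactions, appends a
 * JOURNAL_TAIL delta carrying the updated dirty and alloc tails.
 */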
1045 record_t Cache::prepare_record(
1046 Transaction &t,
1047 const journal_seq_t &journal_head,
1048 const journal_seq_t &journal_dirty_tail)
1049 {
1050 LOG_PREFIX(Cache::prepare_record);
1051 SUBTRACET(seastore_t, "enter", t);
1052
1053 auto trans_src = t.get_src();
1054 assert(!t.is_weak());
1055 assert(trans_src != Transaction::src_t::READ);
1056
1057 auto& efforts = get_by_src(stats.committed_efforts_by_src,
1058 trans_src);
1059
1060 // Should be valid due to interruptible future
1061 io_stat_t read_stat;
1062 for (auto &i: t.read_set) {
1063 if (!i.ref->is_valid()) {
1064 SUBERRORT(seastore_t,
1065 "read_set got invalid extent, aborting -- {}", t, *i.ref);
1066 ceph_abort("no invalid extent allowed in transactions' read_set");
1067 }
1068 get_by_ext(efforts.read_by_ext,
1069 i.ref->get_type()).increment(i.ref->get_length());
1070 read_stat.increment(i.ref->get_length());
1071 }
1072 t.read_set.clear();
1073 t.write_set.clear();
1074
1075 record_t record(trans_src);
1076 auto commit_time = seastar::lowres_system_clock::now();
1077
1078 // Add new copy of mutated blocks, set_io_wait to block until written
1079 record.deltas.reserve(t.mutated_block_list.size());
1080 io_stat_t delta_stat;
1081 for (auto &i: t.mutated_block_list) {
1082 if (!i->is_valid()) {
1083 DEBUGT("invalid mutated extent -- {}", t, *i);
1084 continue;
1085 }
1086 assert(i->is_exist_mutation_pending() ||
1087 i->prior_instance);
1088 get_by_ext(efforts.mutate_by_ext,
1089 i->get_type()).increment(i->get_length());
1090
1091 auto delta_bl = i->get_delta();
1092 auto delta_length = delta_bl.length();
1093 i->set_modify_time(commit_time);
1094 DEBUGT("mutated extent with {}B delta -- {}",
1095 t, delta_length, *i);
1096 if (!i->is_exist_mutation_pending()) {
1097 DEBUGT("commit replace extent ... -- {}, prior={}",
1098 t, *i, *i->prior_instance);
1099 // extents with EXIST_MUTATION_PENDING don't have the
1100 // prior_instance field, so skip them here.
1101 // The existing extents are added into Cache during
1102 // complete_commit to stay in sync with the gc transaction.
1103 commit_replace_extent(t, i, i->prior_instance);
1104 }
1105
1106 i->prepare_write();
1107 i->set_io_wait();
1108 i->prepare_commit();
1109
1110 assert(i->get_version() > 0);
1111 auto final_crc = i->get_crc32c();
1112 if (i->get_type() == extent_types_t::ROOT) {
1113 SUBTRACET(seastore_t, "writing out root delta {}B -- {}",
1114 t, delta_length, *i);
1115 assert(t.root == i);
1116 root = t.root;
1117 record.push_back(
1118 delta_info_t{
1119 extent_types_t::ROOT,
1120 P_ADDR_NULL,
1121 L_ADDR_NULL,
1122 0,
1123 0,
1124 0,
1125 t.root->get_version() - 1,
1126 MAX_SEG_SEQ,
1127 segment_type_t::NULL_SEG,
1128 std::move(delta_bl)
1129 });
1130 } else {
1131 auto sseq = NULL_SEG_SEQ;
1132 auto stype = segment_type_t::NULL_SEG;
1133
1134 // FIXME: This is specific to the segmented implementation
1135 if (i->get_paddr().get_addr_type() == paddr_types_t::SEGMENT) {
1136 auto sid = i->get_paddr().as_seg_paddr().get_segment_id();
1137 auto sinfo = get_segment_info(sid);
1138 if (sinfo) {
1139 sseq = sinfo->seq;
1140 stype = sinfo->type;
1141 }
1142 }
1143
1144 record.push_back(
1145 delta_info_t{
1146 i->get_type(),
1147 i->get_paddr(),
1148 (i->is_logical()
1149 ? i->cast<LogicalCachedExtent>()->get_laddr()
1150 : L_ADDR_NULL),
1151 i->last_committed_crc,
1152 final_crc,
1153 i->get_length(),
1154 i->get_version() - 1,
1155 sseq,
1156 stype,
1157 std::move(delta_bl)
1158 });
1159 i->last_committed_crc = final_crc;
1160 }
1161 assert(delta_length);
1162 get_by_ext(efforts.delta_bytes_by_ext,
1163 i->get_type()) += delta_length;
1164 delta_stat.increment(delta_length);
1165 }
1166
1167 // Transaction is now a go, set up in-memory cache state
1168 // invalidate now invalid blocks
1169 io_stat_t retire_stat;
1170 std::vector<alloc_delta_t> alloc_deltas;
1171 alloc_delta_t rel_delta;
1172 rel_delta.op = alloc_delta_t::op_types_t::CLEAR;
1173 for (auto &i: t.retired_set) {
1174 get_by_ext(efforts.retire_by_ext,
1175 i->get_type()).increment(i->get_length());
1176 retire_stat.increment(i->get_length());
1177 DEBUGT("retired and remove extent -- {}", t, *i);
1178 commit_retire_extent(t, i);
1179 if (is_backref_mapped_extent_node(i)
1180 || is_retired_placeholder(i->get_type())) {
1181 rel_delta.alloc_blk_ranges.emplace_back(
1182 i->get_paddr(),
1183 L_ADDR_NULL,
1184 i->get_length(),
1185 i->get_type());
1186 }
1187 }
1188 alloc_deltas.emplace_back(std::move(rel_delta));
1189
1190 record.extents.reserve(t.inline_block_list.size());
1191 io_stat_t fresh_stat;
1192 io_stat_t fresh_invalid_stat;
1193 alloc_delta_t alloc_delta;
1194 alloc_delta.op = alloc_delta_t::op_types_t::SET;
1195 for (auto &i: t.inline_block_list) {
1196 if (!i->is_valid()) {
1197 DEBUGT("invalid fresh inline extent -- {}", t, *i);
1198 fresh_invalid_stat.increment(i->get_length());
1199 get_by_ext(efforts.fresh_invalid_by_ext,
1200 i->get_type()).increment(i->get_length());
1201 } else {
1202 TRACET("fresh inline extent -- {}", t, *i);
1203 }
1204 fresh_stat.increment(i->get_length());
1205 get_by_ext(efforts.fresh_inline_by_ext,
1206 i->get_type()).increment(i->get_length());
1207 assert(i->is_inline() || i->get_paddr().is_fake());
1208
1209 bufferlist bl;
1210 i->prepare_write();
1211 i->prepare_commit();
1212 bl.append(i->get_bptr());
1213 if (i->get_type() == extent_types_t::ROOT) {
1214 ceph_assert(0 == "ROOT never gets written as a fresh block");
1215 }
1216
1217 assert(bl.length() == i->get_length());
1218 auto modify_time = i->get_modify_time();
1219 if (modify_time == NULL_TIME) {
1220 modify_time = commit_time;
1221 }
1222 record.push_back(extent_t{
1223 i->get_type(),
1224 i->is_logical()
1225 ? i->cast<LogicalCachedExtent>()->get_laddr()
1226 : (is_lba_node(i->get_type())
1227 ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
1228 : L_ADDR_NULL),
1229 std::move(bl)
1230 },
1231 modify_time);
1232 if (i->is_valid()
1233 && is_backref_mapped_extent_node(i)) {
1234 alloc_delta.alloc_blk_ranges.emplace_back(
1235 i->get_paddr(),
1236 i->is_logical()
1237 ? i->cast<LogicalCachedExtent>()->get_laddr()
1238 : (is_lba_node(i->get_type())
1239 ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
1240 : L_ADDR_NULL),
1241 i->get_length(),
1242 i->get_type());
1243 }
1244 }
1245
1246 for (auto &i: t.written_ool_block_list) {
1247 TRACET("fresh ool extent -- {}", t, *i);
1248 ceph_assert(i->is_valid());
1249 assert(!i->is_inline());
1250 get_by_ext(efforts.fresh_ool_by_ext,
1251 i->get_type()).increment(i->get_length());
1252 i->prepare_commit();
1253 if (is_backref_mapped_extent_node(i)) {
1254 alloc_delta.alloc_blk_ranges.emplace_back(
1255 i->get_paddr(),
1256 i->is_logical()
1257 ? i->cast<LogicalCachedExtent>()->get_laddr()
1258 : i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin,
1259 i->get_length(),
1260 i->get_type());
1261 }
1262 }
1263
1264 for (auto &i: t.existing_block_list) {
1265 if (i->is_valid()) {
1266 alloc_delta.alloc_blk_ranges.emplace_back(
1267 i->get_paddr(),
1268 i->cast<LogicalCachedExtent>()->get_laddr(),
1269 i->get_length(),
1270 i->get_type());
1271 }
1272 }
1273 alloc_deltas.emplace_back(std::move(alloc_delta));
1274
1275 for (auto b : alloc_deltas) {
1276 bufferlist bl;
1277 encode(b, bl);
1278 delta_info_t delta;
1279 delta.type = extent_types_t::ALLOC_INFO;
1280 delta.bl = bl;
1281 record.push_back(std::move(delta));
1282 }
1283
1284 if (is_background_transaction(trans_src)) {
1285 assert(journal_head != JOURNAL_SEQ_NULL);
1286 assert(journal_dirty_tail != JOURNAL_SEQ_NULL);
1287 journal_seq_t dirty_tail;
1288 auto maybe_dirty_tail = get_oldest_dirty_from();
1289 if (!maybe_dirty_tail.has_value()) {
1290 dirty_tail = journal_head;
1291 SUBINFOT(seastore_t, "dirty_tail all trimmed, set to head {}, src={}",
1292 t, dirty_tail, trans_src);
1293 } else if (*maybe_dirty_tail == JOURNAL_SEQ_NULL) {
1294 dirty_tail = journal_dirty_tail;
1295 SUBINFOT(seastore_t, "dirty_tail is pending, set to {}, src={}",
1296 t, dirty_tail, trans_src);
1297 } else {
1298 dirty_tail = *maybe_dirty_tail;
1299 }
1300 ceph_assert(dirty_tail != JOURNAL_SEQ_NULL);
1301 journal_seq_t alloc_tail;
1302 auto maybe_alloc_tail = get_oldest_backref_dirty_from();
1303 if (!maybe_alloc_tail.has_value()) {
1304 // FIXME: the replay point of the allocations needs to be accurate;
1305 // setting alloc_tail to get_journal_head() means replay cannot skip
1306 // the last, unnecessary record.
1307 alloc_tail = journal_head;
1308 SUBINFOT(seastore_t, "alloc_tail all trimmed, set to head {}, src={}",
1309 t, alloc_tail, trans_src);
1310 } else if (*maybe_alloc_tail == JOURNAL_SEQ_NULL) {
1311 ceph_abort("impossible");
1312 } else {
1313 alloc_tail = *maybe_alloc_tail;
1314 }
1315 ceph_assert(alloc_tail != JOURNAL_SEQ_NULL);
1316 auto tails = journal_tail_delta_t{alloc_tail, dirty_tail};
1317 SUBDEBUGT(seastore_t, "update tails as delta {}", t, tails);
1318 bufferlist bl;
1319 encode(tails, bl);
1320 delta_info_t delta;
1321 delta.type = extent_types_t::JOURNAL_TAIL;
1322 delta.bl = bl;
1323 record.push_back(std::move(delta));
1324 }
1325
1326 ceph_assert(t.get_fresh_block_stats().num ==
1327 t.inline_block_list.size() +
1328 t.written_ool_block_list.size() +
1329 t.num_delayed_invalid_extents +
1330 t.num_allocated_invalid_extents);
1331
1332 auto& ool_stats = t.get_ool_write_stats();
1333 ceph_assert(ool_stats.extents.num == t.written_ool_block_list.size());
1334
1335 if (record.is_empty()) {
1336 SUBINFOT(seastore_t,
1337 "record to submit is empty, src={}", t, trans_src);
1338 assert(t.onode_tree_stats.is_clear());
1339 assert(t.omap_tree_stats.is_clear());
1340 assert(t.lba_tree_stats.is_clear());
1341 assert(t.backref_tree_stats.is_clear());
1342 assert(ool_stats.is_clear());
1343 }
1344
1345 if (record.modify_time == NULL_TIME) {
1346 record.modify_time = commit_time;
1347 }
1348
1349 SUBDEBUGT(seastore_t,
1350 "commit H{} dirty_from={}, alloc_from={}, "
1351 "{} read, {} fresh with {} invalid, "
1352 "{} delta, {} retire, {}(md={}B, data={}B) ool-records, "
1353 "{}B md, {}B data, modify_time={}",
1354 t, (void*)&t.get_handle(),
1355 get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
1356 get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL),
1357 read_stat,
1358 fresh_stat,
1359 fresh_invalid_stat,
1360 delta_stat,
1361 retire_stat,
1362 ool_stats.num_records,
1363 ool_stats.md_bytes,
1364 ool_stats.get_data_bytes(),
1365 record.size.get_raw_mdlength(),
1366 record.size.dlength,
1367 sea_time_point_printer_t{record.modify_time});
1368 if (is_background_transaction(trans_src)) {
1369 // background transaction won't contain any onode tree operations
1370 assert(t.onode_tree_stats.is_clear());
1371 assert(t.omap_tree_stats.is_clear());
1372 } else {
1373 if (t.onode_tree_stats.depth) {
1374 stats.onode_tree_depth = t.onode_tree_stats.depth;
1375 }
1376 if (t.omap_tree_stats.depth) {
1377 stats.omap_tree_depth = t.omap_tree_stats.depth;
1378 }
1379 stats.onode_tree_extents_num += t.onode_tree_stats.extents_num_delta;
1380 ceph_assert(stats.onode_tree_extents_num >= 0);
1381 get_by_src(stats.committed_onode_tree_efforts, trans_src
1382 ).increment(t.onode_tree_stats);
1383 stats.omap_tree_extents_num += t.omap_tree_stats.extents_num_delta;
1384 ceph_assert(stats.omap_tree_extents_num >= 0);
1385 get_by_src(stats.committed_omap_tree_efforts, trans_src
1386 ).increment(t.omap_tree_stats);
1387 }
1388
1389 if (t.lba_tree_stats.depth) {
1390 stats.lba_tree_depth = t.lba_tree_stats.depth;
1391 }
1392 stats.lba_tree_extents_num += t.lba_tree_stats.extents_num_delta;
1393 ceph_assert(stats.lba_tree_extents_num >= 0);
1394 get_by_src(stats.committed_lba_tree_efforts, trans_src
1395 ).increment(t.lba_tree_stats);
1396 if (t.backref_tree_stats.depth) {
1397 stats.backref_tree_depth = t.backref_tree_stats.depth;
1398 }
1399 stats.backref_tree_extents_num += t.backref_tree_stats.extents_num_delta;
1400 ceph_assert(stats.backref_tree_extents_num >= 0);
1401 get_by_src(stats.committed_backref_tree_efforts, trans_src
1402 ).increment(t.backref_tree_stats);
1403
1404 ++(efforts.num_trans);
1405 efforts.num_ool_records += ool_stats.num_records;
1406 efforts.ool_record_metadata_bytes += ool_stats.md_bytes;
1407 efforts.ool_record_data_bytes += ool_stats.get_data_bytes();
1408 efforts.inline_record_metadata_bytes +=
1409 (record.size.get_raw_mdlength() - record.get_delta_size());
1410
1411 auto &rewrite_version_stats = t.get_rewrite_version_stats();
1412 if (trans_src == Transaction::src_t::TRIM_DIRTY) {
1413 stats.committed_dirty_version.increment_stat(rewrite_version_stats);
1414 } else if (trans_src == Transaction::src_t::CLEANER_MAIN ||
1415 trans_src == Transaction::src_t::CLEANER_COLD) {
1416 stats.committed_reclaim_version.increment_stat(rewrite_version_stats);
1417 } else {
1418 assert(rewrite_version_stats.is_clear());
1419 }
1420
1421 return record;
1422 }
1423
1424 void Cache::backref_batch_update(
1425 std::vector<backref_entry_ref> &&list,
1426 const journal_seq_t &seq)
1427 {
1428 LOG_PREFIX(Cache::backref_batch_update);
1429 DEBUG("inserting {} entries at {}", list.size(), seq);
1430 ceph_assert(seq != JOURNAL_SEQ_NULL);
1431
1432 for (auto &ent : list) {
1433 backref_entry_mset.insert(*ent);
1434 }
1435
1436 auto iter = backref_entryrefs_by_seq.find(seq);
1437 if (iter == backref_entryrefs_by_seq.end()) {
1438 backref_entryrefs_by_seq.emplace(seq, std::move(list));
1439 } else {
1440 iter->second.insert(
1441 iter->second.end(),
1442 std::make_move_iterator(list.begin()),
1443 std::make_move_iterator(list.end()));
1444 }
1445 }
1446
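/*
 * complete_commit runs once the record has been submitted to the journal:
 * fresh extents get their final paddr (record-relative addresses are
 * rebased onto final_block_start), become CLEAN and are indexed; mutated
 * extents become DIRTY (dirty_from is set to start_seq for first versions
 * and the root); retired extents release their space through the epm; and
 * the corresponding backref entries are inserted in one batch.
 */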
1447 void Cache::complete_commit(
1448 Transaction &t,
1449 paddr_t final_block_start,
1450 journal_seq_t start_seq)
1451 {
1452 LOG_PREFIX(Cache::complete_commit);
1453 SUBTRACET(seastore_t, "final_block_start={}, start_seq={}",
1454 t, final_block_start, start_seq);
1455
1456 std::vector<backref_entry_ref> backref_list;
1457 t.for_each_fresh_block([&](const CachedExtentRef &i) {
1458 if (!i->is_valid()) {
1459 return;
1460 }
1461
1462 bool is_inline = false;
1463 if (i->is_inline()) {
1464 is_inline = true;
1465 i->set_paddr(final_block_start.add_relative(i->get_paddr()));
1466 }
1467 i->last_committed_crc = i->get_crc32c();
1468 i->pending_for_transaction = TRANS_ID_NULL;
1469 i->on_initial_write();
1470
1471 i->state = CachedExtent::extent_state_t::CLEAN;
1472 DEBUGT("add extent as fresh, inline={} -- {}",
1473 t, is_inline, *i);
1474 const auto t_src = t.get_src();
1475 i->invalidate_hints();
1476 add_extent(i, &t_src);
1477 epm.commit_space_used(i->get_paddr(), i->get_length());
1478 if (is_backref_mapped_extent_node(i)) {
1479 DEBUGT("backref_list new {} len {}",
1480 t,
1481 i->get_paddr(),
1482 i->get_length());
1483 backref_list.emplace_back(
1484 std::make_unique<backref_entry_t>(
1485 i->get_paddr(),
1486 i->is_logical()
1487 ? i->cast<LogicalCachedExtent>()->get_laddr()
1488 : (is_lba_node(i->get_type())
1489 ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
1490 : L_ADDR_NULL),
1491 i->get_length(),
1492 i->get_type(),
1493 start_seq));
1494 } else if (is_backref_node(i->get_type())) {
1495 add_backref_extent(
1496 i->get_paddr(),
1497 i->cast<backref::BackrefNode>()->get_node_meta().begin,
1498 i->get_type());
1499 } else {
1500 ERRORT("{}", t, *i);
1501 ceph_abort("not possible");
1502 }
1503 });
1504
1505 // Finalize mutated blocks: apply the final record base and mark them DIRTY
1506 for (auto &i: t.mutated_block_list) {
1507 if (!i->is_valid()) {
1508 continue;
1509 }
1510 assert(i->is_exist_mutation_pending() ||
1511 i->prior_instance);
1512 i->on_delta_write(final_block_start);
1513 i->pending_for_transaction = TRANS_ID_NULL;
1514 i->prior_instance = CachedExtentRef();
1515 i->state = CachedExtent::extent_state_t::DIRTY;
1516 assert(i->version > 0);
1517 if (i->version == 1 || i->get_type() == extent_types_t::ROOT) {
1518 i->dirty_from_or_retired_at = start_seq;
1519 DEBUGT("commit extent done, become dirty -- {}", t, *i);
1520 } else {
1521 DEBUGT("commit extent done -- {}", t, *i);
1522 }
1523 }
1524
1525 for (auto &i: t.retired_set) {
1526 epm.mark_space_free(i->get_paddr(), i->get_length());
1527 }
1528 for (auto &i: t.existing_block_list) {
1529 if (i->is_valid()) {
1530 epm.mark_space_used(i->get_paddr(), i->get_length());
1531 }
1532 }
1533
1534 for (auto &i: t.mutated_block_list) {
1535 if (!i->is_valid()) {
1536 continue;
1537 }
1538 i->complete_io();
1539 }
1540
1541 last_commit = start_seq;
1542 for (auto &i: t.retired_set) {
1543 i->dirty_from_or_retired_at = start_seq;
1544 if (is_backref_mapped_extent_node(i)
1545 || is_retired_placeholder(i->get_type())) {
1546 DEBUGT("backref_list free {} len {}",
1547 t,
1548 i->get_paddr(),
1549 i->get_length());
1550 backref_list.emplace_back(
1551 std::make_unique<backref_entry_t>(
1552 i->get_paddr(),
1553 L_ADDR_NULL,
1554 i->get_length(),
1555 i->get_type(),
1556 start_seq));
1557 } else if (is_backref_node(i->get_type())) {
1558 remove_backref_extent(i->get_paddr());
1559 } else {
1560 ERRORT("{}", t, *i);
1561 ceph_abort("not possible");
1562 }
1563 }
1564
1565 auto existing_stats = t.get_existing_block_stats();
1566 DEBUGT("total existing blocks num: {}, exist clean num: {}, "
1567 "exist mutation pending num: {}",
1568 t,
1569 existing_stats.valid_num,
1570 existing_stats.clean_num,
1571 existing_stats.mutated_num);
1572 for (auto &i: t.existing_block_list) {
1573 if (i->is_valid()) {
1574 if (i->is_exist_clean()) {
1575 i->state = CachedExtent::extent_state_t::CLEAN;
1576 } else {
1577 assert(i->state == CachedExtent::extent_state_t::DIRTY);
1578 }
1579 DEBUGT("backref_list new existing {} len {}",
1580 t,
1581 i->get_paddr(),
1582 i->get_length());
1583 backref_list.emplace_back(
1584 std::make_unique<backref_entry_t>(
1585 i->get_paddr(),
1586 i->cast<LogicalCachedExtent>()->get_laddr(),
1587 i->get_length(),
1588 i->get_type(),
1589 start_seq));
1590 const auto t_src = t.get_src();
1591 add_extent(i, &t_src);
1592 }
1593 }
1594 if (!backref_list.empty()) {
1595 backref_batch_update(std::move(backref_list), start_seq);
1596 }
1597
1598 for (auto &i: t.pre_alloc_list) {
1599 if (!i->is_valid()) {
1600 epm.mark_space_free(i->get_paddr(), i->get_length());
1601 }
1602 }
1603 }
1604
1605 void Cache::init()
1606 {
1607 LOG_PREFIX(Cache::init);
1608 if (root) {
1609 // initial creation will do mkfs followed by mount, each of which calls init
1610 DEBUG("remove extent -- prv_root={}", *root);
1611 remove_extent(root);
1612 root = nullptr;
1613 }
1614 root = new RootBlock();
1615 root->init(CachedExtent::extent_state_t::CLEAN,
1616 P_ADDR_ROOT,
1617 PLACEMENT_HINT_NULL,
1618 NULL_GENERATION,
1619 TRANS_ID_NULL);
1620 INFO("init root -- {}", *root);
1621 extents.insert(*root);
1622 }
1623
1624 Cache::mkfs_iertr::future<> Cache::mkfs(Transaction &t)
1625 {
1626 LOG_PREFIX(Cache::mkfs);
1627 INFOT("create root", t);
1628 return get_root(t).si_then([this, &t](auto croot) {
1629 duplicate_for_write(t, croot);
1630 return mkfs_iertr::now();
1631 }).handle_error_interruptible(
1632 mkfs_iertr::pass_further{},
1633 crimson::ct_error::assert_all{
1634 "Invalid error in Cache::mkfs"
1635 }
1636 );
1637 }
1638
1639 Cache::close_ertr::future<> Cache::close()
1640 {
1641 LOG_PREFIX(Cache::close);
1642 INFO("close with {}({}B) dirty, dirty_from={}, alloc_from={}, "
1643 "{}({}B) lru, totally {}({}B) indexed extents",
1644 dirty.size(),
1645 stats.dirty_bytes,
1646 get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
1647 get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL),
1648 lru.get_current_contents_extents(),
1649 lru.get_current_contents_bytes(),
1650 extents.size(),
1651 extents.get_bytes());
1652 root.reset();
1653 for (auto i = dirty.begin(); i != dirty.end(); ) {
1654 auto ptr = &*i;
1655 stats.dirty_bytes -= ptr->get_length();
1656 dirty.erase(i++);
1657 intrusive_ptr_release(ptr);
1658 }
1659 backref_extents.clear();
1660 backref_entryrefs_by_seq.clear();
1661 assert(stats.dirty_bytes == 0);
1662 lru.clear();
1663 return close_ertr::now();
1664 }
1665
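/*
 * replay_delta applies a single delta during journal replay.  Deltas whose
 * segment has since been reused, JOURNAL_TAIL deltas, and deltas older than
 * the relevant tail (alloc_tail for ALLOC_INFO, dirty_tail otherwise) are
 * skipped.  ALLOC_INFO deltas are replayed into the backref entries, ROOT
 * deltas are applied to the cached root, and other deltas are applied to
 * the target extent if it is still present (an absent extent means the
 * delta is obsolete).
 */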
1666 Cache::replay_delta_ret
1667 Cache::replay_delta(
1668 journal_seq_t journal_seq,
1669 paddr_t record_base,
1670 const delta_info_t &delta,
1671 const journal_seq_t &dirty_tail,
1672 const journal_seq_t &alloc_tail,
1673 sea_time_point modify_time)
1674 {
1675 LOG_PREFIX(Cache::replay_delta);
1676 assert(dirty_tail != JOURNAL_SEQ_NULL);
1677 assert(alloc_tail != JOURNAL_SEQ_NULL);
1678 ceph_assert(modify_time != NULL_TIME);
1679
1680 // FIXME: This is specific to the segmented implementation
1681 /* The journal may validly contain deltas for extents in
1682 * since-released segments. We can detect those cases by
1683 * checking whether the segment in question currently has a
1684 * sequence number > the current journal segment seq. We can
1685 * safely skip these deltas because the extent must already
1686 * have been rewritten.
1687 */
1688 if (delta.paddr != P_ADDR_NULL &&
1689 delta.paddr.get_addr_type() == paddr_types_t::SEGMENT) {
1690 auto& seg_addr = delta.paddr.as_seg_paddr();
1691 auto seg_info = get_segment_info(seg_addr.get_segment_id());
1692 if (seg_info) {
1693 auto delta_paddr_segment_seq = seg_info->seq;
1694 auto delta_paddr_segment_type = seg_info->type;
1695 if (delta_paddr_segment_seq != delta.ext_seq ||
1696 delta_paddr_segment_type != delta.seg_type) {
1697 DEBUG("delta is obsolete, delta_paddr_segment_seq={},"
1698 " delta_paddr_segment_type={} -- {}",
1699 segment_seq_printer_t{delta_paddr_segment_seq},
1700 delta_paddr_segment_type,
1701 delta);
1702 return replay_delta_ertr::make_ready_future<bool>(false);
1703 }
1704 }
1705 }
1706
1707 if (delta.type == extent_types_t::JOURNAL_TAIL) {
1708 // this delta should have been dealt with during segment cleaner mounting
1709 return replay_delta_ertr::make_ready_future<bool>(false);
1710 }
1711
1712 // replay alloc
1713 if (delta.type == extent_types_t::ALLOC_INFO) {
1714 if (journal_seq < alloc_tail) {
1715 DEBUG("journal_seq {} < alloc_tail {}, don't replay {}",
1716 journal_seq, alloc_tail, delta);
1717 return replay_delta_ertr::make_ready_future<bool>(false);
1718 }
1719
1720 alloc_delta_t alloc_delta;
1721 decode(alloc_delta, delta.bl);
1722 std::vector<backref_entry_ref> backref_list;
1723 for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) {
1724 if (alloc_blk.paddr.is_relative()) {
1725 assert(alloc_blk.paddr.is_record_relative());
1726 alloc_blk.paddr = record_base.add_relative(alloc_blk.paddr);
1727 }
1728 DEBUG("replay alloc_blk {}~{} {}, journal_seq: {}",
1729 alloc_blk.paddr, alloc_blk.len, alloc_blk.laddr, journal_seq);
1730 backref_list.emplace_back(
1731 std::make_unique<backref_entry_t>(
1732 alloc_blk.paddr,
1733 alloc_blk.laddr,
1734 alloc_blk.len,
1735 alloc_blk.type,
1736 journal_seq));
1737 }
1738 if (!backref_list.empty()) {
1739 backref_batch_update(std::move(backref_list), journal_seq);
1740 }
1741 return replay_delta_ertr::make_ready_future<bool>(true);
1742 }
1743
1744 // replay dirty
1745 if (journal_seq < dirty_tail) {
1746 DEBUG("journal_seq {} < dirty_tail {}, don't replay {}",
1747 journal_seq, dirty_tail, delta);
1748 return replay_delta_ertr::make_ready_future<bool>(false);
1749 }
1750
1751 if (delta.type == extent_types_t::ROOT) {
1752 TRACE("replay root delta at {} {}, remove extent ... -- {}, prv_root={}",
1753 journal_seq, record_base, delta, *root);
1754 remove_extent(root);
1755 root->apply_delta_and_adjust_crc(record_base, delta.bl);
1756 root->dirty_from_or_retired_at = journal_seq;
1757 root->state = CachedExtent::extent_state_t::DIRTY;
1758 DEBUG("replayed root delta at {} {}, add extent -- {}, root={}",
1759 journal_seq, record_base, delta, *root);
1760 root->set_modify_time(modify_time);
1761 add_extent(root);
1762 return replay_delta_ertr::make_ready_future<bool>(true);
1763 } else {
1764 auto _get_extent_if_cached = [this](paddr_t addr)
1765 -> get_extent_ertr::future<CachedExtentRef> {
1766 // replay is not counted in the cache hit metrics
1767 auto ret = query_cache(addr, nullptr);
1768 if (ret) {
1769 // no retired placeholder should exist yet because no transaction
1770 // has been created.
1771 assert(ret->get_type() != extent_types_t::RETIRED_PLACEHOLDER);
1772 return ret->wait_io().then([ret] {
1773 return ret;
1774 });
1775 } else {
1776 return seastar::make_ready_future<CachedExtentRef>();
1777 }
1778 };
1779 auto extent_fut = (delta.pversion == 0 ?
1780 // replay is not counted in the cache hit metrics
1781 _get_extent_by_type(
1782 delta.type,
1783 delta.paddr,
1784 delta.laddr,
1785 delta.length,
1786 nullptr,
1787 [](CachedExtent &) {},
1788 [](CachedExtent &) {}) :
1789 _get_extent_if_cached(
1790 delta.paddr)
1791 ).handle_error(
1792 replay_delta_ertr::pass_further{},
1793 crimson::ct_error::assert_all{
1794 "Invalid error in Cache::replay_delta"
1795 }
1796 );
1797 return extent_fut.safe_then([=, this, &delta](auto extent) {
1798 if (!extent) {
1799 DEBUG("replay extent is not present, so delta is obsolete at {} {} -- {}",
1800 journal_seq, record_base, delta);
1801 assert(delta.pversion > 0);
1802 return replay_delta_ertr::make_ready_future<bool>(true);
1803 }
1804
1805 DEBUG("replay extent delta at {} {} ... -- {}, prv_extent={}",
1806 journal_seq, record_base, delta, *extent);
1807
1808 assert(extent->last_committed_crc == delta.prev_crc);
1809 assert(extent->version == delta.pversion);
1810 extent->apply_delta_and_adjust_crc(record_base, delta.bl);
1811 extent->set_modify_time(modify_time);
1812 assert(extent->last_committed_crc == delta.final_crc);
1813
1814 extent->version++;
1815 if (extent->version == 1) {
1816 extent->dirty_from_or_retired_at = journal_seq;
1817 DEBUG("replayed extent delta at {} {}, become dirty -- {}, extent={}" ,
1818 journal_seq, record_base, delta, *extent);
1819 } else {
1820 DEBUG("replayed extent delta at {} {} -- {}, extent={}" ,
1821 journal_seq, record_base, delta, *extent);
1822 }
1823 mark_dirty(extent);
1824 return replay_delta_ertr::make_ready_future<bool>(true);
1825 });
1826 }
1827 }
1828
1829 Cache::get_next_dirty_extents_ret Cache::get_next_dirty_extents(
1830 Transaction &t,
1831 journal_seq_t seq,
1832 size_t max_bytes)
1833 {
1834 LOG_PREFIX(Cache::get_next_dirty_extents);
1835 if (dirty.empty()) {
1836 DEBUGT("max_bytes={}B, seq={}, dirty is empty",
1837 t, max_bytes, seq);
1838 } else {
1839 DEBUGT("max_bytes={}B, seq={}, dirty_from={}",
1840 t, max_bytes, seq, dirty.begin()->get_dirty_from());
1841 }
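// first pass: gather extents ordered by dirty_from that became dirty before
// seq, stopping once max_bytes worth of extent length has been collected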
1842 std::vector<CachedExtentRef> cand;
1843 size_t bytes_so_far = 0;
1844 for (auto i = dirty.begin();
1845 i != dirty.end() && bytes_so_far < max_bytes;
1846 ++i) {
1847 auto dirty_from = i->get_dirty_from();
1848 // dirty extents must be fully loaded
1849 assert(i->is_fully_loaded());
1850 if (unlikely(dirty_from == JOURNAL_SEQ_NULL)) {
1851 ERRORT("got dirty extent with JOURNAL_SEQ_NULL -- {}", t, *i);
1852 ceph_abort();
1853 }
1854 if (dirty_from < seq) {
1855 TRACET("next extent -- {}", t, *i);
1856 if (!cand.empty() && cand.back()->get_dirty_from() > dirty_from) {
1857 ERRORT("dirty extents are not ordered by dirty_from -- last={}, next={}",
1858 t, *cand.back(), *i);
1859 ceph_abort();
1860 }
1861 bytes_so_far += i->get_length();
1862 cand.push_back(&*i);
1863 } else {
1864 break;
1865 }
1866 }
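// second pass: wait for in-flight IO on each candidate, then resolve it
// against the transaction -- invalidated extents mark t conflicted, extents
// already on t are returned in their transactional form, and extents
// retired on t are dropped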
1867 return seastar::do_with(
1868 std::move(cand),
1869 decltype(cand)(),
1870 [FNAME, this, &t](auto &cand, auto &ret) {
1871 return trans_intr::do_for_each(
1872 cand,
1873 [FNAME, this, &t, &ret](auto &ext) {
1874 TRACET("waiting on extent -- {}", t, *ext);
1875 return trans_intr::make_interruptible(
1876 ext->wait_io()
1877 ).then_interruptible([FNAME, this, ext, &t, &ret] {
1878 if (!ext->is_valid()) {
1879 ++(get_by_src(stats.trans_conflicts_by_unknown, t.get_src()));
1880 mark_transaction_conflicted(t, *ext);
1881 return;
1882 }
1883
1884 CachedExtentRef on_transaction;
1885 auto result = t.get_extent(ext->get_paddr(), &on_transaction);
1886 if (result == Transaction::get_extent_ret::ABSENT) {
1887 DEBUGT("extent is absent on t -- {}", t, *ext);
1888 t.add_to_read_set(ext);
1889 if (ext->get_type() == extent_types_t::ROOT) {
1890 if (t.root) {
1891 assert(&*t.root == &*ext);
1892 ceph_assert(0 == "t.root would have to already be in the read set");
1893 } else {
1894 assert(&*ext == &*root);
1895 t.root = root;
1896 }
1897 }
1898 ret.push_back(ext);
1899 } else if (result == Transaction::get_extent_ret::PRESENT) {
1900 DEBUGT("extent is present on t -- {}, on t {}", t, *ext, *on_transaction);
1901 ret.push_back(on_transaction);
1902 } else {
1903 assert(result == Transaction::get_extent_ret::RETIRED);
1904 DEBUGT("extent is retired on t -- {}", t, *ext);
1905 }
1906 });
1907 }).then_interruptible([&ret] {
1908 return std::move(ret);
1909 });
1910 });
1911 }
1912
1913 Cache::get_root_ret Cache::get_root(Transaction &t)
1914 {
1915 LOG_PREFIX(Cache::get_root);
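// reuse the transaction's root if it already holds one; otherwise pin the
// cache's root into t's read set -- either way, return it once any pending
// IO on it has completed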
1916 if (t.root) {
1917 TRACET("root already on t -- {}", t, *t.root);
1918 return t.root->wait_io().then([&t] {
1919 return get_root_iertr::make_ready_future<RootBlockRef>(
1920 t.root);
1921 });
1922 } else {
1923 DEBUGT("root not on t -- {}", t, *root);
1924 t.root = root;
1925 t.add_to_read_set(root);
1926 return root->wait_io().then([root=root] {
1927 return get_root_iertr::make_ready_future<RootBlockRef>(
1928 root);
1929 });
1930 }
1931 }
1932
1933 Cache::get_extent_ertr::future<CachedExtentRef> Cache::_get_extent_by_type(
1934 extent_types_t type,
1935 paddr_t offset,
1936 laddr_t laddr,
1937 extent_len_t length,
1938 const Transaction::src_t* p_src,
1939 extent_init_func_t &&extent_init_func,
1940 extent_init_func_t &&on_cache)
1941 {
1942 return [=, this, extent_init_func=std::move(extent_init_func)]() mutable {
1943 src_ext_t* p_metric_key = nullptr;
1944 src_ext_t metric_key;
1945 if (p_src) {
1946 metric_key = std::make_pair(*p_src, type);
1947 p_metric_key = &metric_key;
1948 }
1949
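// dispatch by extent type to the typed get_extent<>() and convert the result
// to an untyped CachedExtentRef; detach() hands over the reference, so the
// new ref is constructed with add_ref = false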
1950 switch (type) {
1951 case extent_types_t::ROOT:
1952 ceph_assert(0 == "ROOT is never directly read");
1953 return get_extent_ertr::make_ready_future<CachedExtentRef>();
1954 case extent_types_t::BACKREF_INTERNAL:
1955 return get_extent<backref::BackrefInternalNode>(
1956 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
1957 ).safe_then([](auto extent) {
1958 return CachedExtentRef(extent.detach(), false /* add_ref */);
1959 });
1960 case extent_types_t::BACKREF_LEAF:
1961 return get_extent<backref::BackrefLeafNode>(
1962 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
1963 ).safe_then([](auto extent) {
1964 return CachedExtentRef(extent.detach(), false /* add_ref */);
1965 });
1966 case extent_types_t::LADDR_INTERNAL:
1967 return get_extent<lba_manager::btree::LBAInternalNode>(
1968 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
1969 ).safe_then([](auto extent) {
1970 return CachedExtentRef(extent.detach(), false /* add_ref */);
1971 });
1972 case extent_types_t::LADDR_LEAF:
1973 return get_extent<lba_manager::btree::LBALeafNode>(
1974 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
1975 ).safe_then([](auto extent) {
1976 return CachedExtentRef(extent.detach(), false /* add_ref */);
1977 });
1978 case extent_types_t::OMAP_INNER:
1979 return get_extent<omap_manager::OMapInnerNode>(
1980 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
1981 ).safe_then([](auto extent) {
1982 return CachedExtentRef(extent.detach(), false /* add_ref */);
1983 });
1984 case extent_types_t::OMAP_LEAF:
1985 return get_extent<omap_manager::OMapLeafNode>(
1986 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
1987 ).safe_then([](auto extent) {
1988 return CachedExtentRef(extent.detach(), false /* add_ref */);
1989 });
1990 case extent_types_t::COLL_BLOCK:
1991 return get_extent<collection_manager::CollectionNode>(
1992 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
1993 ).safe_then([](auto extent) {
1994 return CachedExtentRef(extent.detach(), false /* add_ref */);
1995 });
1996 case extent_types_t::ONODE_BLOCK_STAGED:
1997 return get_extent<onode::SeastoreNodeExtent>(
1998 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
1999 ).safe_then([](auto extent) {
2000 return CachedExtentRef(extent.detach(), false /* add_ref */);
2001 });
2002 case extent_types_t::OBJECT_DATA_BLOCK:
2003 return get_extent<ObjectDataBlock>(
2004 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
2005 ).safe_then([](auto extent) {
2006 return CachedExtentRef(extent.detach(), false /* add_ref */);
2007 });
2008 case extent_types_t::RETIRED_PLACEHOLDER:
2009 ceph_assert(0 == "impossible");
2010 return get_extent_ertr::make_ready_future<CachedExtentRef>();
2011 case extent_types_t::TEST_BLOCK:
2012 return get_extent<TestBlock>(
2013 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
2014 ).safe_then([](auto extent) {
2015 return CachedExtentRef(extent.detach(), false /* add_ref */);
2016 });
2017 case extent_types_t::TEST_BLOCK_PHYSICAL:
2018 return get_extent<TestBlockPhysical>(
2019 offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
2020 ).safe_then([](auto extent) {
2021 return CachedExtentRef(extent.detach(), false /* add_ref */);
2022 });
2023 case extent_types_t::NONE: {
2024 ceph_assert(0 == "NONE is an invalid extent type");
2025 return get_extent_ertr::make_ready_future<CachedExtentRef>();
2026 }
2027 default:
2028 ceph_assert(0 == "impossible");
2029 return get_extent_ertr::make_ready_future<CachedExtentRef>();
2030 }
2031 }().safe_then([laddr](CachedExtentRef e) {
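// a logical extent is expected iff a valid laddr was supplied; attach the
// laddr to the loaded extent before returning it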
2032 assert(e->is_logical() == (laddr != L_ADDR_NULL));
2033 if (e->is_logical()) {
2034 e->cast<LogicalCachedExtent>()->set_laddr(laddr);
2035 }
2036 return get_extent_ertr::make_ready_future<CachedExtentRef>(e);
2037 });
2038 }
2039
2040 }