1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "crimson/os/seastore/cache.h"
9 #include <seastar/core/metrics.hh>
11 #include "crimson/os/seastore/logging.h"
12 #include "crimson/common/config_proxy.h"
13 #include "crimson/os/seastore/async_cleaner.h"
15 // included for get_extent_by_type
16 #include "crimson/os/seastore/collection_manager/collection_flat_node.h"
17 #include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
18 #include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h"
19 #include "crimson/os/seastore/object_data_handler.h"
20 #include "crimson/os/seastore/collection_manager/collection_flat_node.h"
21 #include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h"
22 #include "crimson/os/seastore/backref/backref_tree_node.h"
23 #include "test/crimson/seastore/test_block.h"
25 using std::string_view
;
27 SET_SUBSYS(seastore_cache
);
29 namespace crimson::os::seastore
{
31 std::ostream
&operator<<(std::ostream
&out
, const backref_entry_t
&ent
) {
32 return out
<< "backref_entry_t{"
33 << ent
.paddr
<< "~" << ent
.len
<< ", "
34 << "laddr: " << ent
.laddr
<< ", "
35 << "type: " << ent
.type
<< ", "
36 << "seq: " << ent
.seq
<< ", "
41 ExtentPlacementManager
&epm
)
43 lru(crimson::common::get_conf
<Option::size_t>(
44 "seastore_cache_lru_size"))
46 LOG_PREFIX(Cache::Cache
);
47 INFO("created, lru_size={}", lru
.get_capacity());
49 segment_providers_by_device_id
.resize(DEVICE_ID_MAX
, nullptr);
54 LOG_PREFIX(Cache::~Cache
);
55 for (auto &i
: extents
) {
56 ERROR("extent is still alive -- {}", i
);
58 ceph_assert(extents
.empty());
61 Cache::retire_extent_ret
Cache::retire_extent_addr(
62 Transaction
&t
, paddr_t addr
, extent_len_t length
)
64 LOG_PREFIX(Cache::retire_extent_addr
);
65 TRACET("retire {}~{}", t
, addr
, length
);
67 assert(addr
.is_real() && !addr
.is_block_relative());
70 auto result
= t
.get_extent(addr
, &ext
);
71 if (result
== Transaction::get_extent_ret::PRESENT
) {
72 DEBUGT("retire {}~{} on t -- {}", t
, addr
, length
, *ext
);
73 t
.add_to_retired_set(CachedExtentRef(&*ext
));
74 return retire_extent_iertr::now();
75 } else if (result
== Transaction::get_extent_ret::RETIRED
) {
76 ERRORT("retire {}~{} failed, already retired -- {}", t
, addr
, length
, *ext
);
80 // any relative addr must have been on the transaction
81 assert(!addr
.is_relative());
83 // absent from transaction
84 // retiring is not included by the cache hit metrics
85 ext
= query_cache(addr
, nullptr);
87 DEBUGT("retire {}~{} in cache -- {}", t
, addr
, length
, *ext
);
89 // add a new placeholder to Cache
90 ext
= CachedExtent::make_cached_extent_ref
<
91 RetiredExtentPlaceholder
>(length
);
92 ext
->init(CachedExtent::extent_state_t::CLEAN
,
97 DEBUGT("retire {}~{} as placeholder, add extent -- {}",
98 t
, addr
, length
, *ext
);
99 const auto t_src
= t
.get_src();
100 add_extent(ext
, &t_src
);
102 t
.add_to_read_set(ext
);
103 t
.add_to_retired_set(ext
);
104 return retire_extent_iertr::now();
107 void Cache::dump_contents()
109 LOG_PREFIX(Cache::dump_contents
);
111 for (auto &&i
: extents
) {
117 void Cache::register_metrics()
119 LOG_PREFIX(Cache::register_metrics
);
124 namespace sm
= seastar::metrics
;
125 using src_t
= Transaction::src_t
;
127 std::map
<src_t
, sm::label_instance
> labels_by_src
{
128 {src_t::MUTATE
, sm::label_instance("src", "MUTATE")},
129 {src_t::READ
, sm::label_instance("src", "READ")},
130 {src_t::TRIM_DIRTY
, sm::label_instance("src", "TRIM_DIRTY")},
131 {src_t::TRIM_ALLOC
, sm::label_instance("src", "TRIM_ALLOC")},
132 {src_t::CLEANER_MAIN
, sm::label_instance("src", "CLEANER_MAIN")},
133 {src_t::CLEANER_COLD
, sm::label_instance("src", "CLEANER_COLD")},
135 assert(labels_by_src
.size() == (std::size_t)src_t::MAX
);
137 std::map
<extent_types_t
, sm::label_instance
> labels_by_ext
{
138 {extent_types_t::ROOT
, sm::label_instance("ext", "ROOT")},
139 {extent_types_t::LADDR_INTERNAL
, sm::label_instance("ext", "LADDR_INTERNAL")},
140 {extent_types_t::LADDR_LEAF
, sm::label_instance("ext", "LADDR_LEAF")},
141 {extent_types_t::DINK_LADDR_LEAF
, sm::label_instance("ext", "DINK_LADDR_LEAF")},
142 {extent_types_t::OMAP_INNER
, sm::label_instance("ext", "OMAP_INNER")},
143 {extent_types_t::OMAP_LEAF
, sm::label_instance("ext", "OMAP_LEAF")},
144 {extent_types_t::ONODE_BLOCK_STAGED
, sm::label_instance("ext", "ONODE_BLOCK_STAGED")},
145 {extent_types_t::COLL_BLOCK
, sm::label_instance("ext", "COLL_BLOCK")},
146 {extent_types_t::OBJECT_DATA_BLOCK
, sm::label_instance("ext", "OBJECT_DATA_BLOCK")},
147 {extent_types_t::RETIRED_PLACEHOLDER
, sm::label_instance("ext", "RETIRED_PLACEHOLDER")},
148 {extent_types_t::ALLOC_INFO
, sm::label_instance("ext", "ALLOC_INFO")},
149 {extent_types_t::JOURNAL_TAIL
, sm::label_instance("ext", "JOURNAL_TAIL")},
150 {extent_types_t::TEST_BLOCK
, sm::label_instance("ext", "TEST_BLOCK")},
151 {extent_types_t::TEST_BLOCK_PHYSICAL
, sm::label_instance("ext", "TEST_BLOCK_PHYSICAL")},
152 {extent_types_t::BACKREF_INTERNAL
, sm::label_instance("ext", "BACKREF_INTERNAL")},
153 {extent_types_t::BACKREF_LEAF
, sm::label_instance("ext", "BACKREF_LEAF")}
155 assert(labels_by_ext
.size() == (std::size_t)extent_types_t::NONE
);
160 for (auto& [src
, src_label
] : labels_by_src
) {
166 get_by_src(stats
.trans_created_by_src
, src
),
167 sm::description("total number of transaction created"),
175 * cache_query: cache_access and cache_hit
177 for (auto& [src
, src_label
] : labels_by_src
) {
183 get_by_src(stats
.cache_query_by_src
, src
).access
,
184 sm::description("total number of cache accesses"),
189 get_by_src(stats
.cache_query_by_src
, src
).hit
,
190 sm::description("total number of cache hits"),
199 * efforts discarded/committed
201 auto effort_label
= sm::label("effort");
203 // invalidated efforts
204 using namespace std::literals::string_view_literals
;
205 const string_view invalidated_effort_names
[] = {
210 "FRESH_OOL_WRITTEN"sv
,
212 for (auto& [src
, src_label
] : labels_by_src
) {
213 auto& efforts
= get_by_src(stats
.invalidated_efforts_by_src
, src
);
214 for (auto& [ext
, ext_label
] : labels_by_ext
) {
215 auto& counter
= get_by_ext(efforts
.num_trans_invalidated
, ext
);
220 "trans_invalidated_by_extent",
222 sm::description("total number of transactions invalidated by extents"),
223 {src_label
, ext_label
}
229 if (src
== src_t::READ
) {
230 // read transaction won't have non-read efforts
231 auto read_effort_label
= effort_label("READ");
236 "invalidated_extents",
238 sm::description("extents of invalidated transactions"),
239 {src_label
, read_effort_label
}
242 "invalidated_extent_bytes",
244 sm::description("extent bytes of invalidated transactions"),
245 {src_label
, read_effort_label
}
252 // non READ invalidated efforts
253 for (auto& effort_name
: invalidated_effort_names
) {
254 auto& effort
= [&effort_name
, &efforts
]() -> io_stat_t
& {
255 if (effort_name
== "READ") {
257 } else if (effort_name
== "MUTATE") {
258 return efforts
.mutate
;
259 } else if (effort_name
== "RETIRE") {
260 return efforts
.retire
;
261 } else if (effort_name
== "FRESH") {
262 return efforts
.fresh
;
264 assert(effort_name
== "FRESH_OOL_WRITTEN");
265 return efforts
.fresh_ool_written
;
272 "invalidated_extents",
274 sm::description("extents of invalidated transactions"),
275 {src_label
, effort_label(effort_name
)}
278 "invalidated_extent_bytes",
280 sm::description("extent bytes of invalidated transactions"),
281 {src_label
, effort_label(effort_name
)}
292 efforts
.total_trans_invalidated
,
293 sm::description("total number of transactions invalidated"),
297 "invalidated_delta_bytes",
298 efforts
.mutate_delta_bytes
,
299 sm::description("delta bytes of invalidated transactions"),
303 "invalidated_ool_records",
304 efforts
.num_ool_records
,
305 sm::description("number of ool-records from invalidated transactions"),
309 "invalidated_ool_record_bytes",
310 efforts
.ool_record_bytes
,
311 sm::description("bytes of ool-record from invalidated transactions"),
319 const string_view committed_effort_names
[] = {
327 for (auto& [src
, src_label
] : labels_by_src
) {
328 if (src
== src_t::READ
) {
329 // READ transaction won't commit
332 auto& efforts
= get_by_src(stats
.committed_efforts_by_src
, src
);
339 sm::description("total number of transaction committed"),
343 "committed_ool_records",
344 efforts
.num_ool_records
,
345 sm::description("number of ool-records from committed transactions"),
349 "committed_ool_record_metadata_bytes",
350 efforts
.ool_record_metadata_bytes
,
351 sm::description("bytes of ool-record metadata from committed transactions"),
355 "committed_ool_record_data_bytes",
356 efforts
.ool_record_data_bytes
,
357 sm::description("bytes of ool-record data from committed transactions"),
361 "committed_inline_record_metadata_bytes",
362 efforts
.inline_record_metadata_bytes
,
363 sm::description("bytes of inline-record metadata from committed transactions"
364 "(excludes delta buffer)"),
369 for (auto& effort_name
: committed_effort_names
) {
370 auto& effort_by_ext
= [&efforts
, &effort_name
]()
371 -> counter_by_extent_t
<io_stat_t
>& {
372 if (effort_name
== "READ") {
373 return efforts
.read_by_ext
;
374 } else if (effort_name
== "MUTATE") {
375 return efforts
.mutate_by_ext
;
376 } else if (effort_name
== "RETIRE") {
377 return efforts
.retire_by_ext
;
378 } else if (effort_name
== "FRESH_INVALID") {
379 return efforts
.fresh_invalid_by_ext
;
380 } else if (effort_name
== "FRESH_INLINE") {
381 return efforts
.fresh_inline_by_ext
;
383 assert(effort_name
== "FRESH_OOL");
384 return efforts
.fresh_ool_by_ext
;
387 for (auto& [ext
, ext_label
] : labels_by_ext
) {
388 auto& effort
= get_by_ext(effort_by_ext
, ext
);
395 sm::description("extents of committed transactions"),
396 {src_label
, effort_label(effort_name
), ext_label
}
399 "committed_extent_bytes",
401 sm::description("extent bytes of committed transactions"),
402 {src_label
, effort_label(effort_name
), ext_label
}
409 auto& delta_by_ext
= efforts
.delta_bytes_by_ext
;
410 for (auto& [ext
, ext_label
] : labels_by_ext
) {
411 auto& value
= get_by_ext(delta_by_ext
, ext
);
416 "committed_delta_bytes",
418 sm::description("delta bytes of committed transactions"),
419 {src_label
, ext_label
}
426 // successful read efforts
431 "trans_read_successful",
432 stats
.success_read_efforts
.num_trans
,
433 sm::description("total number of successful read transactions")
436 "successful_read_extents",
437 stats
.success_read_efforts
.read
.num
,
438 sm::description("extents of successful read transactions")
441 "successful_read_extent_bytes",
442 stats
.success_read_efforts
.read
.bytes
,
443 sm::description("extent bytes of successful read transactions")
450 * Cached extents (including placeholders)
460 return extents
.size();
462 sm::description("total number of cached extents")
465 "cached_extent_bytes",
467 return extents
.get_bytes();
469 sm::description("total bytes of cached extents")
476 sm::description("total number of dirty extents")
479 "dirty_extent_bytes",
481 sm::description("total bytes of dirty extents")
484 "cache_lru_size_bytes",
486 return lru
.get_current_contents_bytes();
488 sm::description("total bytes pinned by the lru")
491 "cache_lru_size_extents",
493 return lru
.get_current_contents_extents();
495 sm::description("total extents pinned by the lru")
503 auto tree_label
= sm::label("tree");
504 auto onode_label
= tree_label("ONODE");
505 auto omap_label
= tree_label("OMAP");
506 auto lba_label
= tree_label("LBA");
507 auto backref_label
= tree_label("BACKREF");
508 auto register_tree_metrics
= [&labels_by_src
, &onode_label
, &omap_label
, this](
509 const sm::label_instance
& tree_label
,
510 uint64_t& tree_depth
,
511 int64_t& tree_extents_num
,
512 counter_by_src_t
<tree_efforts_t
>& committed_tree_efforts
,
513 counter_by_src_t
<tree_efforts_t
>& invalidated_tree_efforts
) {
520 sm::description("the depth of tree"),
526 sm::description("num of extents of the tree"),
531 for (auto& [src
, src_label
] : labels_by_src
) {
532 if (src
== src_t::READ
) {
533 // READ transaction won't contain any tree inserts and erases
536 if (is_background_transaction(src
) &&
537 (tree_label
== onode_label
||
538 tree_label
== omap_label
)) {
539 // CLEANER transaction won't contain any onode/omap tree operations
542 auto& committed_efforts
= get_by_src(committed_tree_efforts
, src
);
543 auto& invalidated_efforts
= get_by_src(invalidated_tree_efforts
, src
);
548 "tree_inserts_committed",
549 committed_efforts
.num_inserts
,
550 sm::description("total number of committed insert operations"),
551 {tree_label
, src_label
}
554 "tree_erases_committed",
555 committed_efforts
.num_erases
,
556 sm::description("total number of committed erase operations"),
557 {tree_label
, src_label
}
560 "tree_updates_committed",
561 committed_efforts
.num_updates
,
562 sm::description("total number of committed update operations"),
563 {tree_label
, src_label
}
566 "tree_inserts_invalidated",
567 invalidated_efforts
.num_inserts
,
568 sm::description("total number of invalidated insert operations"),
569 {tree_label
, src_label
}
572 "tree_erases_invalidated",
573 invalidated_efforts
.num_erases
,
574 sm::description("total number of invalidated erase operations"),
575 {tree_label
, src_label
}
578 "tree_updates_invalidated",
579 invalidated_efforts
.num_updates
,
580 sm::description("total number of invalidated update operations"),
581 {tree_label
, src_label
}
587 register_tree_metrics(
589 stats
.onode_tree_depth
,
590 stats
.onode_tree_extents_num
,
591 stats
.committed_onode_tree_efforts
,
592 stats
.invalidated_onode_tree_efforts
);
593 register_tree_metrics(
595 stats
.omap_tree_depth
,
596 stats
.omap_tree_extents_num
,
597 stats
.committed_omap_tree_efforts
,
598 stats
.invalidated_omap_tree_efforts
);
599 register_tree_metrics(
601 stats
.lba_tree_depth
,
602 stats
.lba_tree_extents_num
,
603 stats
.committed_lba_tree_efforts
,
604 stats
.invalidated_lba_tree_efforts
);
605 register_tree_metrics(
607 stats
.backref_tree_depth
,
608 stats
.backref_tree_extents_num
,
609 stats
.committed_backref_tree_efforts
,
610 stats
.invalidated_backref_tree_efforts
);
613 * conflict combinations
615 auto srcs_label
= sm::label("srcs");
616 auto num_srcs
= static_cast<std::size_t>(Transaction::src_t::MAX
);
617 std::size_t srcs_index
= 0;
618 for (uint8_t src2_int
= 0; src2_int
< num_srcs
; ++src2_int
) {
619 auto src2
= static_cast<Transaction::src_t
>(src2_int
);
620 for (uint8_t src1_int
= src2_int
; src1_int
< num_srcs
; ++src1_int
) {
622 auto src1
= static_cast<Transaction::src_t
>(src1_int
);
623 // impossible combinations
624 // should be consistent with checks in account_conflict()
625 if ((src1
== Transaction::src_t::READ
&&
626 src2
== Transaction::src_t::READ
) ||
627 (src1
== Transaction::src_t::TRIM_DIRTY
&&
628 src2
== Transaction::src_t::TRIM_DIRTY
) ||
629 (src1
== Transaction::src_t::CLEANER_MAIN
&&
630 src2
== Transaction::src_t::CLEANER_MAIN
) ||
631 (src1
== Transaction::src_t::CLEANER_COLD
&&
632 src2
== Transaction::src_t::CLEANER_COLD
) ||
633 (src1
== Transaction::src_t::TRIM_ALLOC
&&
634 src2
== Transaction::src_t::TRIM_ALLOC
)) {
637 std::ostringstream oss
;
638 oss
<< src1
<< "," << src2
;
643 "trans_srcs_invalidated",
644 stats
.trans_conflicts_by_srcs
[srcs_index
- 1],
645 sm::description("total number conflicted transactions by src pair"),
646 {srcs_label(oss
.str())}
652 assert(srcs_index
== NUM_SRC_COMB
);
654 for (uint8_t src_int
= 0; src_int
< num_srcs
; ++src_int
) {
656 auto src
= static_cast<Transaction::src_t
>(src_int
);
657 std::ostringstream oss
;
658 oss
<< "UNKNOWN," << src
;
663 "trans_srcs_invalidated",
664 stats
.trans_conflicts_by_unknown
[srcs_index
- 1],
665 sm::description("total number conflicted transactions by src pair"),
666 {srcs_label(oss
.str())}
679 "version_count_dirty",
680 stats
.committed_dirty_version
.num
,
681 sm::description("total number of rewrite-dirty extents")
685 stats
.committed_dirty_version
.version
,
686 sm::description("sum of the version from rewrite-dirty extents")
689 "version_count_reclaim",
690 stats
.committed_reclaim_version
.num
,
691 sm::description("total number of rewrite-reclaim extents")
694 "version_sum_reclaim",
695 stats
.committed_reclaim_version
.version
,
696 sm::description("sum of the version from rewrite-reclaim extents")
702 void Cache::add_extent(
704 const Transaction::src_t
* p_src
=nullptr)
706 assert(ref
->is_valid());
707 assert(ref
->user_hint
== PLACEMENT_HINT_NULL
);
708 assert(ref
->rewrite_generation
== NULL_GENERATION
);
709 extents
.insert(*ref
);
710 if (ref
->is_dirty()) {
713 touch_extent(*ref
, p_src
);
717 void Cache::mark_dirty(CachedExtentRef ref
)
719 if (ref
->is_dirty()) {
720 assert(ref
->primary_ref_list_hook
.is_linked());
724 lru
.remove_from_lru(*ref
);
725 ref
->state
= CachedExtent::extent_state_t::DIRTY
;
729 void Cache::add_to_dirty(CachedExtentRef ref
)
731 assert(ref
->is_dirty());
732 assert(!ref
->primary_ref_list_hook
.is_linked());
733 ceph_assert(ref
->get_modify_time() != NULL_TIME
);
734 intrusive_ptr_add_ref(&*ref
);
735 dirty
.push_back(*ref
);
736 stats
.dirty_bytes
+= ref
->get_length();
739 void Cache::remove_from_dirty(CachedExtentRef ref
)
741 if (ref
->is_dirty()) {
742 ceph_assert(ref
->primary_ref_list_hook
.is_linked());
743 stats
.dirty_bytes
-= ref
->get_length();
744 dirty
.erase(dirty
.s_iterator_to(*ref
));
745 intrusive_ptr_release(&*ref
);
747 ceph_assert(!ref
->primary_ref_list_hook
.is_linked());
751 void Cache::remove_extent(CachedExtentRef ref
)
753 assert(ref
->is_valid());
754 if (ref
->is_dirty()) {
755 remove_from_dirty(ref
);
756 } else if (!ref
->is_placeholder()) {
757 lru
.remove_from_lru(*ref
);
762 void Cache::commit_retire_extent(
768 ref
->dirty_from_or_retired_at
= JOURNAL_SEQ_NULL
;
769 invalidate_extent(t
, *ref
);
772 void Cache::commit_replace_extent(
774 CachedExtentRef next
,
775 CachedExtentRef prev
)
777 assert(next
->is_dirty());
778 assert(next
->get_paddr() == prev
->get_paddr());
779 assert(next
->version
== prev
->version
+ 1);
780 extents
.replace(*next
, *prev
);
782 if (prev
->get_type() == extent_types_t::ROOT
) {
783 assert(prev
->is_stable_clean()
784 || prev
->primary_ref_list_hook
.is_linked());
785 if (prev
->is_dirty()) {
786 stats
.dirty_bytes
-= prev
->get_length();
787 dirty
.erase(dirty
.s_iterator_to(*prev
));
788 intrusive_ptr_release(&*prev
);
791 } else if (prev
->is_dirty()) {
792 assert(prev
->get_dirty_from() == next
->get_dirty_from());
793 assert(prev
->primary_ref_list_hook
.is_linked());
794 auto prev_it
= dirty
.iterator_to(*prev
);
795 dirty
.insert(prev_it
, *next
);
796 dirty
.erase(prev_it
);
797 intrusive_ptr_release(&*prev
);
798 intrusive_ptr_add_ref(&*next
);
800 lru
.remove_from_lru(*prev
);
804 next
->on_replace_prior(t
);
805 invalidate_extent(t
, *prev
);
808 void Cache::invalidate_extent(
810 CachedExtent
& extent
)
812 if (!extent
.may_conflict()) {
813 assert(extent
.transactions
.empty());
814 extent
.set_invalid(t
);
818 LOG_PREFIX(Cache::invalidate_extent
);
819 bool do_conflict_log
= true;
820 for (auto &&i
: extent
.transactions
) {
821 if (!i
.t
->conflicted
) {
822 if (do_conflict_log
) {
823 SUBDEBUGT(seastore_t
, "conflict begin -- {}", t
, extent
);
824 do_conflict_log
= false;
826 assert(!i
.t
->is_weak());
827 account_conflict(t
.get_src(), i
.t
->get_src());
828 mark_transaction_conflicted(*i
.t
, extent
);
831 extent
.set_invalid(t
);
834 void Cache::mark_transaction_conflicted(
835 Transaction
& t
, CachedExtent
& conflicting_extent
)
837 LOG_PREFIX(Cache::mark_transaction_conflicted
);
838 SUBTRACET(seastore_t
, "", t
);
839 assert(!t
.conflicted
);
842 auto& efforts
= get_by_src(stats
.invalidated_efforts_by_src
,
844 ++efforts
.total_trans_invalidated
;
846 auto& counter
= get_by_ext(efforts
.num_trans_invalidated
,
847 conflicting_extent
.get_type());
851 for (auto &i
: t
.read_set
) {
852 read_stat
.increment(i
.ref
->get_length());
854 efforts
.read
.increment_stat(read_stat
);
856 if (t
.get_src() != Transaction::src_t::READ
) {
857 io_stat_t retire_stat
;
858 for (auto &i
: t
.retired_set
) {
859 retire_stat
.increment(i
->get_length());
861 efforts
.retire
.increment_stat(retire_stat
);
863 auto& fresh_stat
= t
.get_fresh_block_stats();
864 efforts
.fresh
.increment_stat(fresh_stat
);
866 io_stat_t delta_stat
;
867 for (auto &i
: t
.mutated_block_list
) {
868 if (!i
->is_valid()) {
871 efforts
.mutate
.increment(i
->get_length());
872 delta_stat
.increment(i
->get_delta().length());
874 efforts
.mutate_delta_bytes
+= delta_stat
.bytes
;
876 for (auto &i
: t
.pre_alloc_list
) {
877 epm
.mark_space_free(i
->get_paddr(), i
->get_length());
880 auto& ool_stats
= t
.get_ool_write_stats();
881 efforts
.fresh_ool_written
.increment_stat(ool_stats
.extents
);
882 efforts
.num_ool_records
+= ool_stats
.num_records
;
883 auto ool_record_bytes
= (ool_stats
.md_bytes
+ ool_stats
.get_data_bytes());
884 efforts
.ool_record_bytes
+= ool_record_bytes
;
886 if (is_background_transaction(t
.get_src())) {
887 // CLEANER transaction won't contain any onode/omap tree operations
888 assert(t
.onode_tree_stats
.is_clear());
889 assert(t
.omap_tree_stats
.is_clear());
891 get_by_src(stats
.invalidated_onode_tree_efforts
, t
.get_src()
892 ).increment(t
.onode_tree_stats
);
893 get_by_src(stats
.invalidated_omap_tree_efforts
, t
.get_src()
894 ).increment(t
.omap_tree_stats
);
897 get_by_src(stats
.invalidated_lba_tree_efforts
, t
.get_src()
898 ).increment(t
.lba_tree_stats
);
899 get_by_src(stats
.invalidated_backref_tree_efforts
, t
.get_src()
900 ).increment(t
.backref_tree_stats
);
902 SUBDEBUGT(seastore_t
,
903 "discard {} read, {} fresh, {} delta, {} retire, {}({}B) ool-records",
909 ool_stats
.num_records
,
912 // read transaction won't have non-read efforts
913 assert(t
.retired_set
.empty());
914 assert(t
.get_fresh_block_stats().is_clear());
915 assert(t
.mutated_block_list
.empty());
916 assert(t
.get_ool_write_stats().is_clear());
917 assert(t
.onode_tree_stats
.is_clear());
918 assert(t
.omap_tree_stats
.is_clear());
919 assert(t
.lba_tree_stats
.is_clear());
920 assert(t
.backref_tree_stats
.is_clear());
921 SUBDEBUGT(seastore_t
, "discard {} read", t
, read_stat
);
925 void Cache::on_transaction_destruct(Transaction
& t
)
927 LOG_PREFIX(Cache::on_transaction_destruct
);
928 SUBTRACET(seastore_t
, "", t
);
929 if (t
.get_src() == Transaction::src_t::READ
&&
930 t
.conflicted
== false) {
932 for (auto &i
: t
.read_set
) {
933 read_stat
.increment(i
.ref
->get_length());
935 SUBDEBUGT(seastore_t
, "done {} read", t
, read_stat
);
938 // exclude weak transaction as it is impossible to conflict
939 ++stats
.success_read_efforts
.num_trans
;
940 stats
.success_read_efforts
.read
.increment_stat(read_stat
);
943 // read transaction won't have non-read efforts
944 assert(t
.retired_set
.empty());
945 assert(t
.get_fresh_block_stats().is_clear());
946 assert(t
.mutated_block_list
.empty());
947 assert(t
.onode_tree_stats
.is_clear());
948 assert(t
.omap_tree_stats
.is_clear());
949 assert(t
.lba_tree_stats
.is_clear());
950 assert(t
.backref_tree_stats
.is_clear());
954 CachedExtentRef
Cache::alloc_new_extent_by_type(
955 Transaction
&t
, ///< [in, out] current transaction
956 extent_types_t type
, ///< [in] type tag
957 extent_len_t length
, ///< [in] length
958 placement_hint_t hint
, ///< [in] user hint
959 rewrite_gen_t gen
///< [in] rewrite generation
962 LOG_PREFIX(Cache::alloc_new_extent_by_type
);
963 SUBDEBUGT(seastore_cache
, "allocate {} {}B, hint={}, gen={}",
964 t
, type
, length
, hint
, rewrite_gen_printer_t
{gen
});
966 case extent_types_t::ROOT
:
967 ceph_assert(0 == "ROOT is never directly alloc'd");
968 return CachedExtentRef();
969 case extent_types_t::LADDR_INTERNAL
:
970 return alloc_new_extent
<lba_manager::btree::LBAInternalNode
>(t
, length
, hint
, gen
);
971 case extent_types_t::LADDR_LEAF
:
972 return alloc_new_extent
<lba_manager::btree::LBALeafNode
>(
973 t
, length
, hint
, gen
);
974 case extent_types_t::ONODE_BLOCK_STAGED
:
975 return alloc_new_extent
<onode::SeastoreNodeExtent
>(t
, length
, hint
, gen
);
976 case extent_types_t::OMAP_INNER
:
977 return alloc_new_extent
<omap_manager::OMapInnerNode
>(t
, length
, hint
, gen
);
978 case extent_types_t::OMAP_LEAF
:
979 return alloc_new_extent
<omap_manager::OMapLeafNode
>(t
, length
, hint
, gen
);
980 case extent_types_t::COLL_BLOCK
:
981 return alloc_new_extent
<collection_manager::CollectionNode
>(t
, length
, hint
, gen
);
982 case extent_types_t::OBJECT_DATA_BLOCK
:
983 return alloc_new_extent
<ObjectDataBlock
>(t
, length
, hint
, gen
);
984 case extent_types_t::RETIRED_PLACEHOLDER
:
985 ceph_assert(0 == "impossible");
986 return CachedExtentRef();
987 case extent_types_t::TEST_BLOCK
:
988 return alloc_new_extent
<TestBlock
>(t
, length
, hint
, gen
);
989 case extent_types_t::TEST_BLOCK_PHYSICAL
:
990 return alloc_new_extent
<TestBlockPhysical
>(t
, length
, hint
, gen
);
991 case extent_types_t::NONE
: {
992 ceph_assert(0 == "NONE is an invalid extent type");
993 return CachedExtentRef();
996 ceph_assert(0 == "impossible");
997 return CachedExtentRef();
1001 CachedExtentRef
Cache::duplicate_for_write(
1003 CachedExtentRef i
) {
1004 LOG_PREFIX(Cache::duplicate_for_write
);
1005 assert(i
->is_fully_loaded());
1007 if (i
->is_mutable())
1010 if (i
->is_exist_clean()) {
1012 i
->state
= CachedExtent::extent_state_t::EXIST_MUTATION_PENDING
;
1013 i
->last_committed_crc
= i
->get_crc32c();
1014 // deepcopy the buffer of exist clean extent beacuse it shares
1015 // buffer with original clean extent.
1016 auto bp
= i
->get_bptr();
1017 auto nbp
= ceph::bufferptr(bp
.c_str(), bp
.length());
1018 i
->set_bptr(std::move(nbp
));
1020 t
.add_mutated_extent(i
);
1021 DEBUGT("duplicate existing extent {}", t
, *i
);
1025 auto ret
= i
->duplicate_for_write(t
);
1026 ret
->pending_for_transaction
= t
.get_trans_id();
1027 ret
->prior_instance
= i
;
1028 // duplicate_for_write won't occur after ool write finished
1029 assert(!i
->prior_poffset
);
1030 auto [iter
, inserted
] = i
->mutation_pendings
.insert(*ret
);
1031 ceph_assert(inserted
);
1032 t
.add_mutated_extent(ret
);
1033 if (ret
->get_type() == extent_types_t::ROOT
) {
1034 t
.root
= ret
->cast
<RootBlock
>();
1036 ret
->last_committed_crc
= i
->last_committed_crc
;
1040 ret
->state
= CachedExtent::extent_state_t::MUTATION_PENDING
;
1041 DEBUGT("{} -> {}", t
, *i
, *ret
);
1045 record_t
Cache::prepare_record(
1047 const journal_seq_t
&journal_head
,
1048 const journal_seq_t
&journal_dirty_tail
)
1050 LOG_PREFIX(Cache::prepare_record
);
1051 SUBTRACET(seastore_t
, "enter", t
);
1053 auto trans_src
= t
.get_src();
1054 assert(!t
.is_weak());
1055 assert(trans_src
!= Transaction::src_t::READ
);
1057 auto& efforts
= get_by_src(stats
.committed_efforts_by_src
,
1060 // Should be valid due to interruptible future
1061 io_stat_t read_stat
;
1062 for (auto &i
: t
.read_set
) {
1063 if (!i
.ref
->is_valid()) {
1064 SUBERRORT(seastore_t
,
1065 "read_set got invalid extent, aborting -- {}", t
, *i
.ref
);
1066 ceph_abort("no invalid extent allowed in transactions' read_set");
1068 get_by_ext(efforts
.read_by_ext
,
1069 i
.ref
->get_type()).increment(i
.ref
->get_length());
1070 read_stat
.increment(i
.ref
->get_length());
1073 t
.write_set
.clear();
1075 record_t
record(trans_src
);
1076 auto commit_time
= seastar::lowres_system_clock::now();
1078 // Add new copy of mutated blocks, set_io_wait to block until written
1079 record
.deltas
.reserve(t
.mutated_block_list
.size());
1080 io_stat_t delta_stat
;
1081 for (auto &i
: t
.mutated_block_list
) {
1082 if (!i
->is_valid()) {
1083 DEBUGT("invalid mutated extent -- {}", t
, *i
);
1086 assert(i
->is_exist_mutation_pending() ||
1088 get_by_ext(efforts
.mutate_by_ext
,
1089 i
->get_type()).increment(i
->get_length());
1091 auto delta_bl
= i
->get_delta();
1092 auto delta_length
= delta_bl
.length();
1093 i
->set_modify_time(commit_time
);
1094 DEBUGT("mutated extent with {}B delta -- {}",
1095 t
, delta_length
, *i
);
1096 if (!i
->is_exist_mutation_pending()) {
1097 DEBUGT("commit replace extent ... -- {}, prior={}",
1098 t
, *i
, *i
->prior_instance
);
1099 // extent with EXIST_MUTATION_PENDING doesn't have
1100 // prior_instance field so skip these extents.
1101 // the existing extents should be added into Cache
1102 // during complete_commit to sync with gc transaction.
1103 commit_replace_extent(t
, i
, i
->prior_instance
);
1108 i
->prepare_commit();
1110 assert(i
->get_version() > 0);
1111 auto final_crc
= i
->get_crc32c();
1112 if (i
->get_type() == extent_types_t::ROOT
) {
1113 SUBTRACET(seastore_t
, "writing out root delta {}B -- {}",
1114 t
, delta_length
, *i
);
1115 assert(t
.root
== i
);
1119 extent_types_t::ROOT
,
1125 t
.root
->get_version() - 1,
1127 segment_type_t::NULL_SEG
,
1131 auto sseq
= NULL_SEG_SEQ
;
1132 auto stype
= segment_type_t::NULL_SEG
;
1134 // FIXME: This is specific to the segmented implementation
1135 if (i
->get_paddr().get_addr_type() == paddr_types_t::SEGMENT
) {
1136 auto sid
= i
->get_paddr().as_seg_paddr().get_segment_id();
1137 auto sinfo
= get_segment_info(sid
);
1140 stype
= sinfo
->type
;
1149 ? i
->cast
<LogicalCachedExtent
>()->get_laddr()
1151 i
->last_committed_crc
,
1154 i
->get_version() - 1,
1159 i
->last_committed_crc
= final_crc
;
1161 assert(delta_length
);
1162 get_by_ext(efforts
.delta_bytes_by_ext
,
1163 i
->get_type()) += delta_length
;
1164 delta_stat
.increment(delta_length
);
1167 // Transaction is now a go, set up in-memory cache state
1168 // invalidate now invalid blocks
1169 io_stat_t retire_stat
;
1170 std::vector
<alloc_delta_t
> alloc_deltas
;
1171 alloc_delta_t rel_delta
;
1172 rel_delta
.op
= alloc_delta_t::op_types_t::CLEAR
;
1173 for (auto &i
: t
.retired_set
) {
1174 get_by_ext(efforts
.retire_by_ext
,
1175 i
->get_type()).increment(i
->get_length());
1176 retire_stat
.increment(i
->get_length());
1177 DEBUGT("retired and remove extent -- {}", t
, *i
);
1178 commit_retire_extent(t
, i
);
1179 if (is_backref_mapped_extent_node(i
)
1180 || is_retired_placeholder(i
->get_type())) {
1181 rel_delta
.alloc_blk_ranges
.emplace_back(
1188 alloc_deltas
.emplace_back(std::move(rel_delta
));
1190 record
.extents
.reserve(t
.inline_block_list
.size());
1191 io_stat_t fresh_stat
;
1192 io_stat_t fresh_invalid_stat
;
1193 alloc_delta_t alloc_delta
;
1194 alloc_delta
.op
= alloc_delta_t::op_types_t::SET
;
1195 for (auto &i
: t
.inline_block_list
) {
1196 if (!i
->is_valid()) {
1197 DEBUGT("invalid fresh inline extent -- {}", t
, *i
);
1198 fresh_invalid_stat
.increment(i
->get_length());
1199 get_by_ext(efforts
.fresh_invalid_by_ext
,
1200 i
->get_type()).increment(i
->get_length());
1202 TRACET("fresh inline extent -- {}", t
, *i
);
1204 fresh_stat
.increment(i
->get_length());
1205 get_by_ext(efforts
.fresh_inline_by_ext
,
1206 i
->get_type()).increment(i
->get_length());
1207 assert(i
->is_inline() || i
->get_paddr().is_fake());
1211 i
->prepare_commit();
1212 bl
.append(i
->get_bptr());
1213 if (i
->get_type() == extent_types_t::ROOT
) {
1214 ceph_assert(0 == "ROOT never gets written as a fresh block");
1217 assert(bl
.length() == i
->get_length());
1218 auto modify_time
= i
->get_modify_time();
1219 if (modify_time
== NULL_TIME
) {
1220 modify_time
= commit_time
;
1222 record
.push_back(extent_t
{
1225 ? i
->cast
<LogicalCachedExtent
>()->get_laddr()
1226 : (is_lba_node(i
->get_type())
1227 ? i
->cast
<lba_manager::btree::LBANode
>()->get_node_meta().begin
1233 && is_backref_mapped_extent_node(i
)) {
1234 alloc_delta
.alloc_blk_ranges
.emplace_back(
1237 ? i
->cast
<LogicalCachedExtent
>()->get_laddr()
1238 : (is_lba_node(i
->get_type())
1239 ? i
->cast
<lba_manager::btree::LBANode
>()->get_node_meta().begin
1246 for (auto &i
: t
.written_ool_block_list
) {
1247 TRACET("fresh ool extent -- {}", t
, *i
);
1248 ceph_assert(i
->is_valid());
1249 assert(!i
->is_inline());
1250 get_by_ext(efforts
.fresh_ool_by_ext
,
1251 i
->get_type()).increment(i
->get_length());
1252 i
->prepare_commit();
1253 if (is_backref_mapped_extent_node(i
)) {
1254 alloc_delta
.alloc_blk_ranges
.emplace_back(
1257 ? i
->cast
<LogicalCachedExtent
>()->get_laddr()
1258 : i
->cast
<lba_manager::btree::LBANode
>()->get_node_meta().begin
,
1264 for (auto &i
: t
.existing_block_list
) {
1265 if (i
->is_valid()) {
1266 alloc_delta
.alloc_blk_ranges
.emplace_back(
1268 i
->cast
<LogicalCachedExtent
>()->get_laddr(),
1273 alloc_deltas
.emplace_back(std::move(alloc_delta
));
1275 for (auto b
: alloc_deltas
) {
1279 delta
.type
= extent_types_t::ALLOC_INFO
;
1281 record
.push_back(std::move(delta
));
1284 if (is_background_transaction(trans_src
)) {
1285 assert(journal_head
!= JOURNAL_SEQ_NULL
);
1286 assert(journal_dirty_tail
!= JOURNAL_SEQ_NULL
);
1287 journal_seq_t dirty_tail
;
1288 auto maybe_dirty_tail
= get_oldest_dirty_from();
1289 if (!maybe_dirty_tail
.has_value()) {
1290 dirty_tail
= journal_head
;
1291 SUBINFOT(seastore_t
, "dirty_tail all trimmed, set to head {}, src={}",
1292 t
, dirty_tail
, trans_src
);
1293 } else if (*maybe_dirty_tail
== JOURNAL_SEQ_NULL
) {
1294 dirty_tail
= journal_dirty_tail
;
1295 SUBINFOT(seastore_t
, "dirty_tail is pending, set to {}, src={}",
1296 t
, dirty_tail
, trans_src
);
1298 dirty_tail
= *maybe_dirty_tail
;
1300 ceph_assert(dirty_tail
!= JOURNAL_SEQ_NULL
);
1301 journal_seq_t alloc_tail
;
1302 auto maybe_alloc_tail
= get_oldest_backref_dirty_from();
1303 if (!maybe_alloc_tail
.has_value()) {
1304 // FIXME: the replay point of the allocations requires to be accurate.
1305 // Setting the alloc_tail to get_journal_head() cannot skip replaying the
1306 // last unnecessary record.
1307 alloc_tail
= journal_head
;
1308 SUBINFOT(seastore_t
, "alloc_tail all trimmed, set to head {}, src={}",
1309 t
, alloc_tail
, trans_src
);
1310 } else if (*maybe_alloc_tail
== JOURNAL_SEQ_NULL
) {
1311 ceph_abort("impossible");
1313 alloc_tail
= *maybe_alloc_tail
;
1315 ceph_assert(alloc_tail
!= JOURNAL_SEQ_NULL
);
1316 auto tails
= journal_tail_delta_t
{alloc_tail
, dirty_tail
};
1317 SUBDEBUGT(seastore_t
, "update tails as delta {}", t
, tails
);
1321 delta
.type
= extent_types_t::JOURNAL_TAIL
;
1323 record
.push_back(std::move(delta
));
1326 ceph_assert(t
.get_fresh_block_stats().num
==
1327 t
.inline_block_list
.size() +
1328 t
.written_ool_block_list
.size() +
1329 t
.num_delayed_invalid_extents
+
1330 t
.num_allocated_invalid_extents
);
1332 auto& ool_stats
= t
.get_ool_write_stats();
1333 ceph_assert(ool_stats
.extents
.num
== t
.written_ool_block_list
.size());
1335 if (record
.is_empty()) {
1336 SUBINFOT(seastore_t
,
1337 "record to submit is empty, src={}", t
, trans_src
);
1338 assert(t
.onode_tree_stats
.is_clear());
1339 assert(t
.omap_tree_stats
.is_clear());
1340 assert(t
.lba_tree_stats
.is_clear());
1341 assert(t
.backref_tree_stats
.is_clear());
1342 assert(ool_stats
.is_clear());
1345 if (record
.modify_time
== NULL_TIME
) {
1346 record
.modify_time
= commit_time
;
1349 SUBDEBUGT(seastore_t
,
1350 "commit H{} dirty_from={}, alloc_from={}, "
1351 "{} read, {} fresh with {} invalid, "
1352 "{} delta, {} retire, {}(md={}B, data={}B) ool-records, "
1353 "{}B md, {}B data, modify_time={}",
1354 t
, (void*)&t
.get_handle(),
1355 get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL
),
1356 get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL
),
1362 ool_stats
.num_records
,
1364 ool_stats
.get_data_bytes(),
1365 record
.size
.get_raw_mdlength(),
1366 record
.size
.dlength
,
1367 sea_time_point_printer_t
{record
.modify_time
});
1368 if (is_background_transaction(trans_src
)) {
1369 // background transaction won't contain any onode tree operations
1370 assert(t
.onode_tree_stats
.is_clear());
1371 assert(t
.omap_tree_stats
.is_clear());
1373 if (t
.onode_tree_stats
.depth
) {
1374 stats
.onode_tree_depth
= t
.onode_tree_stats
.depth
;
1376 if (t
.omap_tree_stats
.depth
) {
1377 stats
.omap_tree_depth
= t
.omap_tree_stats
.depth
;
1379 stats
.onode_tree_extents_num
+= t
.onode_tree_stats
.extents_num_delta
;
1380 ceph_assert(stats
.onode_tree_extents_num
>= 0);
1381 get_by_src(stats
.committed_onode_tree_efforts
, trans_src
1382 ).increment(t
.onode_tree_stats
);
1383 stats
.omap_tree_extents_num
+= t
.omap_tree_stats
.extents_num_delta
;
1384 ceph_assert(stats
.omap_tree_extents_num
>= 0);
1385 get_by_src(stats
.committed_omap_tree_efforts
, trans_src
1386 ).increment(t
.omap_tree_stats
);
1389 if (t
.lba_tree_stats
.depth
) {
1390 stats
.lba_tree_depth
= t
.lba_tree_stats
.depth
;
1392 stats
.lba_tree_extents_num
+= t
.lba_tree_stats
.extents_num_delta
;
1393 ceph_assert(stats
.lba_tree_extents_num
>= 0);
1394 get_by_src(stats
.committed_lba_tree_efforts
, trans_src
1395 ).increment(t
.lba_tree_stats
);
1396 if (t
.backref_tree_stats
.depth
) {
1397 stats
.backref_tree_depth
= t
.backref_tree_stats
.depth
;
1399 stats
.backref_tree_extents_num
+= t
.backref_tree_stats
.extents_num_delta
;
1400 ceph_assert(stats
.backref_tree_extents_num
>= 0);
1401 get_by_src(stats
.committed_backref_tree_efforts
, trans_src
1402 ).increment(t
.backref_tree_stats
);
1404 ++(efforts
.num_trans
);
1405 efforts
.num_ool_records
+= ool_stats
.num_records
;
1406 efforts
.ool_record_metadata_bytes
+= ool_stats
.md_bytes
;
1407 efforts
.ool_record_data_bytes
+= ool_stats
.get_data_bytes();
1408 efforts
.inline_record_metadata_bytes
+=
1409 (record
.size
.get_raw_mdlength() - record
.get_delta_size());
1411 auto &rewrite_version_stats
= t
.get_rewrite_version_stats();
1412 if (trans_src
== Transaction::src_t::TRIM_DIRTY
) {
1413 stats
.committed_dirty_version
.increment_stat(rewrite_version_stats
);
1414 } else if (trans_src
== Transaction::src_t::CLEANER_MAIN
||
1415 trans_src
== Transaction::src_t::CLEANER_COLD
) {
1416 stats
.committed_reclaim_version
.increment_stat(rewrite_version_stats
);
1418 assert(rewrite_version_stats
.is_clear());
// Cache::backref_batch_update: fold a batch of backref entries, produced at
// journal sequence `seq`, into the in-memory backref indexes.
// NOTE(review): this extract is a mangled dump -- statements are split across
// lines and several original lines are missing (gaps in the embedded
// numbering, e.g. 1427, 1431, 1434-1435).  Code is preserved byte-for-byte.
1424 void Cache::backref_batch_update(
1425 std::vector
<backref_entry_ref
> &&list
,
1426 const journal_seq_t
&seq
)
1428 LOG_PREFIX(Cache::backref_batch_update
);
1429 DEBUG("inserting {} entries at {}", list
.size(), seq
);
// a batch must be tagged with a real journal sequence
1430 ceph_assert(seq
!= JOURNAL_SEQ_NULL
);
// index every entry of the batch in the backref multiset
1432 for (auto &ent
: list
) {
1433 backref_entry_mset
.insert(*ent
);
// merge the batch into the per-sequence map: move it wholesale when no bucket
// exists for `seq` yet, otherwise move-insert the entries into the existing
// bucket
1436 auto iter
= backref_entryrefs_by_seq
.find(seq
);
1437 if (iter
== backref_entryrefs_by_seq
.end()) {
1438 backref_entryrefs_by_seq
.emplace(seq
, std::move(list
));
1440 iter
->second
.insert(
1442 std::make_move_iterator(list
.begin()),
1443 std::make_move_iterator(list
.end()));
// Cache::complete_commit: post-submit bookkeeping for transaction `t` whose
// record was committed at `start_seq` and laid out at `final_block_start`:
// fresh extents become CLEAN and are indexed, mutated extents become DIRTY,
// retired extents release their space, and backref updates are applied in one
// batch at the end.
// NOTE(review): mangled extract -- statements are split across lines and many
// original lines are missing (gaps in the embedded numbering).  Code is
// preserved byte-for-byte.
1447 void Cache::complete_commit(
1449 paddr_t final_block_start
,
1450 journal_seq_t start_seq
)
1452 LOG_PREFIX(Cache::complete_commit
);
1453 SUBTRACET(seastore_t
, "final_block_start={}, start_seq={}",
1454 t
, final_block_start
, start_seq
);
// backref entries accumulated below; applied together via
// backref_batch_update near the end
1456 std::vector
<backref_entry_ref
> backref_list
;
// pass over every fresh (inline and ool) block: resolve relative paddrs,
// finalize crc/state, index the extent and account its space
1457 t
.for_each_fresh_block([&](const CachedExtentRef
&i
) {
1458 if (!i
->is_valid()) {
1462 bool is_inline
= false;
1463 if (i
->is_inline()) {
// inline extents were written record-relative; rebase onto the final
// record position
1465 i
->set_paddr(final_block_start
.add_relative(i
->get_paddr()));
1467 i
->last_committed_crc
= i
->get_crc32c();
1468 i
->pending_for_transaction
= TRANS_ID_NULL
;
1469 i
->on_initial_write();
1471 i
->state
= CachedExtent::extent_state_t::CLEAN
;
1472 DEBUGT("add extent as fresh, inline={} -- {}",
1474 const auto t_src
= t
.get_src();
1475 i
->invalidate_hints();
1476 add_extent(i
, &t_src
);
1477 epm
.commit_space_used(i
->get_paddr(), i
->get_length());
// queue a backref insertion for backref-mapped extent nodes
1478 if (is_backref_mapped_extent_node(i
)) {
1479 DEBUGT("backref_list new {} len {}",
1483 backref_list
.emplace_back(
1484 std::make_unique
<backref_entry_t
>(
1487 ? i
->cast
<LogicalCachedExtent
>()->get_laddr()
1488 : (is_lba_node(i
->get_type())
1489 ? i
->cast
<lba_manager::btree::LBANode
>()->get_node_meta().begin
1494 } else if (is_backref_node(i
->get_type())) {
1497 i
->cast
<backref::BackrefNode
>()->get_node_meta().begin
,
1500 ERRORT("{}", t
, *i
);
1501 ceph_abort("not possible");
1505 // Add new copy of mutated blocks, set_io_wait to block until written
1506 for (auto &i
: t
.mutated_block_list
) {
1507 if (!i
->is_valid()) {
1510 assert(i
->is_exist_mutation_pending() ||
1512 i
->on_delta_write(final_block_start
);
1513 i
->pending_for_transaction
= TRANS_ID_NULL
;
1514 i
->prior_instance
= CachedExtentRef();
1515 i
->state
= CachedExtent::extent_state_t::DIRTY
;
1516 assert(i
->version
> 0);
// a first mutation (or the ROOT) establishes the dirty_from sequence
1517 if (i
->version
== 1 || i
->get_type() == extent_types_t::ROOT
) {
1518 i
->dirty_from_or_retired_at
= start_seq
;
1519 DEBUGT("commit extent done, become dirty -- {}", t
, *i
);
1521 DEBUGT("commit extent done -- {}", t
, *i
);
// release the space held by retired extents
1525 for (auto &i
: t
.retired_set
) {
1526 epm
.mark_space_free(i
->get_paddr(), i
->get_length());
// account space for existing blocks that survived the transaction
1528 for (auto &i
: t
.existing_block_list
) {
1529 if (i
->is_valid()) {
1530 epm
.mark_space_used(i
->get_paddr(), i
->get_length());
1534 for (auto &i
: t
.mutated_block_list
) {
1535 if (!i
->is_valid()) {
// remember the journal sequence of this commit
1541 last_commit
= start_seq
;
// retirement pass: stamp the retirement sequence and queue backref removals
1542 for (auto &i
: t
.retired_set
) {
1543 i
->dirty_from_or_retired_at
= start_seq
;
1544 if (is_backref_mapped_extent_node(i
)
1545 || is_retired_placeholder(i
->get_type())) {
1546 DEBUGT("backref_list free {} len {}",
1550 backref_list
.emplace_back(
1551 std::make_unique
<backref_entry_t
>(
1557 } else if (is_backref_node(i
->get_type())) {
1558 remove_backref_extent(i
->get_paddr());
1560 ERRORT("{}", t
, *i
);
1561 ceph_abort("not possible");
1565 auto existing_stats
= t
.get_existing_block_stats();
1566 DEBUGT("total existing blocks num: {}, exist clean num: {}, "
1567 "exist mutation pending num: {}",
1569 existing_stats
.valid_num
,
1570 existing_stats
.clean_num
,
1571 existing_stats
.mutated_num
);
// promote surviving existing blocks (CLEAN for exist_clean, DIRTY otherwise)
// and index them with backref insertions
1572 for (auto &i
: t
.existing_block_list
) {
1573 if (i
->is_valid()) {
1574 if (i
->is_exist_clean()) {
1575 i
->state
= CachedExtent::extent_state_t::CLEAN
;
1577 assert(i
->state
== CachedExtent::extent_state_t::DIRTY
);
1579 DEBUGT("backref_list new existing {} len {}",
1583 backref_list
.emplace_back(
1584 std::make_unique
<backref_entry_t
>(
1586 i
->cast
<LogicalCachedExtent
>()->get_laddr(),
1590 const auto t_src
= t
.get_src();
1591 add_extent(i
, &t_src
);
// apply all queued backref updates at the commit sequence
1594 if (!backref_list
.empty()) {
1595 backref_batch_update(std::move(backref_list
), start_seq
);
// free pre-allocated extents that did not end up valid in this transaction
1598 for (auto &i
: t
.pre_alloc_list
) {
1599 if (!i
->is_valid()) {
1600 epm
.mark_space_free(i
->get_paddr(), i
->get_length());
// Cache::init body (NOTE(review): the function signature line is missing from
// this mangled extract, as are other interior lines): reset the in-memory
// root -- drop any previous root extent, allocate a fresh CLEAN RootBlock,
// and index it in `extents`.  Code is preserved byte-for-byte.
1607 LOG_PREFIX(Cache::init
);
1609 // initial creation will do mkfs followed by mount each of which calls init
1610 DEBUG("remove extent -- prv_root={}", *root
);
1611 remove_extent(root
);
// allocate and initialize a brand-new root block
1614 root
= new RootBlock();
1615 root
->init(CachedExtent::extent_state_t::CLEAN
,
1617 PLACEMENT_HINT_NULL
,
1620 INFO("init root -- {}", *root
);
1621 extents
.insert(*root
);
// Cache::mkfs: create the initial on-disk root within transaction `t` by
// fetching the root and duplicating it for write; any unexpected error is
// fatal (assert_all).
// NOTE(review): mangled extract -- some closing lines are missing; code is
// preserved byte-for-byte.
1624 Cache::mkfs_iertr::future
<> Cache::mkfs(Transaction
&t
)
1626 LOG_PREFIX(Cache::mkfs
);
1627 INFOT("create root", t
);
1628 return get_root(t
).si_then([this, &t
](auto croot
) {
// mark the root mutable in this transaction so it is written out
1629 duplicate_for_write(t
, croot
);
1630 return mkfs_iertr::now();
1631 }).handle_error_interruptible(
1632 mkfs_iertr::pass_further
{},
1633 crimson::ct_error::assert_all
{
1634 "Invalid error in Cache::mkfs"
// Cache::close: log cache occupancy, release every dirty extent (adjusting
// the dirty-bytes accounting and dropping the intrusive reference), clear the
// backref state, and verify the dirty accounting drains to zero.
// NOTE(review): mangled extract -- interior lines are missing (e.g. the loop
// body around the `ptr` variable); code is preserved byte-for-byte.
1639 Cache::close_ertr::future
<> Cache::close()
1641 LOG_PREFIX(Cache::close
);
1642 INFO("close with {}({}B) dirty, dirty_from={}, alloc_from={}, "
1643 "{}({}B) lru, totally {}({}B) indexed extents",
1646 get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL
),
1647 get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL
),
1648 lru
.get_current_contents_extents(),
1649 lru
.get_current_contents_bytes(),
1651 extents
.get_bytes());
// drain the dirty list, releasing each extent's reference
1653 for (auto i
= dirty
.begin(); i
!= dirty
.end(); ) {
1655 stats
.dirty_bytes
-= ptr
->get_length();
1657 intrusive_ptr_release(ptr
);
1659 backref_extents
.clear();
1660 backref_entryrefs_by_seq
.clear();
// all dirty bytes must be accounted for after the drain
1661 assert(stats
.dirty_bytes
== 0);
1663 return close_ertr::now();
// Cache::replay_delta: replay one journal delta during mount.  The returned
// future<bool> reports whether the delta was applied; obsolete deltas
// (released segments, trimmed dirty/alloc tails, JOURNAL_TAIL deltas, extents
// no longer present) are skipped with `false`.
// NOTE(review): mangled extract -- statements are split across lines and many
// original lines are missing (gaps in the embedded numbering).  Code is
// preserved byte-for-byte.
1666 Cache::replay_delta_ret
1667 Cache::replay_delta(
1668 journal_seq_t journal_seq
,
1669 paddr_t record_base
,
1670 const delta_info_t
&delta
,
1671 const journal_seq_t
&dirty_tail
,
1672 const journal_seq_t
&alloc_tail
,
1673 sea_time_point modify_time
)
1675 LOG_PREFIX(Cache::replay_delta
);
1676 assert(dirty_tail
!= JOURNAL_SEQ_NULL
);
1677 assert(alloc_tail
!= JOURNAL_SEQ_NULL
);
1678 ceph_assert(modify_time
!= NULL_TIME
);
1680 // FIXME: This is specific to the segmented implementation
1681 /* The journal may validly contain deltas for extents in
1682 * since released segments. We can detect those cases by
1683 * checking whether the segment in question currently has a
1684 * sequence number > the current journal segment seq. We can
1685 * safetly skip these deltas because the extent must already
1686 * have been rewritten.
1688 if (delta
.paddr
!= P_ADDR_NULL
&&
1689 delta
.paddr
.get_addr_type() == paddr_types_t::SEGMENT
) {
1690 auto& seg_addr
= delta
.paddr
.as_seg_paddr();
1691 auto seg_info
= get_segment_info(seg_addr
.get_segment_id());
1693 auto delta_paddr_segment_seq
= seg_info
->seq
;
1694 auto delta_paddr_segment_type
= seg_info
->type
;
// segment was reused since the delta was written -> delta is obsolete
1695 if (delta_paddr_segment_seq
!= delta
.ext_seq
||
1696 delta_paddr_segment_type
!= delta
.seg_type
) {
1697 DEBUG("delta is obsolete, delta_paddr_segment_seq={},"
1698 " delta_paddr_segment_type={} -- {}",
1699 segment_seq_printer_t
{delta_paddr_segment_seq
},
1700 delta_paddr_segment_type
,
1702 return replay_delta_ertr::make_ready_future
<bool>(false);
1707 if (delta
.type
== extent_types_t::JOURNAL_TAIL
) {
1708 // this delta should have been dealt with during segment cleaner mounting
1709 return replay_delta_ertr::make_ready_future
<bool>(false);
// ALLOC_INFO deltas rebuild the backref entries recorded by the allocation
1713 if (delta
.type
== extent_types_t::ALLOC_INFO
) {
1714 if (journal_seq
< alloc_tail
) {
1715 DEBUG("journal_seq {} < alloc_tail {}, don't replay {}",
1716 journal_seq
, alloc_tail
, delta
);
1717 return replay_delta_ertr::make_ready_future
<bool>(false);
1720 alloc_delta_t alloc_delta
;
1721 decode(alloc_delta
, delta
.bl
);
1722 std::vector
<backref_entry_ref
> backref_list
;
1723 for (auto &alloc_blk
: alloc_delta
.alloc_blk_ranges
) {
// record-relative paddrs are rebased onto the record's final position
1724 if (alloc_blk
.paddr
.is_relative()) {
1725 assert(alloc_blk
.paddr
.is_record_relative());
1726 alloc_blk
.paddr
= record_base
.add_relative(alloc_blk
.paddr
);
1728 DEBUG("replay alloc_blk {}~{} {}, journal_seq: {}",
1729 alloc_blk
.paddr
, alloc_blk
.len
, alloc_blk
.laddr
, journal_seq
);
1730 backref_list
.emplace_back(
1731 std::make_unique
<backref_entry_t
>(
1738 if (!backref_list
.empty()) {
1739 backref_batch_update(std::move(backref_list
), journal_seq
);
1741 return replay_delta_ertr::make_ready_future
<bool>(true);
// extent deltas older than the dirty tail were already trimmed
1745 if (journal_seq
< dirty_tail
) {
1746 DEBUG("journal_seq {} < dirty_tail {}, don't replay {}",
1747 journal_seq
, dirty_tail
, delta
);
1748 return replay_delta_ertr::make_ready_future
<bool>(false);
// ROOT deltas are applied directly to the singleton root block
1751 if (delta
.type
== extent_types_t::ROOT
) {
1752 TRACE("replay root delta at {} {}, remove extent ... -- {}, prv_root={}",
1753 journal_seq
, record_base
, delta
, *root
);
1754 remove_extent(root
);
1755 root
->apply_delta_and_adjust_crc(record_base
, delta
.bl
);
1756 root
->dirty_from_or_retired_at
= journal_seq
;
1757 root
->state
= CachedExtent::extent_state_t::DIRTY
;
1758 DEBUG("replayed root delta at {} {}, add extent -- {}, root={}",
1759 journal_seq
, record_base
, delta
, *root
);
1760 root
->set_modify_time(modify_time
);
1762 return replay_delta_ertr::make_ready_future
<bool>(true);
// helper: look the extent up in the cache only (no read); returns a null ref
// when the extent is not cached
1764 auto _get_extent_if_cached
= [this](paddr_t addr
)
1765 -> get_extent_ertr::future
<CachedExtentRef
> {
1766 // replay is not included by the cache hit metrics
1767 auto ret
= query_cache(addr
, nullptr);
1769 // no retired-placeholder should be exist yet because no transaction
1770 // has been created.
1771 assert(ret
->get_type() != extent_types_t::RETIRED_PLACEHOLDER
);
1772 return ret
->wait_io().then([ret
] {
1776 return seastar::make_ready_future
<CachedExtentRef
>();
// pversion 0 means the delta targets the initial write: the extent must be
// read by type; otherwise only a cached copy can legitimately exist
1779 auto extent_fut
= (delta
.pversion
== 0 ?
1780 // replay is not included by the cache hit metrics
1781 _get_extent_by_type(
1787 [](CachedExtent
&) {},
1788 [](CachedExtent
&) {}) :
1789 _get_extent_if_cached(
1792 replay_delta_ertr::pass_further
{},
1793 crimson::ct_error::assert_all
{
1794 "Invalid error in Cache::replay_delta"
1797 return extent_fut
.safe_then([=, this, &delta
](auto extent
) {
1799 DEBUG("replay extent is not present, so delta is obsolete at {} {} -- {}",
1800 journal_seq
, record_base
, delta
);
1801 assert(delta
.pversion
> 0);
1802 return replay_delta_ertr::make_ready_future
<bool>(true);
1805 DEBUG("replay extent delta at {} {} ... -- {}, prv_extent={}",
1806 journal_seq
, record_base
, delta
, *extent
);
// crc and version must line up with what the delta was recorded against
1808 assert(extent
->last_committed_crc
== delta
.prev_crc
);
1809 assert(extent
->version
== delta
.pversion
);
1810 extent
->apply_delta_and_adjust_crc(record_base
, delta
.bl
);
1811 extent
->set_modify_time(modify_time
);
1812 assert(extent
->last_committed_crc
== delta
.final_crc
);
// first delta makes the extent dirty from this sequence onwards
1815 if (extent
->version
== 1) {
1816 extent
->dirty_from_or_retired_at
= journal_seq
;
1817 DEBUG("replayed extent delta at {} {}, become dirty -- {}, extent={}" ,
1818 journal_seq
, record_base
, delta
, *extent
);
1820 DEBUG("replayed extent delta at {} {} -- {}, extent={}" ,
1821 journal_seq
, record_base
, delta
, *extent
);
1824 return replay_delta_ertr::make_ready_future
<bool>(true);
// Cache::get_next_dirty_extents: collect the oldest dirty extents whose
// dirty_from is below `seq`, stopping once `max_bytes` worth has been
// gathered, then wait for each candidate's IO and resolve it against the
// transaction (absent -> add to read set, present -> use t's copy,
// retired -> skip).
// NOTE(review): mangled extract -- statements are split across lines and many
// original lines are missing; code is preserved byte-for-byte.
1829 Cache::get_next_dirty_extents_ret
Cache::get_next_dirty_extents(
1834 LOG_PREFIX(Cache::get_next_dirty_extents
);
1835 if (dirty
.empty()) {
1836 DEBUGT("max_bytes={}B, seq={}, dirty is empty",
1839 DEBUGT("max_bytes={}B, seq={}, dirty_from={}",
1840 t
, max_bytes
, seq
, dirty
.begin()->get_dirty_from());
// candidate pass: walk the dirty list in order until the byte budget is hit
1842 std::vector
<CachedExtentRef
> cand
;
1843 size_t bytes_so_far
= 0;
1844 for (auto i
= dirty
.begin();
1845 i
!= dirty
.end() && bytes_so_far
< max_bytes
;
1847 auto dirty_from
= i
->get_dirty_from();
1848 //dirty extents must be fully loaded
1849 assert(i
->is_fully_loaded());
1850 if (unlikely(dirty_from
== JOURNAL_SEQ_NULL
)) {
1851 ERRORT("got dirty extent with JOURNAL_SEQ_NULL -- {}", t
, *i
);
1854 if (dirty_from
< seq
) {
1855 TRACET("next extent -- {}", t
, *i
);
// the dirty list is expected to be ordered by dirty_from; log otherwise
1856 if (!cand
.empty() && cand
.back()->get_dirty_from() > dirty_from
) {
1857 ERRORT("dirty extents are not ordered by dirty_from -- last={}, next={}",
1858 t
, *cand
.back(), *i
);
1861 bytes_so_far
+= i
->get_length();
1862 cand
.push_back(&*i
);
// resolution pass: wait on each candidate's IO and reconcile it with `t`
1867 return seastar::do_with(
1870 [FNAME
, this, &t
](auto &cand
, auto &ret
) {
1871 return trans_intr::do_for_each(
1873 [FNAME
, this, &t
, &ret
](auto &ext
) {
1874 TRACET("waiting on extent -- {}", t
, *ext
);
1875 return trans_intr::make_interruptible(
1877 ).then_interruptible([FNAME
, this, ext
, &t
, &ret
] {
// the extent may have been invalidated while we waited on its IO
1878 if (!ext
->is_valid()) {
1879 ++(get_by_src(stats
.trans_conflicts_by_unknown
, t
.get_src()));
1880 mark_transaction_conflicted(t
, *ext
);
1884 CachedExtentRef on_transaction
;
1885 auto result
= t
.get_extent(ext
->get_paddr(), &on_transaction
);
1886 if (result
== Transaction::get_extent_ret::ABSENT
) {
1887 DEBUGT("extent is absent on t -- {}", t
, *ext
);
1888 t
.add_to_read_set(ext
);
1889 if (ext
->get_type() == extent_types_t::ROOT
) {
1891 assert(&*t
.root
== &*ext
);
1892 ceph_assert(0 == "t.root would have to already be in the read set");
1894 assert(&*ext
== &*root
);
1899 } else if (result
== Transaction::get_extent_ret::PRESENT
) {
1900 DEBUGT("extent is present on t -- {}, on t {}", t
, *ext
, *on_transaction
);
1901 ret
.push_back(on_transaction
);
1903 assert(result
== Transaction::get_extent_ret::RETIRED
);
1904 DEBUGT("extent is retired on t -- {}", t
, *ext
);
1907 }).then_interruptible([&ret
] {
1908 return std::move(ret
);
// Cache::get_root: return the root block for transaction `t`.  Reuse t.root
// when the transaction already holds it; otherwise add the cached root to
// t's read set.  In both cases outstanding IO on the root is awaited before
// the RootBlockRef is delivered.
// NOTE(review): mangled extract -- interior/closing lines are missing; code
// is preserved byte-for-byte.
1913 Cache::get_root_ret
Cache::get_root(Transaction
&t
)
1915 LOG_PREFIX(Cache::get_root
);
1917 TRACET("root already on t -- {}", t
, *t
.root
);
1918 return t
.root
->wait_io().then([&t
] {
1919 return get_root_iertr::make_ready_future
<RootBlockRef
>(
1923 DEBUGT("root not on t -- {}", t
, *root
);
// first access on this transaction: track the root in the read set
1925 t
.add_to_read_set(root
);
1926 return root
->wait_io().then([root
=root
] {
1927 return get_root_iertr::make_ready_future
<RootBlockRef
>(
1933 Cache::get_extent_ertr::future
<CachedExtentRef
> Cache::_get_extent_by_type(
1934 extent_types_t type
,
1937 extent_len_t length
,
1938 const Transaction::src_t
* p_src
,
1939 extent_init_func_t
&&extent_init_func
,
1940 extent_init_func_t
&&on_cache
)
1942 return [=, this, extent_init_func
=std::move(extent_init_func
)]() mutable {
1943 src_ext_t
* p_metric_key
= nullptr;
1944 src_ext_t metric_key
;
1946 metric_key
= std::make_pair(*p_src
, type
);
1947 p_metric_key
= &metric_key
;
1951 case extent_types_t::ROOT
:
1952 ceph_assert(0 == "ROOT is never directly read");
1953 return get_extent_ertr::make_ready_future
<CachedExtentRef
>();
1954 case extent_types_t::BACKREF_INTERNAL
:
1955 return get_extent
<backref::BackrefInternalNode
>(
1956 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
1957 ).safe_then([](auto extent
) {
1958 return CachedExtentRef(extent
.detach(), false /* add_ref */);
1960 case extent_types_t::BACKREF_LEAF
:
1961 return get_extent
<backref::BackrefLeafNode
>(
1962 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
1963 ).safe_then([](auto extent
) {
1964 return CachedExtentRef(extent
.detach(), false /* add_ref */);
1966 case extent_types_t::LADDR_INTERNAL
:
1967 return get_extent
<lba_manager::btree::LBAInternalNode
>(
1968 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
1969 ).safe_then([](auto extent
) {
1970 return CachedExtentRef(extent
.detach(), false /* add_ref */);
1972 case extent_types_t::LADDR_LEAF
:
1973 return get_extent
<lba_manager::btree::LBALeafNode
>(
1974 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
1975 ).safe_then([](auto extent
) {
1976 return CachedExtentRef(extent
.detach(), false /* add_ref */);
1978 case extent_types_t::OMAP_INNER
:
1979 return get_extent
<omap_manager::OMapInnerNode
>(
1980 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
1981 ).safe_then([](auto extent
) {
1982 return CachedExtentRef(extent
.detach(), false /* add_ref */);
1984 case extent_types_t::OMAP_LEAF
:
1985 return get_extent
<omap_manager::OMapLeafNode
>(
1986 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
1987 ).safe_then([](auto extent
) {
1988 return CachedExtentRef(extent
.detach(), false /* add_ref */);
1990 case extent_types_t::COLL_BLOCK
:
1991 return get_extent
<collection_manager::CollectionNode
>(
1992 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
1993 ).safe_then([](auto extent
) {
1994 return CachedExtentRef(extent
.detach(), false /* add_ref */);
1996 case extent_types_t::ONODE_BLOCK_STAGED
:
1997 return get_extent
<onode::SeastoreNodeExtent
>(
1998 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
1999 ).safe_then([](auto extent
) {
2000 return CachedExtentRef(extent
.detach(), false /* add_ref */);
2002 case extent_types_t::OBJECT_DATA_BLOCK
:
2003 return get_extent
<ObjectDataBlock
>(
2004 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
2005 ).safe_then([](auto extent
) {
2006 return CachedExtentRef(extent
.detach(), false /* add_ref */);
2008 case extent_types_t::RETIRED_PLACEHOLDER
:
2009 ceph_assert(0 == "impossible");
2010 return get_extent_ertr::make_ready_future
<CachedExtentRef
>();
2011 case extent_types_t::TEST_BLOCK
:
2012 return get_extent
<TestBlock
>(
2013 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
2014 ).safe_then([](auto extent
) {
2015 return CachedExtentRef(extent
.detach(), false /* add_ref */);
2017 case extent_types_t::TEST_BLOCK_PHYSICAL
:
2018 return get_extent
<TestBlockPhysical
>(
2019 offset
, length
, p_metric_key
, std::move(extent_init_func
), std::move(on_cache
)
2020 ).safe_then([](auto extent
) {
2021 return CachedExtentRef(extent
.detach(), false /* add_ref */);
2023 case extent_types_t::NONE
: {
2024 ceph_assert(0 == "NONE is an invalid extent type");
2025 return get_extent_ertr::make_ready_future
<CachedExtentRef
>();
2028 ceph_assert(0 == "impossible");
2029 return get_extent_ertr::make_ready_future
<CachedExtentRef
>();
2031 }().safe_then([laddr
](CachedExtentRef e
) {
2032 assert(e
->is_logical() == (laddr
!= L_ADDR_NULL
));
2033 if (e
->is_logical()) {
2034 e
->cast
<LogicalCachedExtent
>()->set_laddr(laddr
);
2036 return get_extent_ertr::make_ready_future
<CachedExtentRef
>(e
);