1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
8 #include "seastar/core/shared_future.hh"
10 #include "include/buffer.h"
12 #include "crimson/os/seastore/logging.h"
13 #include "crimson/os/seastore/seastore_types.h"
14 #include "crimson/os/seastore/transaction.h"
15 #include "crimson/os/seastore/segment_manager.h"
16 #include "crimson/common/errorator.h"
17 #include "crimson/os/seastore/cached_extent.h"
18 #include "crimson/os/seastore/root_block.h"
19 #include "crimson/os/seastore/segment_cleaner.h"
20 #include "crimson/os/seastore/random_block_manager.h"
22 namespace crimson::os::seastore
{
27 * This component is responsible for buffer management, including
28 * transaction lifecycle.
30 * Seastore transactions are expressed as an atomic combination of
31 * 1) newly written blocks
32 * 2) logical mutations to existing physical blocks
36 * As such, any transaction has 3 components:
37 * 1) read_set: references to extents read during the transaction
38 * See Transaction::read_set
39 * 2) write_set: references to extents to be written as:
40 * a) new physical blocks, see Transaction::fresh_block_list
41 * b) mutations to existing physical blocks,
42 * see Transaction::mutated_block_list
43 * 3) retired_set: extent refs to be retired either due to 2b or
44 * due to releasing the extent generally.
46 * In the case of 2b, the CachedExtent will have been copied into
47 * a fresh CachedExtentRef such that the source extent ref is present
48 * in the read set and the newly allocated extent is present in the
51 * A transaction has 3 phases:
52 * 1) construction: user calls Cache::get_transaction() and populates
53 * the returned transaction by calling Cache methods
54 * 2) submission: user calls Cache::try_start_transaction(). If
55 successful, the user may construct a record and submit the
56 * transaction to the journal.
57 * 3) completion: once the transaction is durable, the user must call
58 * Cache::complete_commit() with the block offset to complete
61 * Internally, in phase 1, the fields in Transaction are filled in.
62 * - reads may block if the referenced extent is being written
63 * - once a read obtains a particular CachedExtentRef for a paddr_t,
64 * it'll always get the same one until overwritten
65 * - once a paddr_t is overwritten or written, subsequent reads of
66 * that addr will get the new ref
68 * In phase 2, if all extents in the read set are valid (not expired),
69 * we can commit (otherwise, we fail and the user must retry).
70 * - Expire all extents in the retired_set (they must all be valid)
71 * - Remove all extents in the retired_set from Cache::extents
72 * - Mark all extents in the write_set wait_io(), add promises to
74 * - Merge Transaction::write_set into Cache::extents
76 * After phase 2, the user will submit the record to the journal.
77 * Once complete, we perform phase 3:
78 * - For each CachedExtent in block_list, call
79 * CachedExtent::complete_initial_write(paddr_t) with the block's
80 * final offset (inferred from the extent's position in the block_list
81 * and extent lengths).
82 * - For each block in mutation_list, call
83 * CachedExtent::delta_written(paddr_t) with the address of the start
85 * - Complete all promises with the final record start paddr_t
89 using base_ertr
= crimson::errorator
<
90 crimson::ct_error::input_output_error
>;
91 using base_iertr
= trans_iertr
<base_ertr
>;
93 Cache(ExtentReader
&reader
);
96 /// Creates empty transaction by source
// Builds a fresh Transaction tagged with `src`, bumps the per-source
// creation counter, and installs an on-destruct hook so the Cache can
// introspect the transaction when it dies.
// NOTE(review): several original lines are missing from this fragment
// (parameter-list tail, return statement); confirm against the full
// file before editing this body.
97 TransactionRef
create_transaction(
98 Transaction::src_t src
,
101 LOG_PREFIX(Cache::create_transaction
);
// Account the new transaction under its source.
103 ++(get_by_src(stats
.trans_created_by_src
, src
));
105 auto ret
= std::make_unique
<Transaction
>(
106 get_dummy_ordering_handle(),
// Destructor hook: called when the Transaction is destroyed.
110 [this](Transaction
& t
) {
111 return on_transaction_destruct(t
);
114 SUBDEBUGT(seastore_cache
, "created name={}, source={}, is_weak={}",
115 *ret
, name
, src
, is_weak
);
119 /// Resets transaction preserving
// Re-initializes `t` for reuse while keeping its ordering handle.
// Counted as a new transaction for the per-source creation metric.
120 void reset_transaction_preserve_handle(Transaction
&t
) {
121 LOG_PREFIX(Cache::reset_transaction_preserve_handle
);
123 ++(get_by_src(stats
.trans_created_by_src
, t
.get_src()));
// Reset against the most recent commit sequence tracked by the Cache.
125 t
.reset_preserve_handle(last_commit
);
126 SUBDEBUGT(seastore_cache
, "reset", t
);
132 * Drop extent from cache. Intended for use when
133 * ref refers to a logically dead extent as during
// NOTE(review): the body of drop_from_cache is not visible in this
// fragment; only the signature survives here.
136 void drop_from_cache(CachedExtentRef ref
) {
140 /// Declare ref retired in t
// Adds `ref` to the transaction's retired_set; per the class comment,
// retired extents are expired and removed from Cache::extents at
// commit (phase 2).
141 void retire_extent(Transaction
&t
, CachedExtentRef ref
) {
142 t
.add_to_retired_set(ref
);
145 /// Declare paddr retired in t
// Errorator aliases for the paddr-based retire path; declaration only,
// the definition lives in the corresponding .cc file.
146 using retire_extent_iertr
= base_iertr
;
147 using retire_extent_ret
= base_iertr::future
<>;
148 retire_extent_ret
retire_extent_addr(
149 Transaction
&t
, paddr_t addr
, extent_len_t length
);
154 * returns ref to current root or t.root if modified in t
// Interruptible future yielding the RootBlockRef; declaration only.
156 using get_root_iertr
= base_iertr
;
157 using get_root_ret
= get_root_iertr::future
<RootBlockRef
>;
158 get_root_ret
get_root(Transaction
&t
);
163 * returns t.root and assume it is already present/read in t
// NOTE(review): body not visible in this fragment; only the signature
// survives here.
165 RootBlockRef
get_root_fast(Transaction
&t
) {
173 * returns ref to extent at offset~length of type T either from
174 * - extent_set if already in cache
177 using src_ext_t
= std::pair
<Transaction::src_t
, extent_types_t
>;
178 using get_extent_ertr
= base_ertr
;
179 template <typename T
>
180 using get_extent_ret
= get_extent_ertr::future
<TCachedExtentRef
<T
>>;
181 template <typename T
, typename Func
>
182 get_extent_ret
<T
> get_extent(
183 paddr_t offset
, ///< [in] starting addr
184 segment_off_t length
, ///< [in] length
185 const src_ext_t
* p_metric_key
, ///< [in] cache query metric key
186 Func
&&extent_init_func
///< [in] init func for extent
188 auto cached
= query_cache(offset
, p_metric_key
);
190 auto ret
= CachedExtent::make_cached_extent_ref
<T
>(
191 alloc_cache_buf(length
));
192 ret
->set_paddr(offset
);
193 ret
->state
= CachedExtent::extent_state_t::CLEAN_PENDING
;
195 extent_init_func(*ret
);
196 return read_extent
<T
>(
200 // extent PRESENT in cache
201 if (cached
->get_type() == extent_types_t::RETIRED_PLACEHOLDER
) {
202 auto ret
= CachedExtent::make_cached_extent_ref
<T
>(
203 alloc_cache_buf(length
));
204 ret
->set_paddr(offset
);
205 ret
->state
= CachedExtent::extent_state_t::CLEAN_PENDING
;
206 extents
.replace(*ret
, *cached
);
208 // replace placeholder in transactions
209 while (!cached
->transactions
.empty()) {
210 auto t
= cached
->transactions
.begin()->t
;
211 t
->replace_placeholder(*cached
, *ret
);
214 cached
->state
= CachedExtent::extent_state_t::INVALID
;
215 extent_init_func(*ret
);
216 return read_extent
<T
>(
219 auto ret
= TCachedExtentRef
<T
>(static_cast<T
*>(cached
.get()));
221 ).then([ret
=std::move(ret
)]() mutable
222 -> get_extent_ret
<T
> {
223 // ret may be invalid, caller must check
224 return get_extent_ret
<T
>(
225 get_extent_ertr::ready_future_marker
{},
230 template <typename T
>
231 get_extent_ret
<T
> get_extent(
232 paddr_t offset
, ///< [in] starting addr
233 segment_off_t length
, ///< [in] length
234 const src_ext_t
* p_metric_key
///< [in] cache query metric key
236 return get_extent
<T
>(
237 offset
, length
, p_metric_key
,
242 * get_extent_if_cached
244 * Returns extent at offset if in cache
246 using get_extent_if_cached_iertr
= base_iertr
;
247 using get_extent_if_cached_ret
=
248 get_extent_if_cached_iertr::future
<CachedExtentRef
>;
249 get_extent_if_cached_ret
get_extent_if_cached(
252 extent_types_t type
) {
254 LOG_PREFIX(Cache::get_extent_if_cached
);
255 auto result
= t
.get_extent(offset
, &ret
);
256 if (result
!= Transaction::get_extent_ret::ABSENT
) {
257 // including get_extent_ret::RETIRED
258 SUBDEBUGT(seastore_cache
,
259 "Found extent at offset {} on transaction: {}",
261 return get_extent_if_cached_iertr::make_ready_future
<
262 CachedExtentRef
>(ret
);
265 // get_extent_ret::ABSENT from transaction
266 auto metric_key
= std::make_pair(t
.get_src(), type
);
267 ret
= query_cache(offset
, &metric_key
);
269 // retired_placeholder is not really cached yet
270 ret
->get_type() == extent_types_t::RETIRED_PLACEHOLDER
) {
271 SUBDEBUGT(seastore_cache
,
272 "No extent at offset {}, retired_placeholder: {}",
274 return get_extent_if_cached_iertr::make_ready_future
<
278 // present in cache and is not a retired_placeholder
279 SUBDEBUGT(seastore_cache
,
280 "Found extent at offset {} in cache: {}",
282 t
.add_to_read_set(ret
);
284 return ret
->wait_io().then([ret
] {
285 return get_extent_if_cached_iertr::make_ready_future
<
286 CachedExtentRef
>(ret
);
293 * returns ref to extent at offset~length of type T either from
294 * - t if modified by t
295 * - extent_set if already in cache
298 * t *must not* have retired offset
300 using get_extent_iertr
= base_iertr
;
301 template <typename T
, typename Func
>
302 get_extent_iertr::future
<TCachedExtentRef
<T
>> get_extent(
305 segment_off_t length
,
306 Func
&&extent_init_func
) {
308 LOG_PREFIX(Cache::get_extent
);
309 auto result
= t
.get_extent(offset
, &ret
);
310 if (result
!= Transaction::get_extent_ret::ABSENT
) {
311 assert(result
!= Transaction::get_extent_ret::RETIRED
);
312 SUBDEBUGT(seastore_cache
,
313 "Found extent at offset {} on transaction: {}",
315 return seastar::make_ready_future
<TCachedExtentRef
<T
>>(
318 auto metric_key
= std::make_pair(t
.get_src(), T::TYPE
);
319 return trans_intr::make_interruptible(
321 offset
, length
, &metric_key
,
322 std::forward
<Func
>(extent_init_func
))
323 ).si_then([this, FNAME
, offset
, &t
](auto ref
) {
324 (void)this; // silence incorrect clang warning about capture
325 if (!ref
->is_valid()) {
326 SUBDEBUGT(seastore_cache
, "got invalid extent: {}", t
, ref
);
327 ++(get_by_src(stats
.trans_conflicts_by_unknown
, t
.get_src()));
328 mark_transaction_conflicted(t
, *ref
);
329 return get_extent_iertr::make_ready_future
<TCachedExtentRef
<T
>>();
331 SUBDEBUGT(seastore_cache
,
332 "Read extent at offset {} in cache: {}",
335 t
.add_to_read_set(ref
);
336 return get_extent_iertr::make_ready_future
<TCachedExtentRef
<T
>>(
342 template <typename T
>
343 get_extent_iertr::future
<TCachedExtentRef
<T
>> get_extent(
346 segment_off_t length
) {
347 return get_extent
<T
>(t
, offset
, length
, [](T
&){});
354 * Based on type, instantiate the correct concrete type
355 * and read in the extent at location offset~length.
358 // This is a workaround std::move_only_function not being available,
359 // not really worth generalizing at this time.
360 class extent_init_func_t
{
362 virtual void operator()(CachedExtent
&extent
) = 0;
363 virtual ~callable_i() = default;
365 template <typename Func
>
366 struct callable_wrapper final
: callable_i
{
368 callable_wrapper(Func
&&func
) : func(std::forward
<Func
>(func
)) {}
369 void operator()(CachedExtent
&extent
) final
{
372 ~callable_wrapper() final
= default;
375 std::unique_ptr
<callable_i
> wrapped
;
376 template <typename Func
>
377 extent_init_func_t(Func
&&func
) : wrapped(
378 std::make_unique
<callable_wrapper
<Func
>>(std::forward
<Func
>(func
)))
380 void operator()(CachedExtent
&extent
) {
381 return (*wrapped
)(extent
);
384 get_extent_ertr::future
<CachedExtentRef
> _get_extent_by_type(
388 segment_off_t length
,
389 const Transaction::src_t
* p_src
,
390 extent_init_func_t
&&extent_init_func
393 using get_extent_by_type_iertr
= get_extent_iertr
;
394 using get_extent_by_type_ret
= get_extent_by_type_iertr::future
<
396 get_extent_by_type_ret
_get_extent_by_type(
401 segment_off_t length
,
402 extent_init_func_t
&&extent_init_func
) {
404 auto status
= t
.get_extent(offset
, &ret
);
405 if (status
== Transaction::get_extent_ret::RETIRED
) {
406 return seastar::make_ready_future
<CachedExtentRef
>();
407 } else if (status
== Transaction::get_extent_ret::PRESENT
) {
408 return seastar::make_ready_future
<CachedExtentRef
>(ret
);
410 auto src
= t
.get_src();
411 return trans_intr::make_interruptible(
413 type
, offset
, laddr
, length
, &src
,
414 std::move(extent_init_func
))
415 ).si_then([=, &t
](CachedExtentRef ret
) {
416 if (!ret
->is_valid()) {
417 LOG_PREFIX(Cache::get_extent_by_type
);
418 SUBDEBUGT(seastore_cache
, "got invalid extent: {}", t
, ret
);
419 ++(get_by_src(stats
.trans_conflicts_by_unknown
, t
.get_src()));
420 mark_transaction_conflicted(t
, *ret
.get());
421 return get_extent_ertr::make_ready_future
<CachedExtentRef
>();
424 t
.add_to_read_set(ret
);
425 return get_extent_ertr::make_ready_future
<CachedExtentRef
>(
433 template <typename Func
>
434 get_extent_by_type_ret
get_extent_by_type(
435 Transaction
&t
, ///< [in] transaction
436 extent_types_t type
, ///< [in] type tag
437 paddr_t offset
, ///< [in] starting addr
438 laddr_t laddr
, ///< [in] logical address if logical
439 segment_off_t length
, ///< [in] length
440 Func
&&extent_init_func
///< [in] extent init func
442 return _get_extent_by_type(
448 extent_init_func_t(std::forward
<Func
>(extent_init_func
)));
450 get_extent_by_type_ret
get_extent_by_type(
457 return get_extent_by_type(
458 t
, type
, offset
, laddr
, length
, [](CachedExtent
&) {});
465 * Allocates a fresh extent. if delayed is true, addr will be alloc'd later
467 template <typename T
>
468 TCachedExtentRef
<T
> alloc_new_extent(
469 Transaction
&t
, ///< [in, out] current transaction
470 segment_off_t length
, ///< [in] length
471 bool delayed
= false ///< [in] whether the paddr allocation of extent is delayed
473 auto ret
= CachedExtent::make_cached_extent_ref
<T
>(
474 alloc_cache_buf(length
));
475 t
.add_fresh_extent(ret
, delayed
);
476 ret
->state
= CachedExtent::extent_state_t::INITIAL_WRITE_PENDING
;
484 void mark_delayed_extent_inline(
486 LogicalCachedExtentRef
& ref
) {
487 t
.mark_delayed_extent_inline(ref
);
490 void mark_delayed_extent_ool(
492 LogicalCachedExtentRef
& ref
,
493 paddr_t final_addr
) {
494 t
.mark_delayed_extent_ool(ref
, final_addr
);
500 * Allocates a fresh extent. addr will be relative until commit.
502 CachedExtentRef
alloc_new_extent_by_type(
503 Transaction
&t
, ///< [in, out] current transaction
504 extent_types_t type
, ///< [in] type tag
505 segment_off_t length
, ///< [in] length
506 bool delayed
= false ///< [in] whether delay addr allocation
510 * Allocates mutable buffer from extent_set on offset~len
512 * TODO: Note, currently all implementations literally copy the
513 * buffer. This needn't be true, CachedExtent implementations could
514 * choose to refer to the same buffer unmodified until commit and just
515 * buffer the mutations in an ancillary data structure.
517 * @param current transaction
518 * @param extent to duplicate
519 * @return mutable extent
521 CachedExtentRef
duplicate_for_write(
522 Transaction
&t
, ///< [in, out] current transaction
523 CachedExtentRef i
///< [in] ref to existing extent
529 * Construct the record for Journal from transaction.
531 record_t
prepare_record(
532 Transaction
&t
///< [in, out] current transaction
538 * Must be called upon completion of write. Releases blocks on mutating
539 * extents, fills in addresses, and calls relevant callbacks on fresh
540 and mutated extents.
542 void complete_commit(
543 Transaction
&t
, ///< [in, out] current transaction
544 paddr_t final_block_start
, ///< [in] offset of initial block
545 journal_seq_t seq
, ///< [in] journal commit seq
546 SegmentCleaner
*cleaner
=nullptr ///< [out] optional segment stat listener
557 * Alloc initial root node and add to t. The intention is for other
558 * components to use t to adjust the resulting root ref prior to commit.
560 using mkfs_iertr
= base_iertr
;
561 mkfs_iertr::future
<> mkfs(Transaction
&t
);
566 * TODO: should flush dirty blocks
568 using close_ertr
= crimson::errorator
<
569 crimson::ct_error::input_output_error
>;
570 close_ertr::future
<> close();
575 * Intended for use in Journal::delta. For each delta, should decode delta,
576 * read relevant block from disk or cache (using correct type), and call
577 * CachedExtent::apply_delta marking the extent dirty.
579 using replay_delta_ertr
= crimson::errorator
<
580 crimson::ct_error::input_output_error
>;
581 using replay_delta_ret
= replay_delta_ertr::future
<>;
582 replay_delta_ret
replay_delta(
584 paddr_t record_block_base
,
585 const delta_info_t
&delta
);
588 * init_cached_extents
590 * Calls passed lambda for each dirty cached block. Intended for use
591 * after replay to allow lba_manager (or w/e) to read in any ancestor
594 using init_cached_extents_iertr
= base_iertr
;
595 using init_cached_extents_ret
= init_cached_extents_iertr::future
<>;
596 template <typename F
>
597 init_cached_extents_ret
init_cached_extents(
601 // journal replay should have been finished at this point,
602 // Cache::root should have been inserted to the dirty list
603 assert(root
->is_dirty());
604 std::vector
<CachedExtentRef
> dirty
;
605 for (auto &e
: extents
) {
606 dirty
.push_back(CachedExtentRef(&e
));
608 return seastar::do_with(
611 [&t
](auto &f
, auto &refs
) mutable {
612 return trans_intr::do_for_each(
614 [&t
, &f
](auto &e
) { return f(t
, e
); });
615 }).handle_error_interruptible(
616 init_cached_extents_iertr::pass_further
{},
617 crimson::ct_error::assert_all
{
618 "Invalid error in Cache::init_cached_extents"
624 * update_extent_from_transaction
626 * Updates passed extent based on t. If extent has been retired,
627 * a null result will be returned.
629 CachedExtentRef
update_extent_from_transaction(
631 CachedExtentRef extent
) {
632 if (extent
->get_type() == extent_types_t::ROOT
) {
636 t
.add_to_read_set(extent
);
637 t
.root
= extent
->cast
<RootBlock
>();
641 auto result
= t
.get_extent(extent
->get_paddr(), &extent
);
642 if (result
== Transaction::get_extent_ret::RETIRED
) {
643 return CachedExtentRef();
645 if (result
== Transaction::get_extent_ret::ABSENT
) {
646 t
.add_to_read_set(extent
);
656 * Dump summary of contents (TODO)
659 std::ostream
&out
) const {
664 * get_next_dirty_extents
666 * Returns extents with get_dirty_from() < seq and adds to read set of
669 using get_next_dirty_extents_iertr
= base_iertr
;
670 using get_next_dirty_extents_ret
= get_next_dirty_extents_iertr::future
<
671 std::vector
<CachedExtentRef
>>;
672 get_next_dirty_extents_ret
get_next_dirty_extents(
677 /// returns std::nullopt if no dirty extents or get_dirty_from() for oldest
678 std::optional
<journal_seq_t
> get_oldest_dirty_from() const {
682 auto oldest
= dirty
.begin()->get_dirty_from();
683 if (oldest
== journal_seq_t()) {
691 /// Dump live extents
692 void dump_contents();
695 ExtentReader
&reader
; ///< ref to extent reader
696 RootBlockRef root
; ///< ref to current root
697 ExtentIndex extents
; ///< set of live extents
699 journal_seq_t last_commit
= JOURNAL_SEQ_MIN
;
704 * holds refs to dirty extents. Ordered by CachedExtent::get_dirty_from().
706 CachedExtent::list dirty
;
711 * holds references to recently used extents
715 const size_t capacity
= 0;
717 // current size (bytes)
720 CachedExtent::list lru
;
722 void trim_to_capacity() {
// Evict from the LRU front (least recently used end, per push_back in
// add_to_lru) until the tracked byte count fits within capacity.
723 while (contents
> capacity
) {
724 assert(lru
.size() > 0);
725 remove_from_lru(lru
.front());
729 void add_to_lru(CachedExtent
&extent
) {
// NOTE(review): the opening of the assert (original lines 730-731,
// presumably asserting extent.is_clean()) is missing from this
// fragment — confirm against the full file.
732 !extent
.is_pending() &&
733 !extent
.is_placeholder());
// Only link once: account bytes and take an intrusive ref for the
// list before appending at the hot (back) end.
735 if (!extent
.primary_ref_list_hook
.is_linked()) {
736 contents
+= extent
.get_length();
737 intrusive_ptr_add_ref(&extent
);
738 lru
.push_back(extent
);
744 LRU(size_t capacity
) : capacity(capacity
) {}
746 size_t get_current_contents_bytes() const {
750 size_t get_current_contents_extents() const {
754 void remove_from_lru(CachedExtent
&extent
) {
// Unlinks `extent` from the LRU list (if linked), decrements the
// tracked byte count, and drops the list's intrusive reference.
755 assert(extent
.is_clean());
756 assert(!extent
.is_pending());
757 assert(!extent
.is_placeholder());
759 if (extent
.primary_ref_list_hook
.is_linked()) {
760 lru
.erase(lru
.s_iterator_to(extent
));
761 assert(contents
>= extent
.get_length());
762 contents
-= extent
.get_length();
763 intrusive_ptr_release(&extent
);
767 void move_to_top(CachedExtent
&extent
) {
// If linked, unlink and un-account the extent; presumably it is then
// re-added at the hot end via add_to_lru (original lines 778+ are
// missing from this fragment — confirm against the full file).
// NOTE(review): the opening of the assert (original lines 768-769) is
// also missing here.
770 !extent
.is_pending() &&
771 !extent
.is_placeholder());
773 if (extent
.primary_ref_list_hook
.is_linked()) {
774 lru
.erase(lru
.s_iterator_to(extent
));
775 intrusive_ptr_release(&extent
);
776 assert(contents
>= extent
.get_length());
777 contents
-= extent
.get_length();
783 LOG_PREFIX(Cache::LRU::clear
);
784 for (auto iter
= lru
.begin(); iter
!= lru
.end();) {
785 SUBDEBUG(seastore_cache
, "clearing {}", *iter
);
786 remove_from_lru(*(iter
++));
795 struct query_counters_t
{
803 * Count the number of extents involved in the effort and the total bytes of
806 * Each effort_t represents the effort of a set of extents involved in the
807 * transaction, classified by read, mutate, retire and allocate behaviors,
808 * see XXX_trans_efforts_t.
811 uint64_t extents
= 0;
814 void increment(uint64_t extent_len
) {
820 template <typename CounterT
>
821 using counter_by_extent_t
= std::array
<CounterT
, EXTENT_TYPES_MAX
>;
823 struct invalid_trans_efforts_t
{
826 uint64_t mutate_delta_bytes
= 0;
829 effort_t fresh_ool_written
;
830 counter_by_extent_t
<uint64_t> num_trans_invalidated
;
831 uint64_t num_ool_records
= 0;
832 uint64_t ool_record_bytes
= 0;
835 struct commit_trans_efforts_t
{
836 counter_by_extent_t
<effort_t
> read_by_ext
;
837 counter_by_extent_t
<effort_t
> mutate_by_ext
;
838 counter_by_extent_t
<uint64_t> delta_bytes_by_ext
;
839 counter_by_extent_t
<effort_t
> retire_by_ext
;
840 counter_by_extent_t
<effort_t
> fresh_invalid_by_ext
; // inline but is already invalid (retired)
841 counter_by_extent_t
<effort_t
> fresh_inline_by_ext
;
842 counter_by_extent_t
<effort_t
> fresh_ool_by_ext
;
843 uint64_t num_trans
= 0; // the number of inline records
844 uint64_t num_ool_records
= 0;
845 uint64_t ool_record_padding_bytes
= 0;
846 uint64_t ool_record_metadata_bytes
= 0;
847 uint64_t ool_record_data_bytes
= 0;
848 uint64_t inline_record_metadata_bytes
= 0; // metadata exclude the delta bytes
851 struct success_read_trans_efforts_t
{
853 uint64_t num_trans
= 0;
856 struct tree_efforts_t
{
857 uint64_t num_inserts
= 0;
858 uint64_t num_erases
= 0;
860 void increment(const Transaction::tree_stats_t
& incremental
) {
861 num_inserts
+= incremental
.num_inserts
;
862 num_erases
+= incremental
.num_erases
;
866 template <typename CounterT
>
867 using counter_by_src_t
= std::array
<CounterT
, Transaction::SRC_MAX
>;
869 static constexpr std::size_t NUM_SRC_COMB
=
870 Transaction::SRC_MAX
* (Transaction::SRC_MAX
+ 1) / 2;
873 counter_by_src_t
<uint64_t> trans_created_by_src
;
874 counter_by_src_t
<commit_trans_efforts_t
> committed_efforts_by_src
;
875 counter_by_src_t
<invalid_trans_efforts_t
> invalidated_efforts_by_src
;
876 counter_by_src_t
<query_counters_t
> cache_query_by_src
;
877 success_read_trans_efforts_t success_read_efforts
;
878 uint64_t dirty_bytes
= 0;
880 uint64_t onode_tree_depth
= 0;
881 counter_by_src_t
<tree_efforts_t
> committed_onode_tree_efforts
;
882 counter_by_src_t
<tree_efforts_t
> invalidated_onode_tree_efforts
;
884 uint64_t lba_tree_depth
= 0;
885 counter_by_src_t
<tree_efforts_t
> committed_lba_tree_efforts
;
886 counter_by_src_t
<tree_efforts_t
> invalidated_lba_tree_efforts
;
888 std::array
<uint64_t, NUM_SRC_COMB
> trans_conflicts_by_srcs
;
889 counter_by_src_t
<uint64_t> trans_conflicts_by_unknown
;
// Returns the mutable counter slot for transaction source `src`;
// `src` is range-checked against the counter array size.
892 template <typename CounterT
>
893 CounterT
& get_by_src(
894 counter_by_src_t
<CounterT
>& counters_by_src
,
895 Transaction::src_t src
) {
896 assert(static_cast<std::size_t>(src
) < counters_by_src
.size());
897 return counters_by_src
[static_cast<std::size_t>(src
)];
// Returns the mutable counter slot for extent type `ext`; the index
// is range-checked against EXTENT_TYPES_MAX.
900 template <typename CounterT
>
901 CounterT
& get_by_ext(
902 counter_by_extent_t
<CounterT
>& counters_by_ext
,
903 extent_types_t ext
) {
904 auto index
= static_cast<uint8_t>(ext
);
905 assert(index
< EXTENT_TYPES_MAX
);
906 return counters_by_ext
[index
];
// Records a conflict between two transaction sources into the
// triangular stats.trans_conflicts_by_srcs array: the index formula
// below maps the ordered pair onto NUM_SRC_COMB slots.
// NOTE(review): the conditional guarding std::swap (original line 912,
// presumably `if (src1 > src2)`) is missing from this fragment —
// confirm against the full file before editing.
909 void account_conflict(Transaction::src_t src1
, Transaction::src_t src2
) {
910 assert(src1
< Transaction::src_t::MAX
);
911 assert(src2
< Transaction::src_t::MAX
);
913 std::swap(src1
, src2
);
915 // impossible combinations
916 // should be consistent with trans_srcs_invalidated in register_metrics()
917 assert(!(src1
== Transaction::src_t::READ
&&
918 src2
== Transaction::src_t::READ
));
919 assert(!(src1
== Transaction::src_t::CLEANER_TRIM
&&
920 src2
== Transaction::src_t::CLEANER_TRIM
));
921 assert(!(src1
== Transaction::src_t::CLEANER_RECLAIM
&&
922 src2
== Transaction::src_t::CLEANER_RECLAIM
));
923 assert(!(src1
== Transaction::src_t::CLEANER_TRIM
&&
924 src2
== Transaction::src_t::CLEANER_RECLAIM
));
// Triangular-number indexing over the (src1, src2) pair.
926 auto src1_value
= static_cast<std::size_t>(src1
);
927 auto src2_value
= static_cast<std::size_t>(src2
);
928 auto num_srcs
= static_cast<std::size_t>(Transaction::src_t::MAX
);
929 auto conflict_index
= num_srcs
* src1_value
+ src2_value
-
930 src1_value
* (src1_value
+ 1) / 2;
931 assert(conflict_index
< NUM_SRC_COMB
);
932 ++stats
.trans_conflicts_by_srcs
[conflict_index
];
935 seastar::metrics::metric_group metrics
;
936 void register_metrics();
938 /// alloc buffer for cached extent
// Allocates a page-aligned bufferptr of `size` bytes.
// NOTE(review): the tail of this body (original lines 943+, presumably
// zeroing and returning bp) is missing from this fragment.
939 bufferptr
alloc_cache_buf(size_t size
) {
940 // TODO: memory pooling etc
941 auto bp
= ceph::bufferptr(
942 buffer::create_page_aligned(size
));
947 /// Update lru for access to ref
948 void touch_extent(CachedExtent
&ext
) {
949 assert(!ext
.is_pending());
950 if (ext
.is_clean() && !ext
.is_placeholder()) {
951 lru
.move_to_top(ext
);
955 /// Add extent to extents handling dirty and refcounting
956 void add_extent(CachedExtentRef ref
);
958 /// Mark existing extent ref dirty -- mainly for replay
959 void mark_dirty(CachedExtentRef ref
);
961 /// Add dirty extent to dirty list
962 void add_to_dirty(CachedExtentRef ref
);
964 /// Remove from dirty list
965 void remove_from_dirty(CachedExtentRef ref
);
967 /// Remove extent from extents handling dirty and refcounting
968 void remove_extent(CachedExtentRef ref
);
971 void commit_retire_extent(Transaction
& t
, CachedExtentRef ref
);
973 /// Replace prev with next
974 void commit_replace_extent(Transaction
& t
, CachedExtentRef next
, CachedExtentRef prev
);
976 /// Invalidate extent and mark affected transactions
977 void invalidate_extent(Transaction
& t
, CachedExtent
& extent
);
979 /// Mark a valid transaction as conflicted
980 void mark_transaction_conflicted(
981 Transaction
& t
, CachedExtent
& conflicting_extent
);
983 /// Introspect transaction when it is being destructed
984 void on_transaction_destruct(Transaction
& t
);
986 template <typename T
>
987 get_extent_ret
<T
> read_extent(
988 TCachedExtentRef
<T
>&& extent
990 assert(extent
->state
== CachedExtent::extent_state_t::CLEAN_PENDING
);
991 extent
->set_io_wait();
994 extent
->get_length(),
997 [extent
=std::move(extent
)]() mutable {
998 extent
->state
= CachedExtent::extent_state_t::CLEAN
;
999 /* TODO: crc should be checked against LBA manager */
1000 extent
->last_committed_crc
= extent
->get_crc32c();
1002 extent
->on_clean_read();
1003 extent
->complete_io();
1004 return get_extent_ertr::make_ready_future
<TCachedExtentRef
<T
>>(
1007 get_extent_ertr::pass_further
{},
1008 crimson::ct_error::assert_all
{
1009 "Cache::get_extent: invalid error"
1014 // Extents in cache may contain placeholders
1015 CachedExtentRef
query_cache(
1017 const src_ext_t
* p_metric_key
) {
1018 query_counters_t
* p_counters
= nullptr;
1020 p_counters
= &get_by_src(stats
.cache_query_by_src
, p_metric_key
->first
);
1021 ++p_counters
->access
;
1023 if (auto iter
= extents
.find_offset(offset
);
1024 iter
!= extents
.end()) {
1026 // retired_placeholder is not really cached yet
1027 iter
->get_type() != extent_types_t::RETIRED_PLACEHOLDER
) {
1030 return CachedExtentRef(&*iter
);
1032 return CachedExtentRef();
1037 using CacheRef
= std::unique_ptr
<Cache
>;