1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
8 #include <boost/intrusive/list.hpp>
9 #include <boost/intrusive_ptr.hpp>
10 #include <boost/smart_ptr/intrusive_ref_counter.hpp>
12 #include "seastar/core/shared_future.hh"
14 #include "include/buffer.h"
15 #include "crimson/common/errorator.h"
16 #include "crimson/common/interruptible_future.h"
17 #include "crimson/os/seastore/seastore_types.h"
19 struct btree_lba_manager_test
;
21 namespace crimson::os::seastore
{
25 using CachedExtentRef
= boost::intrusive_ptr
<CachedExtent
>;
26 class SegmentedAllocator
;
27 class TransactionManager
;
28 class ExtentPlacementManager
;
33 typename internal_node_t
,
37 bool leaf_has_children
>
39 template <typename
, typename
>
40 class BtreeNodeMapping
;
42 // #define DEBUG_CACHED_EXTENT_REF
43 #ifdef DEBUG_CACHED_EXTENT_REF
45 void intrusive_ptr_add_ref(CachedExtent
*);
46 void intrusive_ptr_release(CachedExtent
*);
51 using TCachedExtentRef
= boost::intrusive_ptr
<T
>;
57 class DummyNodeExtent
;
58 class TestReplayExtent
;
62 class read_set_item_t
{
63 using set_hook_t
= boost::intrusive::set_member_hook
<
64 boost::intrusive::link_mode
<
65 boost::intrusive::auto_unlink
>>;
66 set_hook_t trans_hook
;
67 using set_hook_options
= boost::intrusive::member_hook
<
70 &read_set_item_t::trans_hook
>;
74 using is_transparent
= paddr_t
;
75 bool operator()(const read_set_item_t
<T
> &lhs
, const read_set_item_t
&rhs
) const;
76 bool operator()(const paddr_t
&lhs
, const read_set_item_t
<T
> &rhs
) const;
77 bool operator()(const read_set_item_t
<T
> &lhs
, const paddr_t
&rhs
) const;
82 const read_set_item_t
<Transaction
> &lhs
,
83 const read_set_item_t
<Transaction
> &rhs
) const {
87 const Transaction
*lhs
,
88 const read_set_item_t
<Transaction
> &rhs
) const {
92 const read_set_item_t
<Transaction
> &lhs
,
93 const Transaction
*rhs
) const {
98 using trans_set_t
= boost::intrusive::set
<
101 boost::intrusive::constant_time_size
<false>,
102 boost::intrusive::compare
<trans_cmp_t
>>;
107 read_set_item_t(T
*t
, CachedExtentRef ref
);
108 read_set_item_t(const read_set_item_t
&) = delete;
109 read_set_item_t(read_set_item_t
&&) = default;
110 ~read_set_item_t() = default;
112 template <typename T
>
113 using read_set_t
= std::set
<
115 typename read_set_item_t
<T
>::cmp_t
>;
117 struct trans_spec_view_t
{
118 // if the extent is pending, contains the id of the owning transaction;
119 // TRANS_ID_NULL otherwise
120 transaction_id_t pending_for_transaction
= TRANS_ID_NULL
;
124 const trans_spec_view_t
&lhs
,
125 const trans_spec_view_t
&rhs
) const
127 return lhs
.pending_for_transaction
< rhs
.pending_for_transaction
;
130 const transaction_id_t
&lhs
,
131 const trans_spec_view_t
&rhs
) const
133 return lhs
< rhs
.pending_for_transaction
;
136 const trans_spec_view_t
&lhs
,
137 const transaction_id_t
&rhs
) const
139 return lhs
.pending_for_transaction
< rhs
;
143 using trans_view_hook_t
=
144 boost::intrusive::set_member_hook
<
145 boost::intrusive::link_mode
<
146 boost::intrusive::auto_unlink
>>;
147 trans_view_hook_t trans_view_hook
;
149 using trans_view_member_options
=
150 boost::intrusive::member_hook
<
153 &trans_spec_view_t::trans_view_hook
>;
154 using trans_view_set_t
= boost::intrusive::set
<
156 trans_view_member_options
,
157 boost::intrusive::constant_time_size
<false>,
158 boost::intrusive::compare
<cmp_t
>>;
163 : public boost::intrusive_ref_counter
<
164 CachedExtent
, boost::thread_unsafe_counter
>,
165 public trans_spec_view_t
{
166 enum class extent_state_t
: uint8_t {
167 INITIAL_WRITE_PENDING
, // In Transaction::write_set and fresh_block_list
168 MUTATION_PENDING
, // In Transaction::write_set and mutated_block_list
169 CLEAN_PENDING
, // CLEAN, but not yet read out
170 CLEAN
, // In Cache::extent_index, Transaction::read_set
171 // during write, contents match disk, version == 0
172 DIRTY
, // Same as CLEAN, but contents do not match disk,
174 EXIST_CLEAN
, // Similar to CLEAN, but its metadata not yet
175 // persisted to disk.
176 // In Transaction::write_set and existing_block_list.
177 // After transaction commits, state becomes CLEAN
178 // and add extent to Cache. Modifying such extents
179 // will cause state turn to EXIST_MUTATION_PENDING.
180 EXIST_MUTATION_PENDING
,// Similar to MUTATION_PENDING, but its prior_instance
182 // In Transaction::write_set, existing_block_list and
183 // mutated_block_list. State becomes DIRTY and it is
184 // added to Cache after transaction commits.
185 INVALID
// Part of no ExtentIndex set
186 } state
= extent_state_t::INVALID
;
187 friend std::ostream
&operator<<(std::ostream
&, extent_state_t
);
188 // allow a dummy extent to pretend it is at a specific state
189 friend class onode::DummyNodeExtent
;
190 friend class onode::TestReplayExtent
;
195 typename internal_node_t
,
196 typename leaf_node_t
,
199 bool leaf_has_children
>
200 friend class FixedKVBtree
;
201 uint32_t last_committed_crc
= 0;
203 // Points at current version while in state MUTATION_PENDING
204 CachedExtentRef prior_instance
;
206 // time of the last modification
207 sea_time_point modify_time
= NULL_TIME
;
210 void init(extent_state_t _state
,
212 placement_hint_t hint
,
214 transaction_id_t trans_id
) {
215 assert(gen
== NULL_GENERATION
|| is_rewrite_generation(gen
));
219 rewrite_generation
= gen
;
220 pending_for_transaction
= trans_id
;
223 void set_modify_time(sea_time_point t
) {
227 sea_time_point
get_modify_time() const {
232 * duplicate_for_write
234 * Implementation should return a fresh CachedExtentRef
235 * which represents a copy of *this until on_delta_write()
236 * is complete, at which point the user may assume *this
237 * will be in state INVALID. As such, the implementation
238 * may involve a copy of get_bptr(), or an ancillary
239 * structure which defers updating the actual buffer until
242 virtual CachedExtentRef
duplicate_for_write(Transaction
&t
) = 0;
247 * Called prior to reading buffer.
248 * Implementation may use this callback to fully write out
249 * updates to the buffer.
251 virtual void prepare_write() {}
256 * Called prior to committing the transaction in which this extent
259 virtual void prepare_commit() {}
264 * Called after commit of extent. State will be CLEAN.
265 * Implementation may use this call to fixup the buffer
266 * with the newly available absolute get_paddr().
268 virtual void on_initial_write() {}
273 * Called after read of initially written extent.
274 * State will be CLEAN. Implementation may use this
275 * call to fixup the buffer with the newly available
276 * absolute get_paddr().
278 virtual void on_clean_read() {}
283 * Called after commit of delta. State will be DIRTY.
284 * Implementation may use this call to fixup any relative
285 * references in the buffer with the passed
286 * record_block_offset record location.
288 virtual void on_delta_write(paddr_t record_block_offset
) {}
293 * Called after the extent has replaced a previous one. State
294 * of the extent must be MUTATION_PENDING. Implementation
295 * may use this call to synchronize states that must be synchronized
296 * with the states of Cache and can't wait till transaction
299 virtual void on_replace_prior(Transaction
&t
) {}
304 * Called after the extent is invalidated, either by Cache::invalidate_extent
305 * or Transaction::add_to_retired_set. Implementation may use this
306 * call to adjust states that must be changed immediately once
309 virtual void on_invalidated(Transaction
&t
) {}
313 * Returns concrete type.
315 virtual extent_types_t
get_type() const = 0;
317 virtual bool is_logical() const {
321 virtual bool may_conflict() const {
325 friend std::ostream
&operator<<(std::ostream
&, extent_state_t
);
326 virtual std::ostream
&print_detail(std::ostream
&out
) const { return out
; }
327 std::ostream
&print(std::ostream
&out
) const {
328 std::string prior_poffset_str
= prior_poffset
329 ? fmt::format("{}", *prior_poffset
)
331 out
<< "CachedExtent(addr=" << this
332 << ", type=" << get_type()
333 << ", version=" << version
334 << ", dirty_from_or_retired_at=" << dirty_from_or_retired_at
335 << ", modify_time=" << sea_time_point_printer_t
{modify_time
}
336 << ", paddr=" << get_paddr()
337 << ", prior_paddr=" << prior_poffset_str
338 << ", length=" << get_length()
339 << ", state=" << state
340 << ", last_committed_crc=" << last_committed_crc
341 << ", refcount=" << use_count()
342 << ", user_hint=" << user_hint
343 << ", fully_loaded=" << is_fully_loaded()
344 << ", rewrite_gen=" << rewrite_gen_printer_t
{rewrite_generation
};
345 if (state
!= extent_state_t::INVALID
&&
346 state
!= extent_state_t::CLEAN_PENDING
) {
355 * Must return a valid delta usable in apply_delta() in submit_transaction
356 * if state == MUTATION_PENDING.
358 virtual ceph::bufferlist
get_delta() = 0;
363 * bl is a delta obtained previously from get_delta. The versions will
364 * match. Implementation should mutate buffer based on bl. base matches
365 * the address passed on_delta_write.
367 * Implementation *must* use set_last_committed_crc to update the crc to
368 * what the crc of the buffer would have been at submission. For physical
369 * extents that use base to adjust internal record-relative deltas, this
370 * means that the crc should be of the buffer after applying the delta,
371 * but before that adjustment. We do it this way because the crc in the
372 * commit path does not yet know the record base address.
374 * LogicalCachedExtent overrides this method and provides a simpler
375 * apply_delta override for LogicalCachedExtent implementers.
377 virtual void apply_delta_and_adjust_crc(
378 paddr_t base
, const ceph::bufferlist
&bl
) = 0;
381 * Called on dirty CachedExtent implementation after replay.
382 * Implementation should perform any reads/in-memory-setup
383 * necessary. (for instance, the lba implementation will use this
384 * to load in lba_manager blocks)
386 using complete_load_ertr
= crimson::errorator
<
387 crimson::ct_error::input_output_error
>;
388 virtual complete_load_ertr::future
<> complete_load() {
389 return complete_load_ertr::now();
395 * Returns a TCachedExtentRef of the specified type.
396 * TODO: add dynamic check that the requested type is actually correct.
398 template <typename T
>
399 TCachedExtentRef
<T
> cast() {
400 return TCachedExtentRef
<T
>(static_cast<T
*>(this));
402 template <typename T
>
403 TCachedExtentRef
<const T
> cast() const {
404 return TCachedExtentRef
<const T
>(static_cast<const T
*>(this));
407 /// Returns true if extent can be mutated in an open transaction
408 bool is_mutable() const {
409 return state
== extent_state_t::INITIAL_WRITE_PENDING
||
410 state
== extent_state_t::MUTATION_PENDING
||
411 state
== extent_state_t::EXIST_MUTATION_PENDING
;
414 /// Returns true if extent is part of an open transaction
415 bool is_pending() const {
416 return is_mutable() || state
== extent_state_t::EXIST_CLEAN
;
419 /// Returns true if extent is stable and shared among transactions
420 bool is_stable() const {
421 return state
== extent_state_t::CLEAN_PENDING
||
422 state
== extent_state_t::CLEAN
||
423 state
== extent_state_t::DIRTY
;
426 /// Returns true if extent has a pending delta
427 bool is_mutation_pending() const {
428 return state
== extent_state_t::MUTATION_PENDING
;
431 /// Returns true if extent is a fresh extent
432 bool is_initial_pending() const {
433 return state
== extent_state_t::INITIAL_WRITE_PENDING
;
436 /// Returns true if extent is clean (does not have deltas on disk)
437 bool is_clean() const {
438 ceph_assert(is_valid());
439 return state
== extent_state_t::INITIAL_WRITE_PENDING
||
440 state
== extent_state_t::CLEAN
||
441 state
== extent_state_t::CLEAN_PENDING
||
442 state
== extent_state_t::EXIST_CLEAN
;
445 // Returns true if extent is stable and clean
446 bool is_stable_clean() const {
447 ceph_assert(is_valid());
448 return state
== extent_state_t::CLEAN
||
449 state
== extent_state_t::CLEAN_PENDING
;
452 /// Returns true if data is persisted while metadata isn't
453 bool is_exist_clean() const {
454 return state
== extent_state_t::EXIST_CLEAN
;
457 /// Returns true if the extent with EXIST_CLEAN is modified
458 bool is_exist_mutation_pending() const {
459 return state
== extent_state_t::EXIST_MUTATION_PENDING
;
462 /// Returns true if extent is dirty (has deltas on disk)
463 bool is_dirty() const {
464 ceph_assert(is_valid());
468 /// Returns true if extent has not been superseded or retired
469 bool is_valid() const {
470 return state
!= extent_state_t::INVALID
;
473 /// Returns true if extent or prior_instance has been invalidated
474 bool has_been_invalidated() const {
475 return !is_valid() || (is_mutation_pending() && !prior_instance
->is_valid());
478 /// Returns true if extent is a placeholder
479 bool is_placeholder() const {
480 return get_type() == extent_types_t::RETIRED_PLACEHOLDER
;
483 bool is_pending_io() const {
484 return !!io_wait_promise
;
487 /// Return journal location of oldest relevant delta, only valid while DIRTY
488 auto get_dirty_from() const {
489 ceph_assert(is_dirty());
490 return dirty_from_or_retired_at
;
493 /// Return journal location of oldest relevant delta, only valid while RETIRED
494 auto get_retired_at() const {
495 ceph_assert(!is_valid());
496 return dirty_from_or_retired_at
;
499 /// Return true if extent is fully loaded or is about to be fully loaded (call
500 /// wait_io() in this case)
501 bool is_fully_loaded() const {
502 return ptr
.has_value();
508 * Returns current address of extent. If is_initial_pending(), address will
509 * be relative, otherwise address will be absolute.
511 paddr_t
get_paddr() const { return poffset
; }
513 /// Returns length of extent data in disk
514 extent_len_t
get_length() const {
518 extent_len_t
get_loaded_length() const {
519 if (ptr
.has_value()) {
520 return ptr
->length();
526 /// Returns version, get_version() == 0 iff is_clean()
527 extent_version_t
get_version() const {
531 /// Returns crc32c of buffer
532 uint32_t get_crc32c() {
535 reinterpret_cast<const unsigned char *>(get_bptr().c_str()),
539 /// Get ref to raw buffer
540 bufferptr
&get_bptr() {
541 assert(ptr
.has_value());
544 const bufferptr
&get_bptr() const {
545 assert(ptr
.has_value());
550 friend bool operator< (const CachedExtent
&a
, const CachedExtent
&b
) {
551 return a
.poffset
< b
.poffset
;
553 friend bool operator> (const CachedExtent
&a
, const CachedExtent
&b
) {
554 return a
.poffset
> b
.poffset
;
556 friend bool operator== (const CachedExtent
&a
, const CachedExtent
&b
) {
557 return a
.poffset
== b
.poffset
;
560 virtual ~CachedExtent();
562 placement_hint_t
get_user_hint() const {
566 rewrite_gen_t
get_rewrite_generation() const {
567 return rewrite_generation
;
570 void invalidate_hints() {
571 user_hint
= PLACEMENT_HINT_NULL
;
572 rewrite_generation
= NULL_GENERATION
;
575 /// assign the target rewrite generation for the followup rewrite
576 void set_target_rewrite_generation(rewrite_gen_t gen
) {
577 assert(is_target_rewrite_generation(gen
));
579 user_hint
= placement_hint_t::REWRITE
;
580 rewrite_generation
= gen
;
583 bool is_inline() const {
584 return poffset
.is_relative();
587 paddr_t
get_prior_paddr_and_reset() {
588 assert(prior_poffset
);
589 auto ret
= *prior_poffset
;
590 prior_poffset
.reset();
594 void set_invalid(Transaction
&t
);
596 // a rewrite extent has an invalid prior_instance,
597 // and a mutation_pending extent has a valid prior_instance
598 CachedExtentRef
get_prior_instance() {
599 return prior_instance
;
603 template <typename T
>
604 friend class read_set_item_t
;
606 friend struct paddr_cmp
;
607 friend struct ref_paddr_cmp
;
608 friend class ExtentIndex
;
610 /// Pointer to containing index (or null)
611 ExtentIndex
*parent_index
= nullptr;
613 /// hook for intrusive extent_index
614 boost::intrusive::set_member_hook
<> extent_index_hook
;
615 using index_member_options
= boost::intrusive::member_hook
<
617 boost::intrusive::set_member_hook
<>,
618 &CachedExtent::extent_index_hook
>;
619 using index
= boost::intrusive::set
<CachedExtent
, index_member_options
>;
620 friend class ExtentIndex
;
621 friend class Transaction
;
624 return extent_index_hook
.is_linked();
628 void set_bptr(ceph::bufferptr
&&nptr
) {
632 /// Returns true if the extent part of the open transaction
633 bool is_pending_in_trans(transaction_id_t id
) const {
634 return is_pending() && pending_for_transaction
== id
;
637 /// hook for intrusive ref list (mainly dirty or lru list)
638 boost::intrusive::list_member_hook
<> primary_ref_list_hook
;
639 using primary_ref_list_member_options
= boost::intrusive::member_hook
<
641 boost::intrusive::list_member_hook
<>,
642 &CachedExtent::primary_ref_list_hook
>;
643 using list
= boost::intrusive::list
<
645 primary_ref_list_member_options
>;
648 * dirty_from_or_retired_at
650 * Encodes ordering token for primary_ref_list -- dirty_from when
651 * dirty or retired_at if retired.
653 journal_seq_t dirty_from_or_retired_at
;
655 /// cache data contents, std::nullopt if no data in cache
656 std::optional
<ceph::bufferptr
> ptr
;
661 /// number of deltas since initial write
662 extent_version_t version
= 0;
664 /// address of original block -- record relative iff is_initial_pending()
667 /// relative address before ool write, used to update mapping
668 std::optional
<paddr_t
> prior_poffset
= std::nullopt
;
670 /// used to wait while in-progress commit completes
671 std::optional
<seastar::shared_promise
<>> io_wait_promise
;
673 ceph_assert(!io_wait_promise
);
674 io_wait_promise
= seastar::shared_promise
<>();
677 ceph_assert(io_wait_promise
);
678 io_wait_promise
->set_value();
679 io_wait_promise
= std::nullopt
;
682 seastar::future
<> wait_io() {
683 if (!io_wait_promise
) {
684 return seastar::now();
686 return io_wait_promise
->get_shared_future();
690 CachedExtent
* get_transactional_view(Transaction
&t
);
691 CachedExtent
* get_transactional_view(transaction_id_t tid
);
693 read_set_item_t
<Transaction
>::trans_set_t transactions
;
695 placement_hint_t user_hint
= PLACEMENT_HINT_NULL
;
697 // the target rewrite generation for the followup rewrite
698 // or the rewrite generation for the fresh write
699 rewrite_gen_t rewrite_generation
= NULL_GENERATION
;
702 trans_view_set_t mutation_pendings
;
704 CachedExtent(CachedExtent
&&other
) = delete;
705 CachedExtent(ceph::bufferptr
&&_ptr
) : ptr(std::move(_ptr
)) {
706 length
= ptr
->length();
710 /// construct new CachedExtent, will deep copy the buffer
711 CachedExtent(const CachedExtent
&other
)
712 : state(other
.state
),
713 dirty_from_or_retired_at(other
.dirty_from_or_retired_at
),
714 length(other
.get_length()),
715 version(other
.version
),
716 poffset(other
.poffset
) {
717 assert((length
% CEPH_PAGE_SIZE
) == 0);
718 if (other
.is_fully_loaded()) {
719 ptr
.emplace(buffer::create_page_aligned(length
));
720 other
.ptr
->copy_out(0, length
, ptr
->c_str());
722 // the extent must be fully loaded before CoW
723 assert(length
== 0); // in case of root
727 struct share_buffer_t
{};
728 /// construct new CachedExtent, will shallow copy the buffer
729 CachedExtent(const CachedExtent
&other
, share_buffer_t
)
730 : state(other
.state
),
731 dirty_from_or_retired_at(other
.dirty_from_or_retired_at
),
733 length(other
.get_length()),
734 version(other
.version
),
735 poffset(other
.poffset
) {}
737 // 0 length is only possible for the RootBlock
738 struct zero_length_t
{};
739 CachedExtent(zero_length_t
) : ptr(ceph::bufferptr(0)), length(0) {};
741 struct retired_placeholder_t
{};
742 CachedExtent(retired_placeholder_t
, extent_len_t _length
)
743 : state(extent_state_t::INVALID
),
748 /// no buffer extent, for lazy read
749 CachedExtent(extent_len_t _length
) : length(_length
) {
754 template <typename T
, typename
... Args
>
755 static TCachedExtentRef
<T
> make_cached_extent_ref(
757 return new T(std::forward
<Args
>(args
)...);
760 template <typename T
>
761 static TCachedExtentRef
<T
> make_placeholder_cached_extent_ref(
762 extent_len_t length
) {
763 return new T(length
);
766 void reset_prior_instance() {
767 prior_instance
.reset();
770 /// Sets last_committed_crc
771 void set_last_committed_crc(uint32_t crc
) {
772 last_committed_crc
= crc
;
775 void set_paddr(paddr_t offset
, bool need_update_mapping
= false) {
776 if (need_update_mapping
) {
777 assert(!prior_poffset
);
778 prior_poffset
= poffset
;
784 * maybe_generate_relative
786 * There are three kinds of addresses one might want to
787 * store within an extent:
788 * - addr for a block within the same transaction relative to the
789 * physical location of this extent in the
790 * event that we will read it in the initial read of the extent
791 * - addr relative to the physical location of the next record to a
792 * block within that record to contain a delta for this extent in
793 * the event that we'll read it from a delta and overlay it onto a
794 * dirty representation of the extent.
795 * - absolute addr to a block already written outside of the current
798 * This helper checks addr and the current state to create the correct
801 paddr_t
maybe_generate_relative(paddr_t addr
) {
802 if (is_initial_pending() && addr
.is_record_relative()) {
803 return addr
.block_relative_to(get_paddr());
805 ceph_assert(!addr
.is_record_relative() || is_mutation_pending());
810 friend class crimson::os::seastore::SegmentedAllocator
;
811 friend class crimson::os::seastore::TransactionManager
;
812 friend class crimson::os::seastore::ExtentPlacementManager
;
813 template <typename
, typename
>
814 friend class BtreeNodeMapping
;
815 friend class ::btree_lba_manager_test
;
818 std::ostream
&operator<<(std::ostream
&, CachedExtent::extent_state_t
);
819 std::ostream
&operator<<(std::ostream
&, const CachedExtent
&);
821 bool is_backref_mapped_extent_node(const CachedExtentRef
&extent
);
823 /// Compare extents by paddr
825 bool operator()(paddr_t lhs
, const CachedExtent
&rhs
) const {
826 return lhs
< rhs
.poffset
;
828 bool operator()(const CachedExtent
&lhs
, paddr_t rhs
) const {
829 return lhs
.poffset
< rhs
;
833 /// Compare extent refs by paddr
834 struct ref_paddr_cmp
{
835 using is_transparent
= paddr_t
;
836 bool operator()(const CachedExtentRef
&lhs
, const CachedExtentRef
&rhs
) const {
837 return lhs
->poffset
< rhs
->poffset
;
839 bool operator()(const paddr_t
&lhs
, const CachedExtentRef
&rhs
) const {
840 return lhs
< rhs
->poffset
;
842 bool operator()(const CachedExtentRef
&lhs
, const paddr_t
&rhs
) const {
843 return lhs
->poffset
< rhs
;
847 template <typename T
, typename C
>
/// Ordered list of (address, extent) pairs; T is the address key type
/// (e.g. paddr_t/laddr_t), C the extent reference type.
848 class addr_extent_list_base_t
849 : public std::list
<std::pair
<T
, C
>> {};
851 using pextent_list_t
= addr_extent_list_base_t
<paddr_t
, CachedExtentRef
>;
853 template <typename T
, typename C
, typename Cmp
>
/// Set of extent references C ordered by comparator Cmp; T names the
/// address key type used by transparent comparators for lookup.
854 class addr_extent_set_base_t
855 : public std::set
<C
, Cmp
> {};
857 using pextent_set_t
= addr_extent_set_base_t
<
863 template <typename T
>
864 using t_pextent_list_t
= addr_extent_list_base_t
<paddr_t
, TCachedExtentRef
<T
>>;
869 * Index of CachedExtent & by poffset, does not hold a reference,
870 * user must ensure each extent is removed prior to deletion
874 CachedExtent::index extent_index
;
876 auto get_overlap(paddr_t addr
, extent_len_t len
) {
877 auto bottom
= extent_index
.upper_bound(addr
, paddr_cmp());
878 if (bottom
!= extent_index
.begin())
880 if (bottom
!= extent_index
.end() &&
881 bottom
->get_paddr().add_offset(bottom
->get_length()) <= addr
)
884 auto top
= extent_index
.lower_bound(addr
.add_offset(len
), paddr_cmp());
885 return std::make_pair(
892 struct cached_extent_disposer
{
893 void operator() (CachedExtent
* extent
) {
894 extent
->parent_index
= nullptr;
897 extent_index
.clear_and_dispose(cached_extent_disposer());
901 void insert(CachedExtent
&extent
) {
903 ceph_assert(!extent
.parent_index
);
904 auto [a
, b
] = get_overlap(
906 extent
.get_length());
909 [[maybe_unused
]] auto [iter
, inserted
] = extent_index
.insert(extent
);
911 extent
.parent_index
= this;
913 bytes
+= extent
.get_length();
916 void erase(CachedExtent
&extent
) {
917 assert(extent
.parent_index
);
918 assert(extent
.is_linked());
919 [[maybe_unused
]] auto erased
= extent_index
.erase(
920 extent_index
.s_iterator_to(extent
));
921 extent
.parent_index
= nullptr;
924 bytes
-= extent
.get_length();
927 void replace(CachedExtent
&to
, CachedExtent
&from
) {
928 assert(to
.get_length() == from
.get_length());
929 extent_index
.replace_node(extent_index
.s_iterator_to(from
), to
);
930 from
.parent_index
= nullptr;
931 to
.parent_index
= this;
935 return extent_index
.empty();
938 auto find_offset(paddr_t offset
) {
939 return extent_index
.find(offset
, paddr_cmp());
943 return extent_index
.begin();
947 return extent_index
.end();
951 return extent_index
.size();
954 auto get_bytes() const {
959 assert(extent_index
.empty());
967 class ChildableCachedExtent
;
968 class LogicalCachedExtent
;
972 child_pos_t(CachedExtentRef stable_parent
, uint16_t pos
)
973 : stable_parent(stable_parent
), pos(pos
) {}
975 template <typename parent_t
>
976 TCachedExtentRef
<parent_t
> get_parent() {
977 ceph_assert(stable_parent
);
978 return stable_parent
->template cast
<parent_t
>();
983 void link_child(ChildableCachedExtent
*c
);
985 CachedExtentRef stable_parent
;
986 uint16_t pos
= std::numeric_limits
<uint16_t>::max();
989 using get_child_ertr
= crimson::errorator
<
990 crimson::ct_error::input_output_error
>;
991 template <typename T
>
992 struct get_child_ret_t
{
993 std::variant
<child_pos_t
, get_child_ertr::future
<TCachedExtentRef
<T
>>> ret
;
994 get_child_ret_t(child_pos_t pos
)
995 : ret(std::move(pos
)) {}
996 get_child_ret_t(get_child_ertr::future
<TCachedExtentRef
<T
>> child
)
997 : ret(std::move(child
)) {}
999 bool has_child() const {
1000 return ret
.index() == 1;
1003 child_pos_t
&get_child_pos() {
1004 ceph_assert(ret
.index() == 0);
1005 return std::get
<0>(ret
);
1008 get_child_ertr::future
<TCachedExtentRef
<T
>> &get_child_fut() {
1009 ceph_assert(ret
.index() == 1);
1010 return std::get
<1>(ret
);
1014 template <typename key_t
, typename
>
1015 class PhysicalNodeMapping
;
1017 template <typename key_t
, typename val_t
>
1018 using PhysicalNodeMappingRef
= std::unique_ptr
<PhysicalNodeMapping
<key_t
, val_t
>>;
1020 template <typename key_t
, typename val_t
>
1021 class PhysicalNodeMapping
{
1023 virtual extent_len_t
get_length() const = 0;
1024 virtual extent_types_t
get_type() const = 0;
1025 virtual val_t
get_val() const = 0;
1026 virtual key_t
get_key() const = 0;
1027 virtual PhysicalNodeMappingRef
<key_t
, val_t
> duplicate() const = 0;
1028 virtual bool has_been_invalidated() const = 0;
1029 virtual CachedExtentRef
get_parent() const = 0;
1030 virtual uint16_t get_pos() const = 0;
1031 // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h
1032 virtual bool is_indirect() const { return false; }
1033 virtual key_t
get_intermediate_key() const { return min_max_t
<key_t
>::null
; }
1034 virtual key_t
get_intermediate_base() const { return min_max_t
<key_t
>::null
; }
1035 virtual extent_len_t
get_intermediate_length() const { return 0; }
1036 // The start offset of the pin, must be 0 if the pin is not indirect
1037 virtual extent_len_t
get_intermediate_offset() const {
1038 return std::numeric_limits
<extent_len_t
>::max();
1041 virtual get_child_ret_t
<LogicalCachedExtent
>
1042 get_logical_extent(Transaction
&t
) = 0;
1044 void link_child(ChildableCachedExtent
*c
) {
1045 ceph_assert(child_pos
);
1046 child_pos
->link_child(c
);
1049 virtual ~PhysicalNodeMapping() {}
1051 std::optional
<child_pos_t
> child_pos
= std::nullopt
;
1054 using LBAMapping
= PhysicalNodeMapping
<laddr_t
, paddr_t
>;
1055 using LBAMappingRef
= PhysicalNodeMappingRef
<laddr_t
, paddr_t
>;
1057 std::ostream
&operator<<(std::ostream
&out
, const LBAMapping
&rhs
);
1059 using lba_pin_list_t
= std::list
<LBAMappingRef
>;
1061 std::ostream
&operator<<(std::ostream
&out
, const lba_pin_list_t
&rhs
);
1063 using BackrefMapping
= PhysicalNodeMapping
<paddr_t
, laddr_t
>;
1064 using BackrefMappingRef
= PhysicalNodeMappingRef
<paddr_t
, laddr_t
>;
1066 using backref_pin_list_t
= std::list
<BackrefMappingRef
>;
1069 * RetiredExtentPlaceholder
1071 * Cache::retire_extent_addr(Transaction&, paddr_t, extent_len_t) can retire an
1072 * extent not currently in cache. In that case, in order to detect transaction
1073 * invalidation, we need to add a placeholder to the cache to create the
1074 * mapping back to the transaction. And whenever there is a transaction tries
1075 * to read the placeholder extent out, Cache is responsible to replace the
1076 * placeholder by the real one. Anyway, No placeholder extents should escape
1077 * the Cache interface boundary.
1079 class RetiredExtentPlaceholder
: public CachedExtent
{
1082 RetiredExtentPlaceholder(extent_len_t length
)
1083 : CachedExtent(CachedExtent::retired_placeholder_t
{}, length
) {}
1085 CachedExtentRef
duplicate_for_write(Transaction
&) final
{
1086 ceph_assert(0 == "Should never happen for a placeholder");
1087 return CachedExtentRef();
1090 ceph::bufferlist
get_delta() final
{
1091 ceph_assert(0 == "Should never happen for a placeholder");
1092 return ceph::bufferlist();
1095 static constexpr extent_types_t TYPE
= extent_types_t::RETIRED_PLACEHOLDER
;
1096 extent_types_t
get_type() const final
{
1100 void apply_delta_and_adjust_crc(
1101 paddr_t base
, const ceph::bufferlist
&bl
) final
{
1102 ceph_assert(0 == "Should never happen for a placeholder");
1105 bool is_logical() const final
{
1109 std::ostream
&print_detail(std::ostream
&out
) const final
{
1110 return out
<< ", RetiredExtentPlaceholder";
1113 void on_delta_write(paddr_t record_block_offset
) final
{
1114 ceph_assert(0 == "Should never happen for a placeholder");
1118 class parent_tracker_t
1119 : public boost::intrusive_ref_counter
<
1120 parent_tracker_t
, boost::thread_unsafe_counter
> {
1122 parent_tracker_t(CachedExtentRef parent
)
1124 parent_tracker_t(CachedExtent
* parent
)
1126 ~parent_tracker_t();
1127 template <typename T
= CachedExtent
>
1128 TCachedExtentRef
<T
> get_parent() const {
1129 ceph_assert(parent
);
1130 if constexpr (std::is_same_v
<T
, CachedExtent
>) {
1133 return parent
->template cast
<T
>();
1136 void reset_parent(CachedExtentRef p
) {
1139 bool is_valid() const {
1140 return parent
&& parent
->is_valid();
1143 CachedExtentRef parent
;
1146 std::ostream
&operator<<(std::ostream
&, const parent_tracker_t
&);
1148 using parent_tracker_ref
= boost::intrusive_ptr
<parent_tracker_t
>;
1150 class ChildableCachedExtent
: public CachedExtent
{
1152 template <typename
... T
>
1153 ChildableCachedExtent(T
&&... t
) : CachedExtent(std::forward
<T
>(t
)...) {}
1154 bool has_parent_tracker() const {
1155 return (bool)parent_tracker
;
1157 void reset_parent_tracker(parent_tracker_t
*p
= nullptr) {
1158 parent_tracker
.reset(p
);
1160 bool is_parent_valid() const {
1161 return parent_tracker
&& parent_tracker
->is_valid();
1163 template <typename T
= CachedExtent
>
1164 TCachedExtentRef
<T
> get_parent_node() const {
1165 assert(parent_tracker
);
1166 return parent_tracker
->template get_parent
<T
>();
1168 void take_prior_parent_tracker() {
1169 auto &prior
= (ChildableCachedExtent
&)(*get_prior_instance());
1170 parent_tracker
= prior
.parent_tracker
;
1172 std::ostream
&print_detail(std::ostream
&out
) const final
;
1174 parent_tracker_ref parent_tracker
;
1175 virtual std::ostream
&_print_detail(std::ostream
&out
) const {
1180 * LogicalCachedExtent
1182 * CachedExtent with associated lba mapping.
1184 * Users of TransactionManager should be using extents derived from
1185 * LogicalCachedExtent.
1187 class LogicalCachedExtent
: public ChildableCachedExtent
{
1189 template <typename
... T
>
1190 LogicalCachedExtent(T
&&... t
)
1191 : ChildableCachedExtent(std::forward
<T
>(t
)...)
1194 bool has_laddr() const {
1195 return laddr
!= L_ADDR_NULL
;
1198 laddr_t
get_laddr() const {
1199 assert(laddr
!= L_ADDR_NULL
);
1203 void set_laddr(laddr_t nladdr
) {
1207 void maybe_set_intermediate_laddr(LBAMapping
&mapping
) {
1208 laddr
= mapping
.is_indirect()
1209 ? mapping
.get_intermediate_base()
1210 : mapping
.get_key();
1213 void apply_delta_and_adjust_crc(
1214 paddr_t base
, const ceph::bufferlist
&bl
) final
{
1216 set_last_committed_crc(get_crc32c());
1219 bool is_logical() const final
{
1223 std::ostream
&_print_detail(std::ostream
&out
) const final
;
1225 void on_replace_prior(Transaction
&t
) final
;
1227 virtual ~LogicalCachedExtent();
1230 virtual void apply_delta(const ceph::bufferlist
&bl
) = 0;
1231 virtual std::ostream
&print_detail_l(std::ostream
&out
) const {
1235 virtual void logical_on_delta_write() {}
1237 void on_delta_write(paddr_t record_block_offset
) final
{
1238 assert(is_exist_mutation_pending() ||
1239 get_prior_instance());
1240 logical_on_delta_write();
1244 // the logical address of the extent, and if shared,
1245 // it is the intermediate_base, see BtreeLBAMapping comments.
1246 laddr_t laddr
= L_ADDR_NULL
;
1249 using LogicalCachedExtentRef
= TCachedExtentRef
<LogicalCachedExtent
>;
1250 struct ref_laddr_cmp
{
1251 using is_transparent
= laddr_t
;
1252 bool operator()(const LogicalCachedExtentRef
&lhs
,
1253 const LogicalCachedExtentRef
&rhs
) const {
1254 return lhs
->get_laddr() < rhs
->get_laddr();
1256 bool operator()(const laddr_t
&lhs
,
1257 const LogicalCachedExtentRef
&rhs
) const {
1258 return lhs
< rhs
->get_laddr();
1260 bool operator()(const LogicalCachedExtentRef
&lhs
,
1261 const laddr_t
&rhs
) const {
1262 return lhs
->get_laddr() < rhs
;
1266 template <typename T
>
1267 read_set_item_t
<T
>::read_set_item_t(T
*t
, CachedExtentRef ref
)
1271 template <typename T
>
1272 inline bool read_set_item_t
<T
>::cmp_t::operator()(
1273 const read_set_item_t
<T
> &lhs
, const read_set_item_t
<T
> &rhs
) const {
1274 return lhs
.ref
->poffset
< rhs
.ref
->poffset
;
1276 template <typename T
>
1277 inline bool read_set_item_t
<T
>::cmp_t::operator()(
1278 const paddr_t
&lhs
, const read_set_item_t
<T
> &rhs
) const {
1279 return lhs
< rhs
.ref
->poffset
;
1281 template <typename T
>
1282 inline bool read_set_item_t
<T
>::cmp_t::operator()(
1283 const read_set_item_t
<T
> &lhs
, const paddr_t
&rhs
) const {
1284 return lhs
.ref
->poffset
< rhs
;
1287 using lextent_set_t
= addr_extent_set_base_t
<
1289 LogicalCachedExtentRef
,
1293 template <typename T
>
1294 using lextent_list_t
= addr_extent_list_base_t
<
1295 laddr_t
, TCachedExtentRef
<T
>>;
1299 #if FMT_VERSION >= 90000
1300 template <> struct fmt::formatter
<crimson::os::seastore::lba_pin_list_t
> : fmt::ostream_formatter
{};
1301 template <> struct fmt::formatter
<crimson::os::seastore::CachedExtent
> : fmt::ostream_formatter
{};
1302 template <> struct fmt::formatter
<crimson::os::seastore::LogicalCachedExtent
> : fmt::ostream_formatter
{};
1303 template <> struct fmt::formatter
<crimson::os::seastore::LBAMapping
> : fmt::ostream_formatter
{};