// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#pragma once

#include <iostream>

#include <boost/intrusive/list.hpp>
#include <boost/intrusive_ptr.hpp>
#include <boost/smart_ptr/intrusive_ref_counter.hpp>

#include "seastar/core/shared_future.hh"

#include "include/buffer.h"
#include "crimson/common/errorator.h"
#include "crimson/common/interruptible_future.h"
#include "crimson/os/seastore/seastore_types.h"

struct btree_lba_manager_test;

namespace crimson::os::seastore {

class Transaction;
class CachedExtent;
using CachedExtentRef = boost::intrusive_ptr<CachedExtent>;
class SegmentedAllocator;
class TransactionManager;
class ExtentPlacementManager;

template <
  typename node_key_t,
  typename node_val_t,
  typename internal_node_t,
  typename leaf_node_t,
  typename pin_t,
  size_t node_size,
  bool leaf_has_children>
class FixedKVBtree;
template <typename, typename>
class BtreeNodeMapping;

// #define DEBUG_CACHED_EXTENT_REF
#ifdef DEBUG_CACHED_EXTENT_REF

void intrusive_ptr_add_ref(CachedExtent *);
void intrusive_ptr_release(CachedExtent *);

#endif

template <typename T>
using TCachedExtentRef = boost::intrusive_ptr<T>;

/**
 * CachedExtent
 */
namespace onode {
  class DummyNodeExtent;
  class TestReplayExtent;
}

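/**
 * read_set_item_t
 *
 * Links a transaction T to an extent in its read set.  The intrusive
 * trans_hook additionally allows an extent to enumerate the transactions
 * currently reading it (see CachedExtent::transactions below).
 */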
template <typename T>
class read_set_item_t {
  using set_hook_t = boost::intrusive::set_member_hook<
    boost::intrusive::link_mode<
      boost::intrusive::auto_unlink>>;
  set_hook_t trans_hook;
  using set_hook_options = boost::intrusive::member_hook<
    read_set_item_t,
    set_hook_t,
    &read_set_item_t::trans_hook>;

public:
  struct cmp_t {
    using is_transparent = paddr_t;
    bool operator()(const read_set_item_t<T> &lhs, const read_set_item_t &rhs) const;
    bool operator()(const paddr_t &lhs, const read_set_item_t<T> &rhs) const;
    bool operator()(const read_set_item_t<T> &lhs, const paddr_t &rhs) const;
  };

  struct trans_cmp_t {
    bool operator()(
      const read_set_item_t<Transaction> &lhs,
      const read_set_item_t<Transaction> &rhs) const {
      return lhs.t < rhs.t;
    }
    bool operator()(
      const Transaction *lhs,
      const read_set_item_t<Transaction> &rhs) const {
      return lhs < rhs.t;
    }
    bool operator()(
      const read_set_item_t<Transaction> &lhs,
      const Transaction *rhs) const {
      return lhs.t < rhs;
    }
  };

  using trans_set_t = boost::intrusive::set<
    read_set_item_t,
    set_hook_options,
    boost::intrusive::constant_time_size<false>,
    boost::intrusive::compare<trans_cmp_t>>;

  T *t = nullptr;
  CachedExtentRef ref;

  read_set_item_t(T *t, CachedExtentRef ref);
  read_set_item_t(const read_set_item_t &) = delete;
  read_set_item_t(read_set_item_t &&) = default;
  ~read_set_item_t() = default;
};
template <typename T>
using read_set_t = std::set<
  read_set_item_t<T>,
  typename read_set_item_t<T>::cmp_t>;

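/**
 * trans_spec_view_t
 *
 * Base class for objects that may be pending within a transaction: records
 * the id of the owning transaction and provides the intrusive hook/set types
 * used to index such views by transaction (see CachedExtent::mutation_pendings).
 */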
struct trans_spec_view_t {
  // if the extent is pending, contains the id of the owning transaction;
  // TRANS_ID_NULL otherwise
  transaction_id_t pending_for_transaction = TRANS_ID_NULL;

  struct cmp_t {
    bool operator()(
      const trans_spec_view_t &lhs,
      const trans_spec_view_t &rhs) const
    {
      return lhs.pending_for_transaction < rhs.pending_for_transaction;
    }
    bool operator()(
      const transaction_id_t &lhs,
      const trans_spec_view_t &rhs) const
    {
      return lhs < rhs.pending_for_transaction;
    }
    bool operator()(
      const trans_spec_view_t &lhs,
      const transaction_id_t &rhs) const
    {
      return lhs.pending_for_transaction < rhs;
    }
  };

  using trans_view_hook_t =
    boost::intrusive::set_member_hook<
      boost::intrusive::link_mode<
        boost::intrusive::auto_unlink>>;
  trans_view_hook_t trans_view_hook;

  using trans_view_member_options =
    boost::intrusive::member_hook<
      trans_spec_view_t,
      trans_view_hook_t,
      &trans_spec_view_t::trans_view_hook>;
  using trans_view_set_t = boost::intrusive::set<
    trans_spec_view_t,
    trans_view_member_options,
    boost::intrusive::constant_time_size<false>,
    boost::intrusive::compare<cmp_t>>;
};

class ExtentIndex;
class CachedExtent
  : public boost::intrusive_ref_counter<
      CachedExtent, boost::thread_unsafe_counter>,
    public trans_spec_view_t {
  enum class extent_state_t : uint8_t {
    INITIAL_WRITE_PENDING, // In Transaction::write_set and fresh_block_list
    MUTATION_PENDING,      // In Transaction::write_set and mutated_block_list
    CLEAN_PENDING,         // CLEAN, but not yet read out
    CLEAN,                 // In Cache::extent_index, Transaction::read_set
                           //  during write, contents match disk, version == 0
    DIRTY,                 // Same as CLEAN, but contents do not match disk,
                           //  version > 0
    EXIST_CLEAN,           // Similar to CLEAN, but its metadata is not yet
                           //  persisted to disk.
                           //  In Transaction::write_set and existing_block_list.
                           //  After the transaction commits, the state becomes
                           //  CLEAN and the extent is added to Cache.
                           //  Modifying such an extent turns its state to
                           //  EXIST_MUTATION_PENDING.
    EXIST_MUTATION_PENDING,// Similar to MUTATION_PENDING, but its prior_instance
                           //  is empty.
                           //  In Transaction::write_set, existing_block_list and
                           //  mutated_block_list. The state becomes DIRTY and the
                           //  extent is added to Cache after the transaction
                           //  commits.
    INVALID                // Part of no ExtentIndex set
  } state = extent_state_t::INVALID;
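  // A brief summary of the lifecycle implied by the states above and the
  // on_*() hooks below: INITIAL_WRITE_PENDING becomes CLEAN once the owning
  // transaction commits (on_initial_write), MUTATION_PENDING becomes DIRTY
  // once its delta commits (on_delta_write), EXIST_CLEAN becomes CLEAN and
  // EXIST_MUTATION_PENDING becomes DIRTY after commit, and any extent that
  // has been superseded or retired ends up INVALID.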
  friend std::ostream &operator<<(std::ostream &, extent_state_t);
  // allow a dummy extent to pretend it is in a specific state
  friend class onode::DummyNodeExtent;
  friend class onode::TestReplayExtent;

  template <
    typename node_key_t,
    typename node_val_t,
    typename internal_node_t,
    typename leaf_node_t,
    typename pin_t,
    size_t node_size,
    bool leaf_has_children>
  friend class FixedKVBtree;
  uint32_t last_committed_crc = 0;

  // Points at the current version while in state MUTATION_PENDING
  CachedExtentRef prior_instance;

  // time of the last modification
  sea_time_point modify_time = NULL_TIME;

public:
  void init(extent_state_t _state,
            paddr_t paddr,
            placement_hint_t hint,
            rewrite_gen_t gen,
            transaction_id_t trans_id) {
    assert(gen == NULL_GENERATION || is_rewrite_generation(gen));
    state = _state;
    set_paddr(paddr);
    user_hint = hint;
    rewrite_generation = gen;
    pending_for_transaction = trans_id;
  }

  void set_modify_time(sea_time_point t) {
    modify_time = t;
  }

  sea_time_point get_modify_time() const {
    return modify_time;
  }

  /**
   * duplicate_for_write
   *
   * Implementation should return a fresh CachedExtentRef
   * which represents a copy of *this until on_delta_write()
   * is complete, at which point the user may assume *this
   * will be in state INVALID.  As such, the implementation
   * may involve a copy of get_bptr(), or an ancillary
   * structure which defers updating the actual buffer until
   * on_delta_write().
   */
  virtual CachedExtentRef duplicate_for_write(Transaction &t) = 0;

  /**
   * prepare_write
   *
   * Called prior to reading buffer.
   * Implementation may use this callback to fully write out
   * updates to the buffer.
   */
  virtual void prepare_write() {}

  /**
   * prepare_commit
   *
   * Called prior to committing the transaction in which this extent
   * is living.
   */
  virtual void prepare_commit() {}

  /**
   * on_initial_write
   *
   * Called after commit of extent.  State will be CLEAN.
   * Implementation may use this call to fix up the buffer
   * with the newly available absolute get_paddr().
   */
  virtual void on_initial_write() {}

  /**
   * on_clean_read
   *
   * Called after read of initially written extent.
   * State will be CLEAN.  Implementation may use this
   * call to fix up the buffer with the newly available
   * absolute get_paddr().
   */
  virtual void on_clean_read() {}

  /**
   * on_delta_write
   *
   * Called after commit of delta.  State will be DIRTY.
   * Implementation may use this call to fix up any relative
   * references in the buffer with the passed
   * record_block_offset record location.
   */
  virtual void on_delta_write(paddr_t record_block_offset) {}

  /**
   * on_replace_prior
   *
   * Called after the extent has replaced a previous one.  The state
   * of the extent must be MUTATION_PENDING.  Implementation may use
   * this call to synchronize states that must be kept in sync with
   * the state of Cache and cannot wait until the transaction
   * completes.
   */
  virtual void on_replace_prior(Transaction &t) {}

  /**
   * on_invalidated
   *
   * Called after the extent is invalidated, either by Cache::invalidate_extent
   * or Transaction::add_to_retired_set.  Implementation may use this
   * call to adjust states that must be changed immediately once
   * invalidated.
   */
  virtual void on_invalidated(Transaction &t) {}

  /**
   * get_type
   *
   * Returns concrete type.
   */
  virtual extent_types_t get_type() const = 0;

  virtual bool is_logical() const {
    return false;
  }

  virtual bool may_conflict() const {
    return true;
  }

  friend std::ostream &operator<<(std::ostream &, extent_state_t);
  virtual std::ostream &print_detail(std::ostream &out) const { return out; }
  std::ostream &print(std::ostream &out) const {
    std::string prior_poffset_str = prior_poffset
      ? fmt::format("{}", *prior_poffset)
      : "nullopt";
    out << "CachedExtent(addr=" << this
        << ", type=" << get_type()
        << ", version=" << version
        << ", dirty_from_or_retired_at=" << dirty_from_or_retired_at
        << ", modify_time=" << sea_time_point_printer_t{modify_time}
        << ", paddr=" << get_paddr()
        << ", prior_paddr=" << prior_poffset_str
        << ", length=" << get_length()
        << ", state=" << state
        << ", last_committed_crc=" << last_committed_crc
        << ", refcount=" << use_count()
        << ", user_hint=" << user_hint
        << ", fully_loaded=" << is_fully_loaded()
        << ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation};
    if (state != extent_state_t::INVALID &&
        state != extent_state_t::CLEAN_PENDING) {
      print_detail(out);
    }
    return out << ")";
  }

  /**
   * get_delta
   *
   * Must return a valid delta usable in apply_delta() in submit_transaction
   * if state == MUTATION_PENDING.
   */
  virtual ceph::bufferlist get_delta() = 0;

  /**
   * apply_delta
   *
   * bl is a delta obtained previously from get_delta.  The versions will
   * match.  Implementation should mutate the buffer based on bl.  base
   * matches the address passed to on_delta_write().
   *
   * Implementation *must* use set_last_committed_crc to update the crc to
   * what the crc of the buffer would have been at submission.  For physical
   * extents that use base to adjust internal record-relative deltas, this
   * means that the crc should be of the buffer after applying the delta,
   * but before that adjustment.  We do it this way because the crc in the
   * commit path does not yet know the record base address.
   *
   * LogicalCachedExtent overrides this method and provides a simpler
   * apply_delta override for LogicalCachedExtent implementers.
   */
  virtual void apply_delta_and_adjust_crc(
    paddr_t base, const ceph::bufferlist &bl) = 0;

  /**
   * Called on dirty CachedExtent implementation after replay.
   * Implementation should perform any reads/in-memory-setup
   * necessary.  (for instance, the lba implementation will use this
   * to load in lba_manager blocks)
   */
  using complete_load_ertr = crimson::errorator<
    crimson::ct_error::input_output_error>;
  virtual complete_load_ertr::future<> complete_load() {
    return complete_load_ertr::now();
  }

  /**
   * cast
   *
   * Returns a TCachedExtentRef of the specified type.
   * TODO: add dynamic check that the requested type is actually correct.
   */
  template <typename T>
  TCachedExtentRef<T> cast() {
    return TCachedExtentRef<T>(static_cast<T*>(this));
  }
  template <typename T>
  TCachedExtentRef<const T> cast() const {
    return TCachedExtentRef<const T>(static_cast<const T*>(this));
  }
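
  // Illustrative usage (sketch only; `MyNodeExtent` is a hypothetical
  // concrete extent type, not part of this header):
  //   CachedExtentRef ext = ...;
  //   TCachedExtentRef<MyNodeExtent> node = ext->cast<MyNodeExtent>();
  // Note that cast() performs a static_cast, so the caller must already know
  // the concrete type (see the TODO above about adding a dynamic check).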

  /// Returns true if extent can be mutated in an open transaction
  bool is_mutable() const {
    return state == extent_state_t::INITIAL_WRITE_PENDING ||
           state == extent_state_t::MUTATION_PENDING ||
           state == extent_state_t::EXIST_MUTATION_PENDING;
  }

  /// Returns true if extent is part of an open transaction
  bool is_pending() const {
    return is_mutable() || state == extent_state_t::EXIST_CLEAN;
  }

  /// Returns true if extent is stable and shared among transactions
  bool is_stable() const {
    return state == extent_state_t::CLEAN_PENDING ||
           state == extent_state_t::CLEAN ||
           state == extent_state_t::DIRTY;
  }

  /// Returns true if extent has a pending delta
  bool is_mutation_pending() const {
    return state == extent_state_t::MUTATION_PENDING;
  }

  /// Returns true if extent is a fresh extent
  bool is_initial_pending() const {
    return state == extent_state_t::INITIAL_WRITE_PENDING;
  }

  /// Returns true if extent is clean (does not have deltas on disk)
  bool is_clean() const {
    ceph_assert(is_valid());
    return state == extent_state_t::INITIAL_WRITE_PENDING ||
           state == extent_state_t::CLEAN ||
           state == extent_state_t::CLEAN_PENDING ||
           state == extent_state_t::EXIST_CLEAN;
  }

  /// Returns true if extent is stable and clean
  bool is_stable_clean() const {
    ceph_assert(is_valid());
    return state == extent_state_t::CLEAN ||
           state == extent_state_t::CLEAN_PENDING;
  }

  /// Returns true if data is persisted while metadata isn't
  bool is_exist_clean() const {
    return state == extent_state_t::EXIST_CLEAN;
  }

  /// Returns true if an extent in state EXIST_CLEAN has been modified
  bool is_exist_mutation_pending() const {
    return state == extent_state_t::EXIST_MUTATION_PENDING;
  }

  /// Returns true if extent is dirty (has deltas on disk)
  bool is_dirty() const {
    ceph_assert(is_valid());
    return !is_clean();
  }

  /// Returns true if extent has not been superseded or retired
  bool is_valid() const {
    return state != extent_state_t::INVALID;
  }

  /// Returns true if extent or prior_instance has been invalidated
  bool has_been_invalidated() const {
    return !is_valid() || (is_mutation_pending() && !prior_instance->is_valid());
  }

  /// Returns true if extent is a placeholder
  bool is_placeholder() const {
    return get_type() == extent_types_t::RETIRED_PLACEHOLDER;
  }

  bool is_pending_io() const {
    return !!io_wait_promise;
  }

  /// Return journal location of oldest relevant delta, only valid while DIRTY
  auto get_dirty_from() const {
    ceph_assert(is_dirty());
    return dirty_from_or_retired_at;
  }

  /// Return journal location of oldest relevant delta, only valid while RETIRED
  auto get_retired_at() const {
    ceph_assert(!is_valid());
    return dirty_from_or_retired_at;
  }

  /// Return true if extent is fully loaded or is about to be fully loaded (call
  /// wait_io() in this case)
  bool is_fully_loaded() const {
    return ptr.has_value();
  }

  /**
   * get_paddr
   *
   * Returns current address of extent.  If is_initial_pending(), address will
   * be relative, otherwise address will be absolute.
   */
  paddr_t get_paddr() const { return poffset; }

  /// Returns length of extent data on disk
  extent_len_t get_length() const {
    return length;
  }

  extent_len_t get_loaded_length() const {
    if (ptr.has_value()) {
      return ptr->length();
    } else {
      return 0;
    }
  }

  /// Returns version, get_version() == 0 iff is_clean()
  extent_version_t get_version() const {
    return version;
  }

  /// Returns crc32c of buffer
  uint32_t get_crc32c() {
    return ceph_crc32c(
      1,
      reinterpret_cast<const unsigned char *>(get_bptr().c_str()),
      get_length());
  }

  /// Get ref to raw buffer
  bufferptr &get_bptr() {
    assert(ptr.has_value());
    return *ptr;
  }
  const bufferptr &get_bptr() const {
    assert(ptr.has_value());
    return *ptr;
  }

  /// Compare by paddr
  friend bool operator< (const CachedExtent &a, const CachedExtent &b) {
    return a.poffset < b.poffset;
  }
  friend bool operator> (const CachedExtent &a, const CachedExtent &b) {
    return a.poffset > b.poffset;
  }
  friend bool operator== (const CachedExtent &a, const CachedExtent &b) {
    return a.poffset == b.poffset;
  }

  virtual ~CachedExtent();

  placement_hint_t get_user_hint() const {
    return user_hint;
  }

  rewrite_gen_t get_rewrite_generation() const {
    return rewrite_generation;
  }

  void invalidate_hints() {
    user_hint = PLACEMENT_HINT_NULL;
    rewrite_generation = NULL_GENERATION;
  }

  /// assign the target rewrite generation for the follow-up rewrite
  void set_target_rewrite_generation(rewrite_gen_t gen) {
    assert(is_target_rewrite_generation(gen));

    user_hint = placement_hint_t::REWRITE;
    rewrite_generation = gen;
  }

  bool is_inline() const {
    return poffset.is_relative();
  }

  paddr_t get_prior_paddr_and_reset() {
    assert(prior_poffset);
    auto ret = *prior_poffset;
    prior_poffset.reset();
    return ret;
  }

  void set_invalid(Transaction &t);

  // a rewrite extent has an invalid prior_instance,
  // and a mutation_pending extent has a valid prior_instance
  CachedExtentRef get_prior_instance() {
    return prior_instance;
  }

private:
  template <typename T>
  friend class read_set_item_t;

  friend struct paddr_cmp;
  friend struct ref_paddr_cmp;
  friend class ExtentIndex;

  /// Pointer to containing index (or null)
  ExtentIndex *parent_index = nullptr;

  /// hook for intrusive extent_index
  boost::intrusive::set_member_hook<> extent_index_hook;
  using index_member_options = boost::intrusive::member_hook<
    CachedExtent,
    boost::intrusive::set_member_hook<>,
    &CachedExtent::extent_index_hook>;
  using index = boost::intrusive::set<CachedExtent, index_member_options>;
  friend class ExtentIndex;
  friend class Transaction;

  bool is_linked() {
    return extent_index_hook.is_linked();
  }

  /// set bufferptr
  void set_bptr(ceph::bufferptr &&nptr) {
    ptr = nptr;
  }

  /// Returns true if the extent is part of the open transaction
  bool is_pending_in_trans(transaction_id_t id) const {
    return is_pending() && pending_for_transaction == id;
  }

  /// hook for intrusive ref list (mainly dirty or lru list)
  boost::intrusive::list_member_hook<> primary_ref_list_hook;
  using primary_ref_list_member_options = boost::intrusive::member_hook<
    CachedExtent,
    boost::intrusive::list_member_hook<>,
    &CachedExtent::primary_ref_list_hook>;
  using list = boost::intrusive::list<
    CachedExtent,
    primary_ref_list_member_options>;

  /**
   * dirty_from_or_retired_at
   *
   * Encodes ordering token for primary_ref_list -- dirty_from when
   * dirty or retired_at if retired.
   */
  journal_seq_t dirty_from_or_retired_at;

  /// cache data contents, std::nullopt if no data in cache
  std::optional<ceph::bufferptr> ptr;

  /// disk data length
  extent_len_t length;

  /// number of deltas since initial write
  extent_version_t version = 0;

  /// address of original block -- record relative iff is_initial_pending()
  paddr_t poffset;

  /// relative address before ool write, used to update mapping
  std::optional<paddr_t> prior_poffset = std::nullopt;

  /// used to wait while in-progress commit completes
  std::optional<seastar::shared_promise<>> io_wait_promise;
  void set_io_wait() {
    ceph_assert(!io_wait_promise);
    io_wait_promise = seastar::shared_promise<>();
  }
  void complete_io() {
    ceph_assert(io_wait_promise);
    io_wait_promise->set_value();
    io_wait_promise = std::nullopt;
  }

  seastar::future<> wait_io() {
    if (!io_wait_promise) {
      return seastar::now();
    } else {
      return io_wait_promise->get_shared_future();
    }
  }

  CachedExtent* get_transactional_view(Transaction &t);
  CachedExtent* get_transactional_view(transaction_id_t tid);

  read_set_item_t<Transaction>::trans_set_t transactions;

  placement_hint_t user_hint = PLACEMENT_HINT_NULL;

  // the target rewrite generation for the follow-up rewrite
  // or the rewrite generation for the fresh write
  rewrite_gen_t rewrite_generation = NULL_GENERATION;

protected:
  trans_view_set_t mutation_pendings;

  CachedExtent(CachedExtent &&other) = delete;
  CachedExtent(ceph::bufferptr &&_ptr) : ptr(std::move(_ptr)) {
    length = ptr->length();
    assert(length > 0);
  }

  /// construct new CachedExtent, will deep copy the buffer
  CachedExtent(const CachedExtent &other)
    : state(other.state),
      dirty_from_or_retired_at(other.dirty_from_or_retired_at),
      length(other.get_length()),
      version(other.version),
      poffset(other.poffset) {
    assert((length % CEPH_PAGE_SIZE) == 0);
    if (other.is_fully_loaded()) {
      ptr.emplace(buffer::create_page_aligned(length));
      other.ptr->copy_out(0, length, ptr->c_str());
    } else {
      // the extent must be fully loaded before CoW
      assert(length == 0); // in case of root
    }
  }

  struct share_buffer_t {};
  /// construct new CachedExtent, will shallow copy the buffer
  CachedExtent(const CachedExtent &other, share_buffer_t)
    : state(other.state),
      dirty_from_or_retired_at(other.dirty_from_or_retired_at),
      ptr(other.ptr),
      length(other.get_length()),
      version(other.version),
      poffset(other.poffset) {}

  // 0 length is only possible for the RootBlock
  struct zero_length_t {};
  CachedExtent(zero_length_t) : ptr(ceph::bufferptr(0)), length(0) {};

  struct retired_placeholder_t{};
  CachedExtent(retired_placeholder_t, extent_len_t _length)
    : state(extent_state_t::INVALID),
      length(_length) {
    assert(length > 0);
  }

  /// no buffer extent, for lazy read
  CachedExtent(extent_len_t _length) : length(_length) {
    assert(length > 0);
  }

  friend class Cache;
  template <typename T, typename... Args>
  static TCachedExtentRef<T> make_cached_extent_ref(
    Args&&... args) {
    return new T(std::forward<Args>(args)...);
  }

  template <typename T>
  static TCachedExtentRef<T> make_placeholder_cached_extent_ref(
    extent_len_t length) {
    return new T(length);
  }

  void reset_prior_instance() {
    prior_instance.reset();
  }

  /// Sets last_committed_crc
  void set_last_committed_crc(uint32_t crc) {
    last_committed_crc = crc;
  }

  void set_paddr(paddr_t offset, bool need_update_mapping = false) {
    if (need_update_mapping) {
      assert(!prior_poffset);
      prior_poffset = poffset;
    }
    poffset = offset;
  }

  /**
   * maybe_generate_relative
   *
   * There are three kinds of addresses one might want to
   * store within an extent:
   * - addr for a block within the same transaction relative to the
   *   physical location of this extent in the
   *   event that we will read it in the initial read of the extent
   * - addr relative to the physical location of the next record to a
   *   block within that record to contain a delta for this extent in
   *   the event that we'll read it from a delta and overlay it onto a
   *   dirty representation of the extent.
   * - absolute addr to a block already written outside of the current
   *   transaction.
   *
   * This helper checks addr and the current state to create the correct
   * reference.
   */
  paddr_t maybe_generate_relative(paddr_t addr) {
    if (is_initial_pending() && addr.is_record_relative()) {
      return addr.block_relative_to(get_paddr());
    } else {
      ceph_assert(!addr.is_record_relative() || is_mutation_pending());
      return addr;
    }
  }
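
  // Example of the intended behavior (a sketch; `child_addr` is hypothetical):
  // while this extent is INITIAL_WRITE_PENDING its own paddr is record
  // relative, so a record-relative `child_addr` is converted into an address
  // relative to this extent's block via addr.block_relative_to(get_paddr()),
  // whereas an absolute `child_addr` is returned unchanged.  Once the extent
  // is MUTATION_PENDING, record-relative addresses are kept as-is because
  // they will be resolved against the record that carries the delta.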

  friend class crimson::os::seastore::SegmentedAllocator;
  friend class crimson::os::seastore::TransactionManager;
  friend class crimson::os::seastore::ExtentPlacementManager;
  template <typename, typename>
  friend class BtreeNodeMapping;
  friend class ::btree_lba_manager_test;
};

std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t);
std::ostream &operator<<(std::ostream &, const CachedExtent&);

bool is_backref_mapped_extent_node(const CachedExtentRef &extent);

/// Compare extents by paddr
struct paddr_cmp {
  bool operator()(paddr_t lhs, const CachedExtent &rhs) const {
    return lhs < rhs.poffset;
  }
  bool operator()(const CachedExtent &lhs, paddr_t rhs) const {
    return lhs.poffset < rhs;
  }
};

/// Compare extent refs by paddr
struct ref_paddr_cmp {
  using is_transparent = paddr_t;
  bool operator()(const CachedExtentRef &lhs, const CachedExtentRef &rhs) const {
    return lhs->poffset < rhs->poffset;
  }
  bool operator()(const paddr_t &lhs, const CachedExtentRef &rhs) const {
    return lhs < rhs->poffset;
  }
  bool operator()(const CachedExtentRef &lhs, const paddr_t &rhs) const {
    return lhs->poffset < rhs;
  }
};

template <typename T, typename C>
class addr_extent_list_base_t
  : public std::list<std::pair<T, C>> {};

using pextent_list_t = addr_extent_list_base_t<paddr_t, CachedExtentRef>;

template <typename T, typename C, typename Cmp>
class addr_extent_set_base_t
  : public std::set<C, Cmp> {};

using pextent_set_t = addr_extent_set_base_t<
  paddr_t,
  CachedExtentRef,
  ref_paddr_cmp
  >;

template <typename T>
using t_pextent_list_t = addr_extent_list_base_t<paddr_t, TCachedExtentRef<T>>;

/**
 * ExtentIndex
 *
 * Index of CachedExtent by poffset; does not hold a reference,
 * the user must ensure each extent is removed prior to deletion.
 */
class ExtentIndex {
  friend class Cache;
  CachedExtent::index extent_index;
public:
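  /// Returns a [begin, end) iterator pair over extents in the index that
  /// overlap the range [addr, addr + len)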
  auto get_overlap(paddr_t addr, extent_len_t len) {
    auto bottom = extent_index.upper_bound(addr, paddr_cmp());
    if (bottom != extent_index.begin())
      --bottom;
    if (bottom != extent_index.end() &&
        bottom->get_paddr().add_offset(bottom->get_length()) <= addr)
      ++bottom;

    auto top = extent_index.lower_bound(addr.add_offset(len), paddr_cmp());
    return std::make_pair(
      bottom,
      top
    );
  }

  void clear() {
    struct cached_extent_disposer {
      void operator() (CachedExtent* extent) {
        extent->parent_index = nullptr;
      }
    };
    extent_index.clear_and_dispose(cached_extent_disposer());
    bytes = 0;
  }

  void insert(CachedExtent &extent) {
    // sanity check
    ceph_assert(!extent.parent_index);
    auto [a, b] = get_overlap(
      extent.get_paddr(),
      extent.get_length());
    ceph_assert(a == b);

    [[maybe_unused]] auto [iter, inserted] = extent_index.insert(extent);
    assert(inserted);
    extent.parent_index = this;

    bytes += extent.get_length();
  }

  void erase(CachedExtent &extent) {
    assert(extent.parent_index);
    assert(extent.is_linked());
    [[maybe_unused]] auto erased = extent_index.erase(
      extent_index.s_iterator_to(extent));
    extent.parent_index = nullptr;

    assert(erased);
    bytes -= extent.get_length();
  }

  void replace(CachedExtent &to, CachedExtent &from) {
    assert(to.get_length() == from.get_length());
    extent_index.replace_node(extent_index.s_iterator_to(from), to);
    from.parent_index = nullptr;
    to.parent_index = this;
  }

  bool empty() const {
    return extent_index.empty();
  }

  auto find_offset(paddr_t offset) {
    return extent_index.find(offset, paddr_cmp());
  }

  auto begin() {
    return extent_index.begin();
  }

  auto end() {
    return extent_index.end();
  }

  auto size() const {
    return extent_index.size();
  }

  auto get_bytes() const {
    return bytes;
  }

  ~ExtentIndex() {
    assert(extent_index.empty());
    assert(bytes == 0);
  }

private:
  uint64_t bytes = 0;
};

class ChildableCachedExtent;
class LogicalCachedExtent;

class child_pos_t {
public:
  child_pos_t(CachedExtentRef stable_parent, uint16_t pos)
    : stable_parent(stable_parent), pos(pos) {}

  template <typename parent_t>
  TCachedExtentRef<parent_t> get_parent() {
    ceph_assert(stable_parent);
    return stable_parent->template cast<parent_t>();
  }
  uint16_t get_pos() {
    return pos;
  }
  void link_child(ChildableCachedExtent *c);
private:
  CachedExtentRef stable_parent;
  uint16_t pos = std::numeric_limits<uint16_t>::max();
};

using get_child_ertr = crimson::errorator<
  crimson::ct_error::input_output_error>;
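
/**
 * get_child_ret_t
 *
 * Return type for child lookups: holds either a child_pos_t identifying the
 * child's position within its stable parent, or a future resolving to the
 * child extent itself.
 */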
template <typename T>
struct get_child_ret_t {
  std::variant<child_pos_t, get_child_ertr::future<TCachedExtentRef<T>>> ret;
  get_child_ret_t(child_pos_t pos)
    : ret(std::move(pos)) {}
  get_child_ret_t(get_child_ertr::future<TCachedExtentRef<T>> child)
    : ret(std::move(child)) {}

  bool has_child() const {
    return ret.index() == 1;
  }

  child_pos_t &get_child_pos() {
    ceph_assert(ret.index() == 0);
    return std::get<0>(ret);
  }

  get_child_ertr::future<TCachedExtentRef<T>> &get_child_fut() {
    ceph_assert(ret.index() == 1);
    return std::get<1>(ret);
  }
};

template <typename key_t, typename>
class PhysicalNodeMapping;

template <typename key_t, typename val_t>
using PhysicalNodeMappingRef = std::unique_ptr<PhysicalNodeMapping<key_t, val_t>>;

template <typename key_t, typename val_t>
class PhysicalNodeMapping {
public:
  virtual extent_len_t get_length() const = 0;
  virtual extent_types_t get_type() const = 0;
  virtual val_t get_val() const = 0;
  virtual key_t get_key() const = 0;
  virtual PhysicalNodeMappingRef<key_t, val_t> duplicate() const = 0;
  virtual bool has_been_invalidated() const = 0;
  virtual CachedExtentRef get_parent() const = 0;
  virtual uint16_t get_pos() const = 0;
  // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h
  virtual bool is_indirect() const { return false; }
  virtual key_t get_intermediate_key() const { return min_max_t<key_t>::null; }
  virtual key_t get_intermediate_base() const { return min_max_t<key_t>::null; }
  virtual extent_len_t get_intermediate_length() const { return 0; }
  // The start offset of the pin, must be 0 if the pin is not indirect
  virtual extent_len_t get_intermediate_offset() const {
    return std::numeric_limits<extent_len_t>::max();
  }

  virtual get_child_ret_t<LogicalCachedExtent>
  get_logical_extent(Transaction &t) = 0;

  void link_child(ChildableCachedExtent *c) {
    ceph_assert(child_pos);
    child_pos->link_child(c);
  }

  virtual ~PhysicalNodeMapping() {}
protected:
  std::optional<child_pos_t> child_pos = std::nullopt;
};

using LBAMapping = PhysicalNodeMapping<laddr_t, paddr_t>;
using LBAMappingRef = PhysicalNodeMappingRef<laddr_t, paddr_t>;

std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs);

using lba_pin_list_t = std::list<LBAMappingRef>;

std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);

using BackrefMapping = PhysicalNodeMapping<paddr_t, laddr_t>;
using BackrefMappingRef = PhysicalNodeMappingRef<paddr_t, laddr_t>;

using backref_pin_list_t = std::list<BackrefMappingRef>;

/**
 * RetiredExtentPlaceholder
 *
 * Cache::retire_extent_addr(Transaction&, paddr_t, extent_len_t) can retire an
 * extent not currently in cache.  In that case, in order to detect transaction
 * invalidation, we need to add a placeholder to the cache to create the
 * mapping back to the transaction.  Whenever a transaction tries to read the
 * placeholder extent out, Cache is responsible for replacing the placeholder
 * with the real one.  In any case, no placeholder extent should escape the
 * Cache interface boundary.
 */
class RetiredExtentPlaceholder : public CachedExtent {

public:
  RetiredExtentPlaceholder(extent_len_t length)
    : CachedExtent(CachedExtent::retired_placeholder_t{}, length) {}

  CachedExtentRef duplicate_for_write(Transaction&) final {
    ceph_assert(0 == "Should never happen for a placeholder");
    return CachedExtentRef();
  }

  ceph::bufferlist get_delta() final {
    ceph_assert(0 == "Should never happen for a placeholder");
    return ceph::bufferlist();
  }

  static constexpr extent_types_t TYPE = extent_types_t::RETIRED_PLACEHOLDER;
  extent_types_t get_type() const final {
    return TYPE;
  }

  void apply_delta_and_adjust_crc(
    paddr_t base, const ceph::bufferlist &bl) final {
    ceph_assert(0 == "Should never happen for a placeholder");
  }

  bool is_logical() const final {
    return false;
  }

  std::ostream &print_detail(std::ostream &out) const final {
    return out << ", RetiredExtentPlaceholder";
  }

  void on_delta_write(paddr_t record_block_offset) final {
    ceph_assert(0 == "Should never happen for a placeholder");
  }
};

class parent_tracker_t
  : public boost::intrusive_ref_counter<
      parent_tracker_t, boost::thread_unsafe_counter> {
public:
  parent_tracker_t(CachedExtentRef parent)
    : parent(parent) {}
  parent_tracker_t(CachedExtent* parent)
    : parent(parent) {}
  ~parent_tracker_t();
  template <typename T = CachedExtent>
  TCachedExtentRef<T> get_parent() const {
    ceph_assert(parent);
    if constexpr (std::is_same_v<T, CachedExtent>) {
      return parent;
    } else {
      return parent->template cast<T>();
    }
  }
  void reset_parent(CachedExtentRef p) {
    parent = p;
  }
  bool is_valid() const {
    return parent && parent->is_valid();
  }
private:
  CachedExtentRef parent;
};

std::ostream &operator<<(std::ostream &, const parent_tracker_t &);

using parent_tracker_ref = boost::intrusive_ptr<parent_tracker_t>;

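/**
 * ChildableCachedExtent
 *
 * A CachedExtent which may be linked as the child of a parent node, tracked
 * via an intrusively refcounted parent_tracker_t.
 */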
class ChildableCachedExtent : public CachedExtent {
public:
  template <typename... T>
  ChildableCachedExtent(T&&... t) : CachedExtent(std::forward<T>(t)...) {}
  bool has_parent_tracker() const {
    return (bool)parent_tracker;
  }
  void reset_parent_tracker(parent_tracker_t *p = nullptr) {
    parent_tracker.reset(p);
  }
  bool is_parent_valid() const {
    return parent_tracker && parent_tracker->is_valid();
  }
  template <typename T = CachedExtent>
  TCachedExtentRef<T> get_parent_node() const {
    assert(parent_tracker);
    return parent_tracker->template get_parent<T>();
  }
  void take_prior_parent_tracker() {
    auto &prior = (ChildableCachedExtent&)(*get_prior_instance());
    parent_tracker = prior.parent_tracker;
  }
  std::ostream &print_detail(std::ostream &out) const final;
private:
  parent_tracker_ref parent_tracker;
  virtual std::ostream &_print_detail(std::ostream &out) const {
    return out;
  }
};
/**
 * LogicalCachedExtent
 *
 * CachedExtent with associated lba mapping.
 *
 * Users of TransactionManager should be using extents derived from
 * LogicalCachedExtent.
 */
class LogicalCachedExtent : public ChildableCachedExtent {
public:
  template <typename... T>
  LogicalCachedExtent(T&&... t)
    : ChildableCachedExtent(std::forward<T>(t)...)
  {}

  bool has_laddr() const {
    return laddr != L_ADDR_NULL;
  }

  laddr_t get_laddr() const {
    assert(laddr != L_ADDR_NULL);
    return laddr;
  }

  void set_laddr(laddr_t nladdr) {
    laddr = nladdr;
  }

  void maybe_set_intermediate_laddr(LBAMapping &mapping) {
    laddr = mapping.is_indirect()
      ? mapping.get_intermediate_base()
      : mapping.get_key();
  }

  void apply_delta_and_adjust_crc(
    paddr_t base, const ceph::bufferlist &bl) final {
    apply_delta(bl);
    set_last_committed_crc(get_crc32c());
  }

  bool is_logical() const final {
    return true;
  }

  std::ostream &_print_detail(std::ostream &out) const final;

  void on_replace_prior(Transaction &t) final;

  virtual ~LogicalCachedExtent();
protected:

  virtual void apply_delta(const ceph::bufferlist &bl) = 0;
  virtual std::ostream &print_detail_l(std::ostream &out) const {
    return out;
  }

  virtual void logical_on_delta_write() {}

  void on_delta_write(paddr_t record_block_offset) final {
    assert(is_exist_mutation_pending() ||
           get_prior_instance());
    logical_on_delta_write();
  }

private:
  // the logical address of the extent, and if shared,
  // it is the intermediate_base, see BtreeLBAMapping comments.
  laddr_t laddr = L_ADDR_NULL;
};

using LogicalCachedExtentRef = TCachedExtentRef<LogicalCachedExtent>;
struct ref_laddr_cmp {
  using is_transparent = laddr_t;
  bool operator()(const LogicalCachedExtentRef &lhs,
                  const LogicalCachedExtentRef &rhs) const {
    return lhs->get_laddr() < rhs->get_laddr();
  }
  bool operator()(const laddr_t &lhs,
                  const LogicalCachedExtentRef &rhs) const {
    return lhs < rhs->get_laddr();
  }
  bool operator()(const LogicalCachedExtentRef &lhs,
                  const laddr_t &rhs) const {
    return lhs->get_laddr() < rhs;
  }
};

template <typename T>
read_set_item_t<T>::read_set_item_t(T *t, CachedExtentRef ref)
  : t(t), ref(ref)
{}

template <typename T>
inline bool read_set_item_t<T>::cmp_t::operator()(
  const read_set_item_t<T> &lhs, const read_set_item_t<T> &rhs) const {
  return lhs.ref->poffset < rhs.ref->poffset;
}
template <typename T>
inline bool read_set_item_t<T>::cmp_t::operator()(
  const paddr_t &lhs, const read_set_item_t<T> &rhs) const {
  return lhs < rhs.ref->poffset;
}
template <typename T>
inline bool read_set_item_t<T>::cmp_t::operator()(
  const read_set_item_t<T> &lhs, const paddr_t &rhs) const {
  return lhs.ref->poffset < rhs;
}

using lextent_set_t = addr_extent_set_base_t<
  laddr_t,
  LogicalCachedExtentRef,
  ref_laddr_cmp
  >;

template <typename T>
using lextent_list_t = addr_extent_list_base_t<
  laddr_t, TCachedExtentRef<T>>;

}

#if FMT_VERSION >= 90000
template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::CachedExtent> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::LogicalCachedExtent> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {};
#endif