1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
11 #include <boost/core/ignore_unused.hpp>
13 #include <seastar/core/lowres_clock.hh>
15 #include "include/byteorder.h"
16 #include "include/denc.h"
17 #include "include/buffer.h"
18 #include "include/intarith.h"
19 #include "include/interval_set.h"
20 #include "include/uuid.h"
22 namespace crimson::os::seastore
{
/* using a special xattr key "omap_header" to store omap header */
// Declared `inline` so that all translation units including this header
// share a single string object instead of each constructing its own copy.
inline const std::string OMAP_HEADER_XATTR_KEY = "omap_header";

// Transaction identifier; TRANS_ID_NULL (0) is the null value.
using transaction_id_t = uint64_t;
constexpr transaction_id_t TRANS_ID_NULL = 0;
31 * Note: NULL value is usually the default and max value.
34 using depth_t
= uint32_t;
35 using depth_le_t
= ceph_le32
;
37 inline depth_le_t
init_depth_le(uint32_t i
) {
41 using checksum_t
= uint32_t;
43 // Immutable metadata for seastore to set at mkfs time
44 struct seastore_meta_t
{
47 DENC(seastore_meta_t
, v
, p
) {
49 denc(v
.seastore_id
, p
);
54 std::ostream
& operator<<(std::ostream
& out
, const seastore_meta_t
& meta
);
56 bool is_aligned(uint64_t offset
, uint64_t alignment
);
// identifies a specific physical device within seastore
using device_id_t = uint8_t;

// number of value bits in a device id
constexpr auto DEVICE_ID_BITS = std::numeric_limits<device_id_t>::digits;

// the largest representable id, which is also the null id
constexpr device_id_t DEVICE_ID_MAX = std::numeric_limits<device_id_t>::max();
constexpr device_id_t DEVICE_ID_NULL = DEVICE_ID_MAX;

// special ids, counted downwards from DEVICE_ID_MAX
constexpr device_id_t DEVICE_ID_RECORD_RELATIVE = DEVICE_ID_MAX - 1;
constexpr device_id_t DEVICE_ID_BLOCK_RELATIVE = DEVICE_ID_MAX - 2;
constexpr device_id_t DEVICE_ID_DELAYED = DEVICE_ID_MAX - 3;
// for tests which generate fake paddrs
constexpr device_id_t DEVICE_ID_FAKE = DEVICE_ID_MAX - 4;
constexpr device_id_t DEVICE_ID_ZERO = DEVICE_ID_MAX - 5;
constexpr device_id_t DEVICE_ID_ROOT = DEVICE_ID_MAX - 6;

// largest id below the special ids above
constexpr device_id_t DEVICE_ID_MAX_VALID = DEVICE_ID_MAX - 7;

// ids with the top bit clear: [DEVICE_ID_SEGMENTED_MIN, DEVICE_ID_MAX_VALID_SEGMENT]
constexpr device_id_t DEVICE_ID_SEGMENTED_MIN = 0;
constexpr device_id_t DEVICE_ID_MAX_VALID_SEGMENT = DEVICE_ID_MAX >> 1;
// first id with the top bit set
constexpr device_id_t DEVICE_ID_RANDOM_BLOCK_MIN =
  DEVICE_ID_MAX_VALID_SEGMENT + 1;
78 struct device_id_printer_t
{
82 std::ostream
&operator<<(std::ostream
&out
, const device_id_printer_t
&id
);
84 // 1 bit in paddr_t to identify the absolute physical address type
85 enum class paddr_types_t
{
91 constexpr paddr_types_t
device_id_to_paddr_type(device_id_t id
) {
92 if (id
> DEVICE_ID_MAX_VALID
) {
93 return paddr_types_t::RESERVED
;
94 } else if ((id
& 0x80) == 0) {
95 return paddr_types_t::SEGMENT
;
97 return paddr_types_t::RANDOM_BLOCK
;
101 constexpr bool has_device_off(device_id_t id
) {
102 return id
== DEVICE_ID_RECORD_RELATIVE
||
103 id
== DEVICE_ID_BLOCK_RELATIVE
||
104 id
== DEVICE_ID_DELAYED
||
105 id
== DEVICE_ID_FAKE
||
106 id
== DEVICE_ID_ROOT
;
109 // internal segment id type of segment_id_t below, with the top
110 // "DEVICE_ID_BITS" bits representing the device id of the segment.
111 using internal_segment_id_t
= uint32_t;
112 constexpr auto SEGMENT_ID_BITS
= std::numeric_limits
<internal_segment_id_t
>::digits
;
114 // segment ids without a device id encapsulated
115 using device_segment_id_t
= uint32_t;
116 constexpr auto DEVICE_SEGMENT_ID_BITS
= SEGMENT_ID_BITS
- DEVICE_ID_BITS
;
117 constexpr device_segment_id_t DEVICE_SEGMENT_ID_MAX
= (1 << DEVICE_SEGMENT_ID_BITS
) - 1;
119 // Identifies segment location on disk, see SegmentManager,
120 struct segment_id_t
{
122 // segment_id_t() == MAX_SEG_ID == NULL_SEG_ID
124 : segment_id_t(DEVICE_ID_MAX_VALID_SEGMENT
, DEVICE_SEGMENT_ID_MAX
) {}
126 segment_id_t(device_id_t id
, device_segment_id_t _segment
)
127 : segment_id_t(make_internal(id
, _segment
)) {}
129 segment_id_t(internal_segment_id_t _segment
)
130 : segment(_segment
) {
131 assert(device_id_to_paddr_type(device_id()) == paddr_types_t::SEGMENT
);
134 [[gnu::always_inline
]]
135 constexpr device_id_t
device_id() const {
136 return static_cast<device_id_t
>(segment
>> DEVICE_SEGMENT_ID_BITS
);
139 [[gnu::always_inline
]]
140 constexpr device_segment_id_t
device_segment_id() const {
141 constexpr internal_segment_id_t _SEGMENT_ID_MASK
= (1u << DEVICE_SEGMENT_ID_BITS
) - 1;
142 return segment
& _SEGMENT_ID_MASK
;
145 bool operator==(const segment_id_t
& other
) const {
146 return segment
== other
.segment
;
148 bool operator!=(const segment_id_t
& other
) const {
149 return segment
!= other
.segment
;
151 bool operator<(const segment_id_t
& other
) const {
152 return segment
< other
.segment
;
154 bool operator<=(const segment_id_t
& other
) const {
155 return segment
<= other
.segment
;
157 bool operator>(const segment_id_t
& other
) const {
158 return segment
> other
.segment
;
160 bool operator>=(const segment_id_t
& other
) const {
161 return segment
>= other
.segment
;
164 DENC(segment_id_t
, v
, p
) {
168 static constexpr segment_id_t
create_const(
169 device_id_t id
, device_segment_id_t segment
) {
170 return segment_id_t(id
, segment
, const_t
{});
175 constexpr segment_id_t(device_id_t id
, device_segment_id_t _segment
, const_t
)
176 : segment(make_internal(id
, _segment
)) {}
178 constexpr static inline internal_segment_id_t
make_internal(
180 device_segment_id_t s_id
) {
181 return static_cast<internal_segment_id_t
>(s_id
) |
182 (static_cast<internal_segment_id_t
>(d_id
) << DEVICE_SEGMENT_ID_BITS
);
185 internal_segment_id_t segment
;
187 friend struct segment_id_le_t
;
188 friend struct paddr_t
;
191 std::ostream
&operator<<(std::ostream
&out
, const segment_id_t
&);
193 // ondisk type of segment_id_t
194 struct __attribute((packed
)) segment_id_le_t
{
195 ceph_le32 segment
= ceph_le32(segment_id_t().segment
);
197 segment_id_le_t(const segment_id_t id
) :
198 segment(ceph_le32(id
.segment
)) {}
200 operator segment_id_t() const {
201 return segment_id_t(segment
);
205 constexpr segment_id_t MIN_SEG_ID
= segment_id_t::create_const(0, 0);
206 // segment_id_t() == MAX_SEG_ID == NULL_SEG_ID
207 constexpr segment_id_t MAX_SEG_ID
=
208 segment_id_t::create_const(DEVICE_ID_MAX_VALID_SEGMENT
, DEVICE_SEGMENT_ID_MAX
);
209 constexpr segment_id_t NULL_SEG_ID
= MAX_SEG_ID
;
/* Monotonically increasing segment seq, uniquely identifies
 * the incarnation of a segment */
using segment_seq_t = uint32_t;
// the largest sequence value, which is also the null sequence
constexpr segment_seq_t MAX_SEG_SEQ = std::numeric_limits<segment_seq_t>::max();
constexpr segment_seq_t NULL_SEG_SEQ = MAX_SEG_SEQ;
218 enum class segment_type_t
: uint8_t {
224 std::ostream
& operator<<(std::ostream
& out
, segment_type_t t
);
226 struct segment_seq_printer_t
{
230 std::ostream
& operator<<(std::ostream
& out
, segment_seq_printer_t seq
);
235 * Compact templated mapping from a segment_id_t to a value type.
237 template <typename T
>
238 class segment_map_t
{
241 // initializes top vector with 0 length vectors to indicate that they
242 // are not yet present
243 device_to_segments
.resize(DEVICE_ID_MAX_VALID
);
245 void add_device(device_id_t device
, std::size_t segments
, const T
& init
) {
246 ceph_assert(device
<= DEVICE_ID_MAX_VALID
);
247 ceph_assert(device_to_segments
[device
].size() == 0);
248 ceph_assert(segments
> 0);
249 device_to_segments
[device
].resize(segments
, init
);
250 total_segments
+= segments
;
253 device_to_segments
.clear();
254 device_to_segments
.resize(DEVICE_ID_MAX_VALID
);
258 T
& operator[](segment_id_t id
) {
259 assert(id
.device_segment_id() < device_to_segments
[id
.device_id()].size());
260 return device_to_segments
[id
.device_id()][id
.device_segment_id()];
262 const T
& operator[](segment_id_t id
) const {
263 assert(id
.device_segment_id() < device_to_segments
[id
.device_id()].size());
264 return device_to_segments
[id
.device_id()][id
.device_segment_id()];
267 bool contains(segment_id_t id
) {
268 bool b
= id
.device_id() < device_to_segments
.size();
272 b
= id
.device_segment_id() < device_to_segments
[id
.device_id()].size();
277 return iterator
<false>::lower_bound(*this, 0, 0);
280 return iterator
<true>::lower_bound(*this, 0, 0);
284 return iterator
<false>::end_iterator(*this);
287 return iterator
<true>::end_iterator(*this);
290 auto device_begin(device_id_t id
) {
291 auto ret
= iterator
<false>::lower_bound(*this, id
, 0);
292 assert(ret
->first
.device_id() == id
);
295 auto device_end(device_id_t id
) {
296 return iterator
<false>::lower_bound(*this, id
+ 1, 0);
299 size_t size() const {
300 return total_segments
;
304 template <bool is_const
= false>
306 /// points at set being iterated over
309 const segment_map_t
&,
310 segment_map_t
&> parent
;
312 /// points at current device, or DEVICE_ID_MAX_VALID if is_end()
313 device_id_t device_id
;
315 /// segment at which we are pointing, 0 if is_end()
316 device_segment_id_t device_segment_id
;
318 /// holds referent for operator* and operator-> when !is_end()
322 std::conditional_t
<is_const
, const T
&, T
&>
325 bool is_end() const {
326 return device_id
== DEVICE_ID_MAX_VALID
;
331 auto &device_vec
= parent
.device_to_segments
[device_id
];
332 if (device_vec
.size() == 0 ||
333 device_segment_id
== device_vec
.size()) {
334 while (++device_id
< DEVICE_ID_MAX_VALID
&&
335 parent
.device_to_segments
[device_id
].size() == 0);
336 device_segment_id
= 0;
339 current
= std::nullopt
;
342 segment_id_t
{device_id
, device_segment_id
},
343 parent
.device_to_segments
[device_id
][device_segment_id
]
349 decltype(parent
) &parent
,
350 device_id_t device_id
,
351 device_segment_id_t device_segment_id
)
352 : parent(parent
), device_id(device_id
),
353 device_segment_id(device_segment_id
) {}
356 static iterator
lower_bound(
357 decltype(parent
) &parent
,
358 device_id_t device_id
,
359 device_segment_id_t device_segment_id
) {
360 if (device_id
== DEVICE_ID_MAX_VALID
) {
361 return end_iterator(parent
);
363 auto ret
= iterator
{parent
, device_id
, device_segment_id
};
369 static iterator
end_iterator(
370 decltype(parent
) &parent
) {
371 return iterator
{parent
, DEVICE_ID_MAX_VALID
, 0};
374 iterator
<is_const
>& operator++() {
381 bool operator==(iterator
<is_const
> rit
) {
382 return (device_id
== rit
.device_id
&&
383 device_segment_id
== rit
.device_segment_id
);
386 bool operator!=(iterator
<is_const
> rit
) {
387 return !(*this == rit
);
390 template <bool c
= is_const
, std::enable_if_t
<c
, int> = 0>
391 const std::pair
<const segment_id_t
, const T
&> *operator->() {
395 template <bool c
= is_const
, std::enable_if_t
<!c
, int> = 0>
396 std::pair
<const segment_id_t
, T
&> *operator->() {
401 using reference
= std::conditional_t
<
402 is_const
, const std::pair
<const segment_id_t
, const T
&>&,
403 std::pair
<const segment_id_t
, T
&>&>;
404 reference
operator*() {
413 * device -> segment -> T mapping. device_to_segments[d].size() > 0 iff
414 * device <d> has been added.
416 std::vector
<std::vector
<T
>> device_to_segments
;
418 /// total number of added segments
419 size_t total_segments
= 0;
425 * <segment, offset> offset on disk, see SegmentManager
427 * May be absolute, record_relative, or block_relative.
429 * Blocks get read independently of the surrounding record,
430 * so paddrs embedded directly within a block need to refer
431 * to other blocks within the same record by a block_relative
432 * addr relative to the block's own offset. By contrast,
433 * deltas to existing blocks need to use record_relative
434 * addrs relative to the first block of the record.
436 * Fresh extents during a transaction are referred to by
437 * record_relative paddrs.
440 using internal_paddr_t
= uint64_t;
441 constexpr auto PADDR_BITS
= std::numeric_limits
<internal_paddr_t
>::digits
;
446 * Offset within a device, may be negative for relative offsets.
448 using device_off_t
= int64_t;
449 using u_device_off_t
= uint64_t;
450 constexpr auto DEVICE_OFF_BITS
= PADDR_BITS
- DEVICE_ID_BITS
;
451 constexpr auto DEVICE_OFF_MAX
=
452 std::numeric_limits
<device_off_t
>::max() >> DEVICE_ID_BITS
;
453 constexpr auto DEVICE_OFF_MIN
= -(DEVICE_OFF_MAX
+ 1);
458 * Offset within a segment on disk, may be negative for relative offsets.
460 using segment_off_t
= int32_t;
461 using u_segment_off_t
= uint32_t;
462 constexpr auto SEGMENT_OFF_MAX
= std::numeric_limits
<segment_off_t
>::max();
463 constexpr auto SEGMENT_OFF_MIN
= std::numeric_limits
<segment_off_t
>::min();
464 constexpr auto SEGMENT_OFF_BITS
= std::numeric_limits
<u_segment_off_t
>::digits
;
465 static_assert(PADDR_BITS
== SEGMENT_ID_BITS
+ SEGMENT_OFF_BITS
);
467 constexpr auto DEVICE_ID_MASK
=
468 ((internal_paddr_t(1) << DEVICE_ID_BITS
) - 1) << DEVICE_OFF_BITS
;
469 constexpr auto DEVICE_OFF_MASK
=
470 std::numeric_limits
<u_device_off_t
>::max() >> DEVICE_ID_BITS
;
471 constexpr auto SEGMENT_ID_MASK
=
472 ((internal_paddr_t(1) << SEGMENT_ID_BITS
) - 1) << SEGMENT_OFF_BITS
;
473 constexpr auto SEGMENT_OFF_MASK
=
474 (internal_paddr_t(1) << SEGMENT_OFF_BITS
) - 1;
476 constexpr internal_paddr_t
encode_device_off(device_off_t off
) {
477 return static_cast<internal_paddr_t
>(off
) & DEVICE_OFF_MASK
;
480 constexpr device_off_t
decode_device_off(internal_paddr_t addr
) {
481 if (addr
& (1ull << (DEVICE_OFF_BITS
- 1))) {
482 return static_cast<device_off_t
>(addr
| DEVICE_ID_MASK
);
484 return static_cast<device_off_t
>(addr
& DEVICE_OFF_MASK
);
493 // P_ADDR_MAX == P_ADDR_NULL == paddr_t{}
494 paddr_t() : paddr_t(DEVICE_ID_MAX
, device_off_t(0)) {}
496 static paddr_t
make_seg_paddr(
498 segment_off_t offset
) {
499 return paddr_t(seg
, offset
);
502 static paddr_t
make_seg_paddr(
504 device_segment_id_t seg
,
505 segment_off_t offset
) {
506 return paddr_t(segment_id_t(device
, seg
), offset
);
509 static paddr_t
make_blk_paddr(
511 device_off_t offset
) {
512 assert(device_id_to_paddr_type(device
) == paddr_types_t::RANDOM_BLOCK
);
513 return paddr_t(device
, offset
);
516 static paddr_t
make_res_paddr(
518 device_off_t offset
) {
519 assert(device_id_to_paddr_type(device
) == paddr_types_t::RESERVED
);
520 return paddr_t(device
, offset
);
523 void swap(paddr_t
&other
) {
524 std::swap(internal_paddr
, other
.internal_paddr
);
527 device_id_t
get_device_id() const {
528 return static_cast<device_id_t
>(internal_paddr
>> DEVICE_OFF_BITS
);
531 paddr_types_t
get_addr_type() const {
532 return device_id_to_paddr_type(get_device_id());
535 paddr_t
add_offset(device_off_t o
) const;
537 paddr_t
add_relative(paddr_t o
) const;
539 paddr_t
add_block_relative(paddr_t o
) const {
540 // special version mainly for documentation purposes
541 assert(o
.is_block_relative());
542 return add_relative(o
);
545 paddr_t
add_record_relative(paddr_t o
) const {
546 // special version mainly for documentation purposes
547 assert(o
.is_record_relative());
548 return add_relative(o
);
554 * Helper for the case where an in-memory paddr_t may be
555 * either block_relative or absolute (not record_relative).
557 * base must be either absolute or record_relative.
559 paddr_t
maybe_relative_to(paddr_t base
) const {
560 assert(!base
.is_block_relative());
561 if (is_block_relative()) {
562 return base
.add_block_relative(*this);
571 * Only defined for record_relative paddr_ts. Yields a
572 * block_relative address.
574 paddr_t
block_relative_to(paddr_t rhs
) const;
576 // To be compatible with laddr_t operator+
577 paddr_t
operator+(device_off_t o
) const {
578 return add_offset(o
);
581 seg_paddr_t
& as_seg_paddr();
582 const seg_paddr_t
& as_seg_paddr() const;
583 blk_paddr_t
& as_blk_paddr();
584 const blk_paddr_t
& as_blk_paddr() const;
585 res_paddr_t
& as_res_paddr();
586 const res_paddr_t
& as_res_paddr() const;
588 bool is_delayed() const {
589 return get_device_id() == DEVICE_ID_DELAYED
;
591 bool is_block_relative() const {
592 return get_device_id() == DEVICE_ID_BLOCK_RELATIVE
;
594 bool is_record_relative() const {
595 return get_device_id() == DEVICE_ID_RECORD_RELATIVE
;
597 bool is_relative() const {
598 return is_block_relative() || is_record_relative();
600 /// Denotes special null addr
601 bool is_null() const {
602 return get_device_id() == DEVICE_ID_NULL
;
604 /// Denotes special zero addr
605 bool is_zero() const {
606 return get_device_id() == DEVICE_ID_ZERO
;
608 /// Denotes the root addr
609 bool is_root() const {
610 return get_device_id() == DEVICE_ID_ROOT
;
616 * indicates whether addr reflects a physical location, absolute, relative,
617 * or delayed. FAKE segments also count as real so as to reflect the way in
618 * which unit tests use them.
620 bool is_real() const {
621 return !is_zero() && !is_null() && !is_root();
624 bool is_absolute() const {
625 return get_addr_type() != paddr_types_t::RESERVED
;
628 bool is_fake() const {
629 return get_device_id() == DEVICE_ID_FAKE
;
632 auto operator<=>(const paddr_t
&) const = default;
634 DENC(paddr_t
, v
, p
) {
636 denc(v
.internal_paddr
, p
);
640 constexpr static paddr_t
create_const(
641 device_id_t d_id
, device_off_t offset
) {
642 return paddr_t(d_id
, offset
, const_construct_t());
646 internal_paddr_t internal_paddr
;
650 paddr_t(segment_id_t seg
, segment_off_t offset
)
651 : paddr_t((static_cast<internal_paddr_t
>(seg
.segment
) << SEGMENT_OFF_BITS
) |
652 static_cast<u_segment_off_t
>(offset
)) {}
655 paddr_t(device_id_t d_id
, device_off_t offset
)
656 : paddr_t((static_cast<internal_paddr_t
>(d_id
) << DEVICE_OFF_BITS
) |
657 encode_device_off(offset
)) {
658 assert(offset
>= DEVICE_OFF_MIN
);
659 assert(offset
<= DEVICE_OFF_MAX
);
660 assert(get_addr_type() != paddr_types_t::SEGMENT
);
663 paddr_t(internal_paddr_t val
);
665 struct const_construct_t
{};
666 constexpr paddr_t(device_id_t d_id
, device_off_t offset
, const_construct_t
)
667 : internal_paddr((static_cast<internal_paddr_t
>(d_id
) << DEVICE_OFF_BITS
) |
668 static_cast<u_device_off_t
>(offset
)) {}
670 friend struct paddr_le_t
;
673 std::ostream
&operator<<(std::ostream
&out
, const paddr_t
&rhs
);
675 struct seg_paddr_t
: public paddr_t
{
676 seg_paddr_t(const seg_paddr_t
&) = delete;
677 seg_paddr_t(seg_paddr_t
&) = delete;
678 seg_paddr_t
& operator=(const seg_paddr_t
&) = delete;
679 seg_paddr_t
& operator=(seg_paddr_t
&) = delete;
681 segment_id_t
get_segment_id() const {
682 return segment_id_t(static_cast<internal_segment_id_t
>(
683 internal_paddr
>> SEGMENT_OFF_BITS
));
686 segment_off_t
get_segment_off() const {
687 return segment_off_t(internal_paddr
& SEGMENT_OFF_MASK
);
690 void set_segment_off(segment_off_t off
) {
692 internal_paddr
= (internal_paddr
& SEGMENT_ID_MASK
);
693 internal_paddr
|= static_cast<u_segment_off_t
>(off
);
696 paddr_t
add_offset(device_off_t o
) const {
697 device_off_t off
= get_segment_off() + o
;
699 assert(off
<= SEGMENT_OFF_MAX
);
700 return paddr_t::make_seg_paddr(
701 get_segment_id(), static_cast<segment_off_t
>(off
));
705 struct blk_paddr_t
: public paddr_t
{
706 blk_paddr_t(const blk_paddr_t
&) = delete;
707 blk_paddr_t(blk_paddr_t
&) = delete;
708 blk_paddr_t
& operator=(const blk_paddr_t
&) = delete;
709 blk_paddr_t
& operator=(blk_paddr_t
&) = delete;
711 device_off_t
get_device_off() const {
712 return decode_device_off(internal_paddr
);
715 void set_device_off(device_off_t off
) {
717 assert(off
<= DEVICE_OFF_MAX
);
718 internal_paddr
= (internal_paddr
& DEVICE_ID_MASK
);
719 internal_paddr
|= encode_device_off(off
);
722 paddr_t
add_offset(device_off_t o
) const {
723 assert(o
>= DEVICE_OFF_MIN
);
724 assert(o
<= DEVICE_OFF_MAX
);
725 auto off
= get_device_off() + o
;
726 return paddr_t::make_blk_paddr(get_device_id(), off
);
730 struct res_paddr_t
: public paddr_t
{
731 res_paddr_t(const res_paddr_t
&) = delete;
732 res_paddr_t(res_paddr_t
&) = delete;
733 res_paddr_t
& operator=(const res_paddr_t
&) = delete;
734 res_paddr_t
& operator=(res_paddr_t
&) = delete;
736 device_off_t
get_device_off() const {
737 return decode_device_off(internal_paddr
);
740 void set_device_off(device_off_t off
) {
741 assert(has_device_off(get_device_id()));
742 assert(off
>= DEVICE_OFF_MIN
);
743 assert(off
<= DEVICE_OFF_MAX
);
744 internal_paddr
= (internal_paddr
& DEVICE_ID_MASK
);
745 internal_paddr
|= encode_device_off(off
);
748 paddr_t
add_offset(device_off_t o
) const {
749 assert(has_device_off(get_device_id()));
750 assert(o
>= DEVICE_OFF_MIN
);
751 assert(o
<= DEVICE_OFF_MAX
);
752 auto off
= get_device_off() + o
;
753 return paddr_t::make_res_paddr(get_device_id(), off
);
756 paddr_t
block_relative_to(const res_paddr_t
&rhs
) const {
757 assert(rhs
.is_record_relative() && is_record_relative());
758 auto off
= get_device_off() - rhs
.get_device_off();
759 return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE
, off
);
763 constexpr paddr_t P_ADDR_MIN
= paddr_t::create_const(0, 0);
764 // P_ADDR_MAX == P_ADDR_NULL == paddr_t{}
765 constexpr paddr_t P_ADDR_MAX
= paddr_t::create_const(DEVICE_ID_MAX
, 0);
766 constexpr paddr_t P_ADDR_NULL
= P_ADDR_MAX
;
767 constexpr paddr_t P_ADDR_ZERO
= paddr_t::create_const(DEVICE_ID_ZERO
, 0);
768 constexpr paddr_t P_ADDR_ROOT
= paddr_t::create_const(DEVICE_ID_ROOT
, 0);
770 inline paddr_t
make_record_relative_paddr(device_off_t off
) {
771 return paddr_t::make_res_paddr(DEVICE_ID_RECORD_RELATIVE
, off
);
773 inline paddr_t
make_block_relative_paddr(device_off_t off
) {
774 return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE
, off
);
776 inline paddr_t
make_fake_paddr(device_off_t off
) {
777 return paddr_t::make_res_paddr(DEVICE_ID_FAKE
, off
);
779 inline paddr_t
make_delayed_temp_paddr(device_off_t off
) {
780 return paddr_t::make_res_paddr(DEVICE_ID_DELAYED
, off
);
783 inline const seg_paddr_t
& paddr_t::as_seg_paddr() const {
784 assert(get_addr_type() == paddr_types_t::SEGMENT
);
785 return *static_cast<const seg_paddr_t
*>(this);
788 inline seg_paddr_t
& paddr_t::as_seg_paddr() {
789 assert(get_addr_type() == paddr_types_t::SEGMENT
);
790 return *static_cast<seg_paddr_t
*>(this);
793 inline const blk_paddr_t
& paddr_t::as_blk_paddr() const {
794 assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK
);
795 return *static_cast<const blk_paddr_t
*>(this);
798 inline blk_paddr_t
& paddr_t::as_blk_paddr() {
799 assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK
);
800 return *static_cast<blk_paddr_t
*>(this);
803 inline const res_paddr_t
& paddr_t::as_res_paddr() const {
804 assert(get_addr_type() == paddr_types_t::RESERVED
);
805 return *static_cast<const res_paddr_t
*>(this);
808 inline res_paddr_t
& paddr_t::as_res_paddr() {
809 assert(get_addr_type() == paddr_types_t::RESERVED
);
810 return *static_cast<res_paddr_t
*>(this);
813 inline paddr_t::paddr_t(internal_paddr_t val
) : internal_paddr(val
) {
815 auto type
= get_addr_type();
816 if (type
== paddr_types_t::SEGMENT
) {
817 assert(as_seg_paddr().get_segment_off() >= 0);
818 } else if (type
== paddr_types_t::RANDOM_BLOCK
) {
819 assert(as_blk_paddr().get_device_off() >= 0);
821 assert(type
== paddr_types_t::RESERVED
);
822 if (!has_device_off(get_device_id())) {
823 assert(as_res_paddr().get_device_off() == 0);
829 #define PADDR_OPERATION(a_type, base, func) \
830 if (get_addr_type() == a_type) { \
831 return static_cast<const base*>(this)->func; \
834 inline paddr_t
paddr_t::add_offset(device_off_t o
) const {
835 PADDR_OPERATION(paddr_types_t::SEGMENT
, seg_paddr_t
, add_offset(o
))
836 PADDR_OPERATION(paddr_types_t::RANDOM_BLOCK
, blk_paddr_t
, add_offset(o
))
837 PADDR_OPERATION(paddr_types_t::RESERVED
, res_paddr_t
, add_offset(o
))
838 ceph_assert(0 == "not supported type");
842 inline paddr_t
paddr_t::add_relative(paddr_t o
) const {
843 assert(o
.is_relative());
844 auto &res_o
= o
.as_res_paddr();
845 return add_offset(res_o
.get_device_off());
848 inline paddr_t
paddr_t::block_relative_to(paddr_t rhs
) const {
849 return as_res_paddr().block_relative_to(rhs
.as_res_paddr());
852 struct __attribute((packed
)) paddr_le_t
{
853 ceph_le64 internal_paddr
=
854 ceph_le64(P_ADDR_NULL
.internal_paddr
);
856 using orig_type
= paddr_t
;
858 paddr_le_t() = default;
859 paddr_le_t(const paddr_t
&addr
) : internal_paddr(ceph_le64(addr
.internal_paddr
)) {}
861 operator paddr_t() const {
862 return paddr_t
{internal_paddr
};
// objaddr_t and its limits; the maximum value is also the null value
using objaddr_t = uint32_t;
constexpr objaddr_t OBJ_ADDR_MAX = std::numeric_limits<objaddr_t>::max();
constexpr objaddr_t OBJ_ADDR_NULL = OBJ_ADDR_MAX;
enum class placement_hint_t {
  HOT = 0,       // The default user hint that expects mutations or retirement
  COLD = 1,      // Expect no mutations and no retirement in the near future
  REWRITE = 2,   // Hint for the internal rewrites
  NUM_HINTS = 3  // Constant for number of hints or as NULL
};

// NUM_HINTS is also used as the null hint
constexpr auto PLACEMENT_HINT_NULL = placement_hint_t::NUM_HINTS;
879 std::ostream
& operator<<(std::ostream
& out
, placement_hint_t h
);
881 enum class device_type_t
: uint8_t {
889 RANDOM_BLOCK_EPHEMERAL
,
893 std::ostream
& operator<<(std::ostream
& out
, device_type_t t
);
895 bool can_delay_allocation(device_type_t type
);
896 device_type_t
string_to_device_type(std::string type
);
enum class backend_type_t {
  SEGMENTED = 0,    // SegmentManager: SSD, ZNS, HDD
  RANDOM_BLOCK = 1  // RBMDevice:      RANDOM_BLOCK_SSD
};
903 std::ostream
& operator<<(std::ostream
& out
, backend_type_t
);
904 using journal_type_t
= backend_type_t
;
906 constexpr backend_type_t
get_default_backend_of_device(device_type_t dtype
) {
907 assert(dtype
!= device_type_t::NONE
&&
908 dtype
!= device_type_t::NUM_TYPES
);
909 if (dtype
>= device_type_t::HDD
&&
910 dtype
<= device_type_t::EPHEMERAL_MAIN
) {
911 return backend_type_t::SEGMENTED
;
913 return backend_type_t::RANDOM_BLOCK
;
918 * Monotonically increasing identifier for the location of a
921 // JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{}
922 struct journal_seq_t
{
923 segment_seq_t segment_seq
= NULL_SEG_SEQ
;
924 paddr_t offset
= P_ADDR_NULL
;
926 void swap(journal_seq_t
&other
) {
927 std::swap(segment_seq
, other
.segment_seq
);
928 std::swap(offset
, other
.offset
);
931 // produces a pseudo journal_seq_t relative to this by offset
932 journal_seq_t
add_offset(
935 device_off_t roll_start
,
936 device_off_t roll_size
) const;
938 device_off_t
relative_to(
940 const journal_seq_t
& r
,
941 device_off_t roll_start
,
942 device_off_t roll_size
) const;
944 DENC(journal_seq_t
, v
, p
) {
946 denc(v
.segment_seq
, p
);
951 bool operator==(const journal_seq_t
&o
) const { return cmp(o
) == 0; }
952 bool operator!=(const journal_seq_t
&o
) const { return cmp(o
) != 0; }
953 bool operator<(const journal_seq_t
&o
) const { return cmp(o
) < 0; }
954 bool operator<=(const journal_seq_t
&o
) const { return cmp(o
) <= 0; }
955 bool operator>(const journal_seq_t
&o
) const { return cmp(o
) > 0; }
956 bool operator>=(const journal_seq_t
&o
) const { return cmp(o
) >= 0; }
959 int cmp(const journal_seq_t
&other
) const {
960 if (segment_seq
> other
.segment_seq
) {
962 } else if (segment_seq
< other
.segment_seq
) {
965 using ret_t
= std::pair
<device_off_t
, segment_id_t
>;
966 auto to_pair
= [](const paddr_t
&addr
) -> ret_t
{
967 if (addr
.get_addr_type() == paddr_types_t::SEGMENT
) {
968 auto &seg_addr
= addr
.as_seg_paddr();
969 return ret_t(seg_addr
.get_segment_off(), seg_addr
.get_segment_id());
970 } else if (addr
.get_addr_type() == paddr_types_t::RANDOM_BLOCK
) {
971 auto &blk_addr
= addr
.as_blk_paddr();
972 return ret_t(blk_addr
.get_device_off(), MAX_SEG_ID
);
973 } else if (addr
.get_addr_type() == paddr_types_t::RESERVED
) {
974 auto &res_addr
= addr
.as_res_paddr();
975 return ret_t(res_addr
.get_device_off(), MAX_SEG_ID
);
977 assert(0 == "impossible");
978 return ret_t(0, MAX_SEG_ID
);
981 auto left
= to_pair(offset
);
982 auto right
= to_pair(other
.offset
);
985 } else if (left
< right
) {
993 std::ostream
&operator<<(std::ostream
&out
, const journal_seq_t
&seq
);
995 constexpr journal_seq_t JOURNAL_SEQ_MIN
{
999 constexpr journal_seq_t JOURNAL_SEQ_MAX
{
1003 // JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{}
1004 constexpr journal_seq_t JOURNAL_SEQ_NULL
= JOURNAL_SEQ_MAX
;
// logical addr, see LBAManager, TransactionManager
using laddr_t = uint64_t;
constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min();
constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max();
// special addresses, counted downwards from L_ADDR_MAX
constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX;
constexpr laddr_t L_ADDR_ROOT = L_ADDR_MAX - 1;
constexpr laddr_t L_ADDR_LBAT = L_ADDR_MAX - 2;
1014 struct __attribute((packed
)) laddr_le_t
{
1015 ceph_le64 laddr
= ceph_le64(L_ADDR_NULL
);
1017 using orig_type
= laddr_t
;
1019 laddr_le_t() = default;
1020 laddr_le_t(const laddr_le_t
&) = default;
1021 explicit laddr_le_t(const laddr_t
&addr
)
1022 : laddr(ceph_le64(addr
)) {}
1024 operator laddr_t() const {
1025 return laddr_t(laddr
);
1027 laddr_le_t
& operator=(laddr_t addr
) {
1035 // logical offset, see LBAManager, TransactionManager
1036 using extent_len_t
= uint32_t;
1037 constexpr extent_len_t EXTENT_LEN_MAX
=
1038 std::numeric_limits
<extent_len_t
>::max();
1040 using extent_len_le_t
= ceph_le32
;
1041 inline extent_len_le_t
init_extent_len_le(extent_len_t len
) {
1042 return ceph_le32(len
);
1045 struct laddr_list_t
: std::list
<std::pair
<laddr_t
, extent_len_t
>> {
1046 template <typename
... T
>
1047 laddr_list_t(T
&&... args
)
1048 : std::list
<std::pair
<laddr_t
, extent_len_t
>>(std::forward
<T
>(args
)...) {}
1050 struct paddr_list_t
: std::list
<std::pair
<paddr_t
, extent_len_t
>> {
1051 template <typename
... T
>
1052 paddr_list_t(T
&&... args
)
1053 : std::list
<std::pair
<paddr_t
, extent_len_t
>>(std::forward
<T
>(args
)...) {}
1056 std::ostream
&operator<<(std::ostream
&out
, const laddr_list_t
&rhs
);
1057 std::ostream
&operator<<(std::ostream
&out
, const paddr_list_t
&rhs
);
1059 /* identifies type of extent, used for interpreting deltas, managing
1062 * Note that any new extent type needs to be added to
1063 * Cache::get_extent_by_type in cache.cc
1065 enum class extent_types_t
: uint8_t {
1069 DINK_LADDR_LEAF
= 3, // should only be used for unitttests
1072 ONODE_BLOCK_STAGED
= 6,
1074 OBJECT_DATA_BLOCK
= 8,
1075 RETIRED_PLACEHOLDER
= 9,
1076 // the following two types are not extent types,
1077 // they are just used to indicates paddr allocation deltas
1082 TEST_BLOCK_PHYSICAL
= 13,
1083 BACKREF_INTERNAL
= 14,
1085 // None and the number of valid extent_types_t
1088 using extent_types_le_t
= uint8_t;
1089 constexpr auto EXTENT_TYPES_MAX
= static_cast<uint8_t>(extent_types_t::NONE
);
1091 constexpr size_t BACKREF_NODE_SIZE
= 4096;
1093 std::ostream
&operator<<(std::ostream
&out
, extent_types_t t
);
1095 constexpr bool is_logical_type(extent_types_t type
) {
1097 case extent_types_t::ROOT
:
1098 case extent_types_t::LADDR_INTERNAL
:
1099 case extent_types_t::LADDR_LEAF
:
1100 case extent_types_t::BACKREF_INTERNAL
:
1101 case extent_types_t::BACKREF_LEAF
:
1108 constexpr bool is_retired_placeholder(extent_types_t type
)
1110 return type
== extent_types_t::RETIRED_PLACEHOLDER
;
1113 constexpr bool is_lba_node(extent_types_t type
)
1115 return type
== extent_types_t::LADDR_INTERNAL
||
1116 type
== extent_types_t::LADDR_LEAF
||
1117 type
== extent_types_t::DINK_LADDR_LEAF
;
1120 constexpr bool is_backref_node(extent_types_t type
)
1122 return type
== extent_types_t::BACKREF_INTERNAL
||
1123 type
== extent_types_t::BACKREF_LEAF
;
1126 constexpr bool is_lba_backref_node(extent_types_t type
)
1128 return is_lba_node(type
) || is_backref_node(type
);
1131 std::ostream
&operator<<(std::ostream
&out
, extent_types_t t
);
1136 * The goal is to group similarly aged extents in the same segment for a better
1137 * bimodal utilization distribution, and also in the same device tier. For EPM,
1138 * it has the flexibility to make placement decisions by re-assigning the
1139 * generation. And each non-inline generation will be statically mapped to a
1142 * All the fresh and dirty extents start with INIT_GENERATION upon allocation,
1143 * and they will be assigned to INLINE/OOL generation by EPM before the initial
1144 * writes. After that, the generation can only be increased upon rewrite.
1146 * Note, although EPM can re-assign the generations according to the tiering
1147 * status, it cannot decrease the generation for the correctness of space
1148 * reservation. It may choose to assign a larger generation if the extent is
1149 * hinted cold, or if it wants to evict extents to the cold tier. And it may choose
1150 * to not increase the generation if it wants to keep the hot tier as filled as
// Generation numbers are stored in a single unsigned byte.
1153 using rewrite_gen_t
= uint8_t;
1155 // INIT_GENERATION requires EPM decision to INLINE/OOL_GENERATION
1156 constexpr rewrite_gen_t INIT_GENERATION
= 0;
1157 constexpr rewrite_gen_t INLINE_GENERATION
= 1; // to the journal
1158 constexpr rewrite_gen_t OOL_GENERATION
= 2;
1160 // All the rewritten extents start with MIN_REWRITE_GENERATION
1161 constexpr rewrite_gen_t MIN_REWRITE_GENERATION
= 3;
1162 // without cold tier, the largest generation is less than MIN_COLD_GENERATION
1163 constexpr rewrite_gen_t MIN_COLD_GENERATION
= 5;
1164 constexpr rewrite_gen_t MAX_REWRITE_GENERATION
= 7;
// One past MAX_REWRITE_GENERATION, i.e. the count of generations 0..MAX.
1165 constexpr rewrite_gen_t REWRITE_GENERATIONS
= MAX_REWRITE_GENERATION
+ 1;
// Sentinel: the largest value representable by rewrite_gen_t.
1166 constexpr rewrite_gen_t NULL_GENERATION
=
1167 std::numeric_limits
<rewrite_gen_t
>::max();
1169 struct rewrite_gen_printer_t
{
1173 std::ostream
&operator<<(std::ostream
&out
, rewrite_gen_printer_t gen
);
1175 constexpr std::size_t generation_to_writer(rewrite_gen_t gen
) {
1176 // caller to assert the gen is in the reasonable range
1177 return gen
- OOL_GENERATION
;
1180 // before EPM decision
1181 constexpr bool is_target_rewrite_generation(rewrite_gen_t gen
) {
1182 return gen
== INIT_GENERATION
||
1183 (gen
>= MIN_REWRITE_GENERATION
&&
1184 gen
<= REWRITE_GENERATIONS
);
1187 // after EPM decision
1188 constexpr bool is_rewrite_generation(rewrite_gen_t gen
) {
1189 return gen
>= INLINE_GENERATION
&&
1190 gen
< REWRITE_GENERATIONS
;
1193 enum class data_category_t
: uint8_t {
1199 std::ostream
&operator<<(std::ostream
&out
, data_category_t c
);
1201 constexpr data_category_t
get_extent_category(extent_types_t type
) {
1202 if (type
== extent_types_t::OBJECT_DATA_BLOCK
||
1203 type
== extent_types_t::TEST_BLOCK
) {
1204 return data_category_t::DATA
;
1206 return data_category_t::METADATA
;
1210 // type for extent modification time, milliseconds since the epoch
1211 using sea_time_point
= seastar::lowres_system_clock::time_point
;
1212 using sea_duration
= seastar::lowres_system_clock::duration
;
1213 using mod_time_point_t
= int64_t;
1215 constexpr mod_time_point_t
1216 timepoint_to_mod(const sea_time_point
&t
) {
1217 return std::chrono::duration_cast
<std::chrono::milliseconds
>(
1218 t
.time_since_epoch()).count();
1221 constexpr sea_time_point
1222 mod_to_timepoint(mod_time_point_t t
) {
1223 return sea_time_point(std::chrono::duration_cast
<sea_duration
>(
1224 std::chrono::milliseconds(t
)));
1227 constexpr auto NULL_TIME
= sea_time_point();
1228 constexpr auto NULL_MOD_TIME
= timepoint_to_mod(NULL_TIME
);
1230 struct sea_time_point_printer_t
{
1233 std::ostream
&operator<<(std::ostream
&out
, sea_time_point_printer_t tp
);
1235 struct mod_time_point_printer_t
{
1236 mod_time_point_t tp
;
1238 std::ostream
&operator<<(std::ostream
&out
, mod_time_point_printer_t tp
);
// Combines two time points into one, weighting t1 by n1 and t2 by n2.
// Both inputs must differ from NULL_TIME and n1 + n2 must be positive
// (checked with assert only).
1240 constexpr sea_time_point
1241 get_average_time(const sea_time_point
& t1
, std::size_t n1
,
1242 const sea_time_point
& t2
, std::size_t n2
) {
1243 assert(t1
!= NULL_TIME
);
1244 assert(t2
!= NULL_TIME
);
1245 auto new_size
= n1
+ n2
;
1246 assert(new_size
> 0);
// Raw tick counts since the clock epoch.
1247 auto c1
= t1
.time_since_epoch().count();
1248 auto c2
= t2
.time_since_epoch().count();
// Each count is divided by the total weight before being multiplied by its
// own weight, so the result is truncated by the integer division.
// NOTE(review): presumably done in this order to keep the intermediate
// products from overflowing -- confirm before reordering.
1249 auto c_ret
= c1
/ new_size
* n1
+ c2
/ new_size
* n2
;
1250 return sea_time_point(sea_duration(c_ret
));
1253 /* description of a new physical extent */
1255 extent_types_t type
; ///< type of extent
1256 laddr_t addr
; ///< laddr of extent (L_ADDR_NULL for non-logical)
1257 ceph::bufferlist bl
; ///< payload, bl.length() == length, aligned
1260 using extent_version_t
= uint32_t;
1262 /* description of a mutation to a physical extent */
1263 struct delta_info_t
{
1264 extent_types_t type
= extent_types_t::NONE
; ///< delta type
1265 paddr_t paddr
; ///< physical address
1266 laddr_t laddr
= L_ADDR_NULL
; ///< logical address
1267 uint32_t prev_crc
= 0;
1268 uint32_t final_crc
= 0;
1269 extent_len_t length
= 0; ///< extent length
1270 extent_version_t pversion
; ///< prior version
1271 segment_seq_t ext_seq
; ///< seq of the extent's segment
1272 segment_type_t seg_type
;
1273 ceph::bufferlist bl
; ///< payload
1275 DENC(delta_info_t
, v
, p
) {
1276 DENC_START(1, 1, p
);
1280 denc(v
.prev_crc
, p
);
1281 denc(v
.final_crc
, p
);
1283 denc(v
.pversion
, p
);
1285 denc(v
.seg_type
, p
);
1290 bool operator==(const delta_info_t
&rhs
) const {
1293 paddr
== rhs
.paddr
&&
1294 laddr
== rhs
.laddr
&&
1295 prev_crc
== rhs
.prev_crc
&&
1296 final_crc
== rhs
.final_crc
&&
1297 length
== rhs
.length
&&
1298 pversion
== rhs
.pversion
&&
1299 ext_seq
== rhs
.ext_seq
&&
1305 std::ostream
&operator<<(std::ostream
&out
, const delta_info_t
&delta
);
1307 /* contains the latest journal tail information */
1308 struct journal_tail_delta_t
{
1309 journal_seq_t alloc_tail
;
1310 journal_seq_t dirty_tail
;
1312 DENC(journal_tail_delta_t
, v
, p
) {
1313 DENC_START(1, 1, p
);
1314 denc(v
.alloc_tail
, p
);
1315 denc(v
.dirty_tail
, p
);
1320 std::ostream
&operator<<(std::ostream
&out
, const journal_tail_delta_t
&delta
);
1322 class object_data_t
{
1323 laddr_t reserved_data_base
= L_ADDR_NULL
;
1324 extent_len_t reserved_data_len
= 0;
1329 laddr_t reserved_data_base
,
1330 extent_len_t reserved_data_len
)
1331 : reserved_data_base(reserved_data_base
),
1332 reserved_data_len(reserved_data_len
) {}
1334 laddr_t
get_reserved_data_base() const {
1335 return reserved_data_base
;
1338 extent_len_t
get_reserved_data_len() const {
1339 return reserved_data_len
;
1342 bool is_null() const {
1343 return reserved_data_base
== L_ADDR_NULL
;
1346 bool must_update() const {
1350 void update_reserved(
1354 reserved_data_base
= base
;
1355 reserved_data_len
= len
;
1361 reserved_data_len
= len
;
1366 reserved_data_base
= L_ADDR_NULL
;
1367 reserved_data_len
= 0;
1371 struct __attribute__((packed
)) object_data_le_t
{
1372 laddr_le_t reserved_data_base
= laddr_le_t(L_ADDR_NULL
);
1373 extent_len_le_t reserved_data_len
= init_extent_len_le(0);
1375 void update(const object_data_t
&nroot
) {
1376 reserved_data_base
= nroot
.get_reserved_data_base();
1377 reserved_data_len
= init_extent_len_le(nroot
.get_reserved_data_len());
1380 object_data_t
get() const {
1381 return object_data_t(
1387 struct omap_root_t
{
1388 laddr_t addr
= L_ADDR_NULL
;
1390 laddr_t hint
= L_ADDR_MIN
;
1391 bool mutated
= false;
1393 omap_root_t() = default;
1394 omap_root_t(laddr_t addr
, depth_t depth
, laddr_t addr_min
)
1399 omap_root_t(const omap_root_t
&o
) = default;
1400 omap_root_t(omap_root_t
&&o
) = default;
1401 omap_root_t
&operator=(const omap_root_t
&o
) = default;
1402 omap_root_t
&operator=(omap_root_t
&&o
) = default;
1404 bool is_null() const {
1405 return addr
== L_ADDR_NULL
;
1408 bool must_update() const {
1412 void update(laddr_t _addr
, depth_t _depth
, laddr_t _hint
) {
1419 laddr_t
get_location() const {
1423 depth_t
get_depth() const {
1427 laddr_t
get_hint() const {
1431 std::ostream
&operator<<(std::ostream
&out
, const omap_root_t
&root
);
1433 class __attribute__((packed
)) omap_root_le_t
{
1434 laddr_le_t addr
= laddr_le_t(L_ADDR_NULL
);
1435 depth_le_t depth
= init_depth_le(0);
1438 omap_root_le_t() = default;
1440 omap_root_le_t(laddr_t addr
, depth_t depth
)
1441 : addr(addr
), depth(init_depth_le(depth
)) {}
1443 omap_root_le_t(const omap_root_le_t
&o
) = default;
1444 omap_root_le_t(omap_root_le_t
&&o
) = default;
1445 omap_root_le_t
&operator=(const omap_root_le_t
&o
) = default;
1446 omap_root_le_t
&operator=(omap_root_le_t
&&o
) = default;
1448 void update(const omap_root_t
&nroot
) {
1449 addr
= nroot
.get_location();
1450 depth
= init_depth_le(nroot
.get_depth());
1453 omap_root_t
get(laddr_t hint
) const {
1454 return omap_root_t(addr
, depth
, hint
);
1461 class __attribute__((packed
)) phy_tree_root_t
{
1462 paddr_le_t root_addr
;
1463 depth_le_t depth
= init_extent_len_le(0);
1466 phy_tree_root_t() = default;
1468 phy_tree_root_t(paddr_t addr
, depth_t depth
)
1469 : root_addr(addr
), depth(init_depth_le(depth
)) {}
1471 phy_tree_root_t(const phy_tree_root_t
&o
) = default;
1472 phy_tree_root_t(phy_tree_root_t
&&o
) = default;
1473 phy_tree_root_t
&operator=(const phy_tree_root_t
&o
) = default;
1474 phy_tree_root_t
&operator=(phy_tree_root_t
&&o
) = default;
1476 paddr_t
get_location() const {
1480 void set_location(paddr_t location
) {
1481 root_addr
= location
;
1484 depth_t
get_depth() const {
1488 void set_depth(depth_t ndepth
) {
1492 void adjust_addrs_from_base(paddr_t base
) {
1493 paddr_t _root_addr
= root_addr
;
1494 if (_root_addr
.is_relative()) {
1495 root_addr
= base
.add_record_relative(_root_addr
);
1501 laddr_t addr
= L_ADDR_NULL
;
1502 extent_len_t size
= 0;
1504 bool mutated
= false;
1507 coll_root_t() = default;
1508 coll_root_t(laddr_t addr
, extent_len_t size
) : addr(addr
), size(size
) {}
1510 coll_root_t(const coll_root_t
&o
) = default;
1511 coll_root_t(coll_root_t
&&o
) = default;
1512 coll_root_t
&operator=(const coll_root_t
&o
) = default;
1513 coll_root_t
&operator=(coll_root_t
&&o
) = default;
1515 bool must_update() const {
1519 void update(laddr_t _addr
, extent_len_t _s
) {
1525 laddr_t
get_location() const {
1529 extent_len_t
get_size() const {
1537 * Information for locating CollectionManager information, to be embedded
1540 class __attribute__((packed
)) coll_root_le_t
{
1542 extent_len_le_t size
= init_extent_len_le(0);
1545 coll_root_le_t() = default;
1547 coll_root_le_t(laddr_t laddr
, extent_len_t size
)
1548 : addr(laddr
), size(init_extent_len_le(size
)) {}
1551 coll_root_le_t(const coll_root_le_t
&o
) = default;
1552 coll_root_le_t(coll_root_le_t
&&o
) = default;
1553 coll_root_le_t
&operator=(const coll_root_le_t
&o
) = default;
1554 coll_root_le_t
&operator=(coll_root_le_t
&&o
) = default;
1556 void update(const coll_root_t
&nroot
) {
1557 addr
= nroot
.get_location();
1558 size
= init_extent_len_le(nroot
.get_size());
1561 coll_root_t
get() const {
1562 return coll_root_t(addr
, size
);
1566 using lba_root_t
= phy_tree_root_t
;
1567 using backref_root_t
= phy_tree_root_t
;
1572 * Contains information required to find metadata roots.
1573 * TODO: generalize this to permit more than one lba_manager implementation
1575 struct __attribute__((packed
)) root_t
{
1576 using meta_t
= std::map
<std::string
, std::string
>;
1578 static constexpr int MAX_META_LENGTH
= 1024;
1580 backref_root_t backref_root
;
1581 lba_root_t lba_root
;
1582 laddr_le_t onode_root
;
1583 coll_root_le_t collection_root
;
1585 char meta
[MAX_META_LENGTH
];
1591 void adjust_addrs_from_base(paddr_t base
) {
1592 lba_root
.adjust_addrs_from_base(base
);
1593 backref_root
.adjust_addrs_from_base(base
);
1598 bl
.append(ceph::buffer::create_static(MAX_META_LENGTH
, meta
));
1600 auto iter
= bl
.cbegin();
1605 void set_meta(const meta_t
&m
) {
1606 ceph::bufferlist bl
;
1608 ceph_assert(bl
.length() < MAX_META_LENGTH
);
1610 auto &bptr
= bl
.front();
1611 ::memset(meta
, 0, MAX_META_LENGTH
);
1612 ::memcpy(meta
, bptr
.c_str(), bl
.length());
1616 struct alloc_blk_t
{
1621 extent_types_t type
)
1622 : paddr(paddr
), laddr(laddr
), len(len
), type(type
)
1625 explicit alloc_blk_t() = default;
1627 paddr_t paddr
= P_ADDR_NULL
;
1628 laddr_t laddr
= L_ADDR_NULL
;
1629 extent_len_t len
= 0;
1630 extent_types_t type
= extent_types_t::ROOT
;
1631 DENC(alloc_blk_t
, v
, p
) {
1632 DENC_START(1, 1, p
);
1641 // use absolute address
1642 struct alloc_delta_t
{
1643 enum class op_types_t
: uint8_t {
1648 std::vector
<alloc_blk_t
> alloc_blk_ranges
;
1649 op_types_t op
= op_types_t::NONE
;
1651 alloc_delta_t() = default;
1653 DENC(alloc_delta_t
, v
, p
) {
1654 DENC_START(1, 1, p
);
1655 denc(v
.alloc_blk_ranges
, p
);
1661 struct extent_info_t
{
1662 extent_types_t type
= extent_types_t::NONE
;
1663 laddr_t addr
= L_ADDR_NULL
;
1664 extent_len_t len
= 0;
1666 extent_info_t() = default;
1667 extent_info_t(const extent_t
&et
)
1668 : type(et
.type
), addr(et
.addr
),
1672 DENC(extent_info_t
, v
, p
) {
1673 DENC_START(1, 1, p
);
1680 std::ostream
&operator<<(std::ostream
&out
, const extent_info_t
&header
);
1682 using segment_nonce_t
= uint32_t;
1687 * Every segment contains and encodes a segment_header_t in the first block.
1688 * Our strategy for finding the journal replay point is:
1689 * 1) Find the segment with the highest journal_segment_seq
1690 * 2) Get dirty_tail and alloc_tail from the segment header
1691 * 3) Scan forward to update tails from journal_tail_delta_t
1692 * 4) Replay from the latest tails
1694 struct segment_header_t
{
1695 segment_seq_t segment_seq
;
1696 segment_id_t physical_segment_id
; // debugging
1698 journal_seq_t dirty_tail
;
1699 journal_seq_t alloc_tail
;
1700 segment_nonce_t segment_nonce
;
1702 segment_type_t type
;
1704 data_category_t category
;
1705 rewrite_gen_t generation
;
1707 segment_type_t
get_type() const {
1711 DENC(segment_header_t
, v
, p
) {
1712 DENC_START(1, 1, p
);
1713 denc(v
.segment_seq
, p
);
1714 denc(v
.physical_segment_id
, p
);
1715 denc(v
.dirty_tail
, p
);
1716 denc(v
.alloc_tail
, p
);
1717 denc(v
.segment_nonce
, p
);
1719 denc(v
.category
, p
);
1720 denc(v
.generation
, p
);
1724 std::ostream
&operator<<(std::ostream
&out
, const segment_header_t
&header
);
1726 struct segment_tail_t
{
1727 segment_seq_t segment_seq
;
1728 segment_id_t physical_segment_id
; // debugging
1730 segment_nonce_t segment_nonce
;
1732 segment_type_t type
;
1734 mod_time_point_t modify_time
;
1735 std::size_t num_extents
;
1737 segment_type_t
get_type() const {
1741 DENC(segment_tail_t
, v
, p
) {
1742 DENC_START(1, 1, p
);
1743 denc(v
.segment_seq
, p
);
1744 denc(v
.physical_segment_id
, p
);
1745 denc(v
.segment_nonce
, p
);
1747 denc(v
.modify_time
, p
);
1748 denc(v
.num_extents
, p
);
1752 std::ostream
&operator<<(std::ostream
&out
, const segment_tail_t
&tail
);
1754 enum class transaction_type_t
: uint8_t {
1756 READ
, // including weak and non-weak read transactions
1764 static constexpr auto TRANSACTION_TYPE_NULL
= transaction_type_t::MAX
;
1766 static constexpr auto TRANSACTION_TYPE_MAX
= static_cast<std::size_t>(
1767 transaction_type_t::MAX
);
1769 std::ostream
&operator<<(std::ostream
&os
, transaction_type_t type
);
1771 constexpr bool is_valid_transaction(transaction_type_t type
) {
1772 return type
< transaction_type_t::MAX
;
1775 constexpr bool is_background_transaction(transaction_type_t type
) {
1776 return (type
>= transaction_type_t::TRIM_DIRTY
&&
1777 type
< transaction_type_t::MAX
);
1780 constexpr bool is_trim_transaction(transaction_type_t type
) {
1781 return (type
== transaction_type_t::TRIM_DIRTY
||
1782 type
== transaction_type_t::TRIM_ALLOC
);
1785 struct record_size_t
{
1786 extent_len_t plain_mdlength
= 0; // mdlength without the record header
1787 extent_len_t dlength
= 0;
1789 extent_len_t
get_raw_mdlength() const;
1791 bool is_empty() const {
1792 return plain_mdlength
== 0 &&
1796 void account_extent(extent_len_t extent_len
);
1798 void account(const extent_t
& extent
) {
1799 account_extent(extent
.bl
.length());
1802 void account(const delta_info_t
& delta
);
1804 bool operator==(const record_size_t
&) const = default;
1806 std::ostream
&operator<<(std::ostream
&, const record_size_t
&);
1809 transaction_type_t type
= TRANSACTION_TYPE_NULL
;
1810 std::vector
<extent_t
> extents
;
1811 std::vector
<delta_info_t
> deltas
;
1813 sea_time_point modify_time
= NULL_TIME
;
1815 record_t(transaction_type_t type
) : type
{type
} { }
1819 type
= transaction_type_t::MUTATE
;
1823 record_t(std::vector
<extent_t
>&& _extents
,
1824 std::vector
<delta_info_t
>&& _deltas
) {
1825 auto modify_time
= seastar::lowres_system_clock::now();
1826 for (auto& e
: _extents
) {
1827 push_back(std::move(e
), modify_time
);
1829 for (auto& d
: _deltas
) {
1830 push_back(std::move(d
));
1832 type
= transaction_type_t::MUTATE
;
1835 bool is_empty() const {
1836 return extents
.size() == 0 &&
1840 std::size_t get_delta_size() const {
1841 auto delta_size
= std::accumulate(
1842 deltas
.begin(), deltas
.end(), 0,
1843 [](uint64_t sum
, auto& delta
) {
1844 return sum
+ delta
.bl
.length();
1850 void push_back(extent_t
&& extent
, sea_time_point
&t
) {
1851 ceph_assert(t
!= NULL_TIME
);
1852 if (extents
.size() == 0) {
1853 assert(modify_time
== NULL_TIME
);
1856 modify_time
= get_average_time(modify_time
, extents
.size(), t
, 1);
1858 size
.account(extent
);
1859 extents
.push_back(std::move(extent
));
1862 void push_back(delta_info_t
&& delta
) {
1863 size
.account(delta
);
1864 deltas
.push_back(std::move(delta
));
1867 std::ostream
&operator<<(std::ostream
&, const record_t
&);
1869 struct record_header_t
{
1870 transaction_type_t type
;
1871 uint32_t deltas
; // number of deltas
1872 uint32_t extents
; // number of extents
1873 mod_time_point_t modify_time
;
1875 DENC(record_header_t
, v
, p
) {
1876 DENC_START(1, 1, p
);
1880 denc(v
.modify_time
, p
);
1884 std::ostream
&operator<<(std::ostream
&, const record_header_t
&);
1886 struct record_group_header_t
{
1888 extent_len_t mdlength
; // block aligned, length of metadata
1889 extent_len_t dlength
; // block aligned, length of data
1890 segment_nonce_t segment_nonce
;// nonce of containing segment
1891 journal_seq_t committed_to
; // records prior to committed_to have been
1892 // fully written, maybe in another segment.
1893 checksum_t data_crc
; // crc of data payload
1896 DENC(record_group_header_t
, v
, p
) {
1897 DENC_START(1, 1, p
);
1899 denc(v
.mdlength
, p
);
1901 denc(v
.segment_nonce
, p
);
1902 denc(v
.committed_to
, p
);
1903 denc(v
.data_crc
, p
);
1907 std::ostream
& operator<<(std::ostream
&, const record_group_header_t
&);
1909 struct record_group_size_t
{
1910 extent_len_t plain_mdlength
= 0; // mdlength without the group header
1911 extent_len_t dlength
= 0;
1912 extent_len_t block_size
= 0;
1914 record_group_size_t() = default;
1915 record_group_size_t(
1916 const record_size_t
& rsize
,
1917 extent_len_t block_size
) {
1918 account(rsize
, block_size
);
1921 extent_len_t
get_raw_mdlength() const;
1923 extent_len_t
get_mdlength() const {
1924 assert(block_size
> 0);
1925 return p2roundup(get_raw_mdlength(), block_size
);
1928 extent_len_t
get_encoded_length() const {
1929 assert(block_size
> 0);
1930 assert(dlength
% block_size
== 0);
1931 return get_mdlength() + dlength
;
1934 record_group_size_t
get_encoded_length_after(
1935 const record_size_t
& rsize
,
1936 extent_len_t block_size
) const {
1937 record_group_size_t tmp
= *this;
1938 tmp
.account(rsize
, block_size
);
1942 double get_fullness() const {
1943 assert(block_size
> 0);
1944 return ((double)(get_raw_mdlength() + dlength
) /
1945 get_encoded_length());
1948 void account(const record_size_t
& rsize
,
1949 extent_len_t block_size
);
1951 bool operator==(const record_group_size_t
&) const = default;
1953 std::ostream
& operator<<(std::ostream
&, const record_group_size_t
&);
1955 struct record_group_t
{
1956 std::vector
<record_t
> records
;
1957 record_group_size_t size
;
1959 record_group_t() = default;
1962 extent_len_t block_size
) {
1963 push_back(std::move(record
), block_size
);
1966 std::size_t get_size() const {
1967 return records
.size();
1972 extent_len_t block_size
) {
1973 size
.account(record
.size
, block_size
);
1974 records
.push_back(std::move(record
));
1975 assert(size
.get_encoded_length() < SEGMENT_OFF_MAX
);
1978 void reserve(std::size_t limit
) {
1979 records
.reserve(limit
);
1987 std::ostream
& operator<<(std::ostream
&, const record_group_t
&);
1989 ceph::bufferlist
encode_record(
1991 extent_len_t block_size
,
1992 const journal_seq_t
& committed_to
,
1993 segment_nonce_t current_segment_nonce
);
1995 ceph::bufferlist
encode_records(
1996 record_group_t
& record_group
,
1997 const journal_seq_t
& committed_to
,
1998 segment_nonce_t current_segment_nonce
);
2000 std::optional
<record_group_header_t
>
2001 try_decode_records_header(
2002 const ceph::bufferlist
& header_bl
,
2003 segment_nonce_t expected_nonce
);
2005 bool validate_records_metadata(
2006 const ceph::bufferlist
& md_bl
);
2008 bool validate_records_data(
2009 const record_group_header_t
& header
,
2010 const ceph::bufferlist
& data_bl
);
2012 struct record_extent_infos_t
{
2013 record_header_t header
;
2014 std::vector
<extent_info_t
> extent_infos
;
2016 std::optional
<std::vector
<record_extent_infos_t
> >
2017 try_decode_extent_infos(
2018 const record_group_header_t
& header
,
2019 const ceph::bufferlist
& md_bl
);
2020 std::optional
<std::vector
<record_header_t
>>
2021 try_decode_record_headers(
2022 const record_group_header_t
& header
,
2023 const ceph::bufferlist
& md_bl
);
2025 struct record_deltas_t
{
2026 paddr_t record_block_base
;
2027 std::vector
<std::pair
<sea_time_point
, delta_info_t
>> deltas
;
2029 std::optional
<std::vector
<record_deltas_t
> >
2031 const record_group_header_t
& header
,
2032 const ceph::bufferlist
& md_bl
,
2033 paddr_t record_block_base
);
// Describes a completed write: where it started and how long it was.
2035 struct write_result_t
{
// Journal sequence at which the write begins.
2036 journal_seq_t start_seq
;
// Length of the write.
2037 extent_len_t length
;
// Returns the sequence just past the write: same segment_seq as start_seq,
// with the offset advanced by `length` via add_offset().
2039 journal_seq_t
get_end_seq() const {
2040 return journal_seq_t
{
2041 start_seq
.segment_seq
,
2042 start_seq
.offset
.add_offset(length
)};
2045 std::ostream
& operator<<(std::ostream
&, const write_result_t
&);
2047 struct record_locator_t
{
2048 paddr_t record_block_base
;
2049 write_result_t write_result
;
2051 std::ostream
& operator<<(std::ostream
&, const record_locator_t
&);
2053 /// scan segment for end incrementally
2054 struct scan_valid_records_cursor
{
2055 bool last_valid_header_found
= false;
2057 journal_seq_t last_committed
;
2058 std::size_t num_consumed_records
= 0;
2060 struct found_record_group_t
{
2062 record_group_header_t header
;
2063 bufferlist mdbuffer
;
2065 found_record_group_t(
2067 const record_group_header_t
&header
,
2068 const bufferlist
&mdbuffer
)
2069 : offset(offset
), header(header
), mdbuffer(mdbuffer
) {}
2071 std::deque
<found_record_group_t
> pending_record_groups
;
2073 bool is_complete() const {
2074 return last_valid_header_found
&& pending_record_groups
.empty();
2077 segment_id_t
get_segment_id() const {
2078 return seq
.offset
.as_seg_paddr().get_segment_id();
2081 segment_off_t
get_segment_offset() const {
2082 return seq
.offset
.as_seg_paddr().get_segment_off();
2085 void increment_seq(segment_off_t off
) {
2086 auto& seg_addr
= seq
.offset
.as_seg_paddr();
2087 seg_addr
.set_segment_off(
2088 seg_addr
.get_segment_off() + off
);
2091 void emplace_record_group(const record_group_header_t
&, ceph::bufferlist
&&);
2093 void pop_record_group() {
2094 assert(!pending_record_groups
.empty());
2095 ++num_consumed_records
;
2096 pending_record_groups
.pop_front();
2099 scan_valid_records_cursor(
2103 std::ostream
& operator<<(std::ostream
&, const scan_valid_records_cursor
&);
2107 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t
)
2108 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_id_t
)
2109 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t
)
2110 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t
)
2111 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t
)
2112 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_tail_delta_t
)
2113 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t
)
2114 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_group_header_t
)
2115 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t
)
2116 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t
)
2117 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_blk_t
)
2118 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_delta_t
)
2119 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_tail_t
)
2121 #if FMT_VERSION >= 90000
2122 template <> struct fmt::formatter
<crimson::os::seastore::data_category_t
> : fmt::ostream_formatter
{};
2123 template <> struct fmt::formatter
<crimson::os::seastore::delta_info_t
> : fmt::ostream_formatter
{};
2124 template <> struct fmt::formatter
<crimson::os::seastore::device_id_printer_t
> : fmt::ostream_formatter
{};
2125 template <> struct fmt::formatter
<crimson::os::seastore::extent_types_t
> : fmt::ostream_formatter
{};
2126 template <> struct fmt::formatter
<crimson::os::seastore::journal_seq_t
> : fmt::ostream_formatter
{};
2127 template <> struct fmt::formatter
<crimson::os::seastore::journal_tail_delta_t
> : fmt::ostream_formatter
{};
2128 template <> struct fmt::formatter
<crimson::os::seastore::laddr_list_t
> : fmt::ostream_formatter
{};
2129 template <> struct fmt::formatter
<crimson::os::seastore::omap_root_t
> : fmt::ostream_formatter
{};
2130 template <> struct fmt::formatter
<crimson::os::seastore::paddr_list_t
> : fmt::ostream_formatter
{};
2131 template <> struct fmt::formatter
<crimson::os::seastore::paddr_t
> : fmt::ostream_formatter
{};
2132 template <> struct fmt::formatter
<crimson::os::seastore::placement_hint_t
> : fmt::ostream_formatter
{};
2133 template <> struct fmt::formatter
<crimson::os::seastore::device_type_t
> : fmt::ostream_formatter
{};
2134 template <> struct fmt::formatter
<crimson::os::seastore::record_group_header_t
> : fmt::ostream_formatter
{};
2135 template <> struct fmt::formatter
<crimson::os::seastore::record_group_size_t
> : fmt::ostream_formatter
{};
2136 template <> struct fmt::formatter
<crimson::os::seastore::record_header_t
> : fmt::ostream_formatter
{};
2137 template <> struct fmt::formatter
<crimson::os::seastore::record_locator_t
> : fmt::ostream_formatter
{};
2138 template <> struct fmt::formatter
<crimson::os::seastore::record_t
> : fmt::ostream_formatter
{};
2139 template <> struct fmt::formatter
<crimson::os::seastore::rewrite_gen_printer_t
> : fmt::ostream_formatter
{};
2140 template <> struct fmt::formatter
<crimson::os::seastore::scan_valid_records_cursor
> : fmt::ostream_formatter
{};
2141 template <> struct fmt::formatter
<crimson::os::seastore::sea_time_point_printer_t
> : fmt::ostream_formatter
{};
2142 template <> struct fmt::formatter
<crimson::os::seastore::segment_header_t
> : fmt::ostream_formatter
{};
2143 template <> struct fmt::formatter
<crimson::os::seastore::segment_id_t
> : fmt::ostream_formatter
{};
2144 template <> struct fmt::formatter
<crimson::os::seastore::segment_seq_printer_t
> : fmt::ostream_formatter
{};
2145 template <> struct fmt::formatter
<crimson::os::seastore::segment_tail_t
> : fmt::ostream_formatter
{};
2146 template <> struct fmt::formatter
<crimson::os::seastore::segment_type_t
> : fmt::ostream_formatter
{};
2147 template <> struct fmt::formatter
<crimson::os::seastore::transaction_type_t
> : fmt::ostream_formatter
{};
2148 template <> struct fmt::formatter
<crimson::os::seastore::write_result_t
> : fmt::ostream_formatter
{};
2149 template <> struct fmt::formatter
<ceph::buffer::list
> : fmt::ostream_formatter
{};