]> git.proxmox.com Git - ceph.git/blob - ceph/src/crimson/os/seastore/seastore_types.h
add stop-gap to fix compat with CPUs not supporting SSE 4.1
[ceph.git] / ceph / src / crimson / os / seastore / seastore_types.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #pragma once
5
6 #include <limits>
7 #include <numeric>
8 #include <optional>
9 #include <iostream>
10 #include <vector>
11 #include <boost/core/ignore_unused.hpp>
12
13 #include <seastar/core/lowres_clock.hh>
14
15 #include "include/byteorder.h"
16 #include "include/denc.h"
17 #include "include/buffer.h"
18 #include "include/intarith.h"
19 #include "include/interval_set.h"
20 #include "include/uuid.h"
21
22 namespace crimson::os::seastore {
23
/* special xattr key ("omap_header") used to store the omap header */
// `inline` gives this header-defined constant a single program-wide
// definition instead of one separately constructed std::string per
// translation unit that includes the header.
inline const std::string OMAP_HEADER_XATTR_KEY = "omap_header";
26
27 using transaction_id_t = uint64_t;
28 constexpr transaction_id_t TRANS_ID_NULL = 0;
29
30 /*
31 * Note: NULL value is usually the default and max value.
32 */
33
34 using depth_t = uint32_t;
35 using depth_le_t = ceph_le32;
36
37 inline depth_le_t init_depth_le(uint32_t i) {
38 return ceph_le32(i);
39 }
40
41 using checksum_t = uint32_t;
42
43 // Immutable metadata for seastore to set at mkfs time
44 struct seastore_meta_t {
45 uuid_d seastore_id;
46
47 DENC(seastore_meta_t, v, p) {
48 DENC_START(1, 1, p);
49 denc(v.seastore_id, p);
50 DENC_FINISH(p);
51 }
52 };
53
54 std::ostream& operator<<(std::ostream& out, const seastore_meta_t& meta);
55
56 bool is_aligned(uint64_t offset, uint64_t alignment);
57
// identifies a specific physical device within seastore
using device_id_t = uint8_t;

// number of value bits in a device id
constexpr auto DEVICE_ID_BITS = std::numeric_limits<device_id_t>::digits;

// The top of the id space is reserved for special ids, counted down from
// the largest representable value.
constexpr device_id_t DEVICE_ID_MAX = std::numeric_limits<device_id_t>::max();
constexpr device_id_t DEVICE_ID_NULL = DEVICE_ID_MAX;
constexpr device_id_t DEVICE_ID_RECORD_RELATIVE = DEVICE_ID_MAX - 1;
constexpr device_id_t DEVICE_ID_BLOCK_RELATIVE = DEVICE_ID_MAX - 2;
constexpr device_id_t DEVICE_ID_DELAYED = DEVICE_ID_MAX - 3;
// for tests which generate fake paddrs
constexpr device_id_t DEVICE_ID_FAKE = DEVICE_ID_MAX - 4;
constexpr device_id_t DEVICE_ID_ZERO = DEVICE_ID_MAX - 5;
constexpr device_id_t DEVICE_ID_ROOT = DEVICE_ID_MAX - 6;
// largest id that is not one of the special ids above
constexpr device_id_t DEVICE_ID_MAX_VALID = DEVICE_ID_MAX - 7;

// Ids with the top bit clear form the lower half of the id space, ids with
// the top bit set form the upper half.
constexpr device_id_t DEVICE_ID_MAX_VALID_SEGMENT = DEVICE_ID_MAX / 2;
constexpr device_id_t DEVICE_ID_SEGMENTED_MIN = 0;
constexpr device_id_t DEVICE_ID_RANDOM_BLOCK_MIN =
  device_id_t{1} << (DEVICE_ID_BITS - 1);
77
78 struct device_id_printer_t {
79 device_id_t id;
80 };
81
82 std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id);
83
// Address type of a paddr_t, derived from its device id (see
// device_id_to_paddr_type): one bit of the device id separates SEGMENT from
// RANDOM_BLOCK, and device ids above DEVICE_ID_MAX_VALID are RESERVED.
enum class paddr_types_t {
  SEGMENT,       // 0
  RANDOM_BLOCK,  // 1
  RESERVED       // 2
};
90
91 constexpr paddr_types_t device_id_to_paddr_type(device_id_t id) {
92 if (id > DEVICE_ID_MAX_VALID) {
93 return paddr_types_t::RESERVED;
94 } else if ((id & 0x80) == 0) {
95 return paddr_types_t::SEGMENT;
96 } else {
97 return paddr_types_t::RANDOM_BLOCK;
98 }
99 }
100
101 constexpr bool has_device_off(device_id_t id) {
102 return id == DEVICE_ID_RECORD_RELATIVE ||
103 id == DEVICE_ID_BLOCK_RELATIVE ||
104 id == DEVICE_ID_DELAYED ||
105 id == DEVICE_ID_FAKE ||
106 id == DEVICE_ID_ROOT;
107 }
108
109 // internal segment id type of segment_id_t below, with the top
110 // "DEVICE_ID_BITS" bits representing the device id of the segment.
111 using internal_segment_id_t = uint32_t;
112 constexpr auto SEGMENT_ID_BITS = std::numeric_limits<internal_segment_id_t>::digits;
113
114 // segment ids without a device id encapsulated
115 using device_segment_id_t = uint32_t;
116 constexpr auto DEVICE_SEGMENT_ID_BITS = SEGMENT_ID_BITS - DEVICE_ID_BITS;
117 constexpr device_segment_id_t DEVICE_SEGMENT_ID_MAX = (1 << DEVICE_SEGMENT_ID_BITS) - 1;
118
119 // Identifies segment location on disk, see SegmentManager,
120 struct segment_id_t {
121 public:
122 // segment_id_t() == MAX_SEG_ID == NULL_SEG_ID
123 segment_id_t()
124 : segment_id_t(DEVICE_ID_MAX_VALID_SEGMENT, DEVICE_SEGMENT_ID_MAX) {}
125
126 segment_id_t(device_id_t id, device_segment_id_t _segment)
127 : segment_id_t(make_internal(id, _segment)) {}
128
129 segment_id_t(internal_segment_id_t _segment)
130 : segment(_segment) {
131 assert(device_id_to_paddr_type(device_id()) == paddr_types_t::SEGMENT);
132 }
133
134 [[gnu::always_inline]]
135 constexpr device_id_t device_id() const {
136 return static_cast<device_id_t>(segment >> DEVICE_SEGMENT_ID_BITS);
137 }
138
139 [[gnu::always_inline]]
140 constexpr device_segment_id_t device_segment_id() const {
141 constexpr internal_segment_id_t _SEGMENT_ID_MASK = (1u << DEVICE_SEGMENT_ID_BITS) - 1;
142 return segment & _SEGMENT_ID_MASK;
143 }
144
145 bool operator==(const segment_id_t& other) const {
146 return segment == other.segment;
147 }
148 bool operator!=(const segment_id_t& other) const {
149 return segment != other.segment;
150 }
151 bool operator<(const segment_id_t& other) const {
152 return segment < other.segment;
153 }
154 bool operator<=(const segment_id_t& other) const {
155 return segment <= other.segment;
156 }
157 bool operator>(const segment_id_t& other) const {
158 return segment > other.segment;
159 }
160 bool operator>=(const segment_id_t& other) const {
161 return segment >= other.segment;
162 }
163
164 DENC(segment_id_t, v, p) {
165 denc(v.segment, p);
166 }
167
168 static constexpr segment_id_t create_const(
169 device_id_t id, device_segment_id_t segment) {
170 return segment_id_t(id, segment, const_t{});
171 }
172
173 private:
174 struct const_t {};
175 constexpr segment_id_t(device_id_t id, device_segment_id_t _segment, const_t)
176 : segment(make_internal(id, _segment)) {}
177
178 constexpr static inline internal_segment_id_t make_internal(
179 device_id_t d_id,
180 device_segment_id_t s_id) {
181 return static_cast<internal_segment_id_t>(s_id) |
182 (static_cast<internal_segment_id_t>(d_id) << DEVICE_SEGMENT_ID_BITS);
183 }
184
185 internal_segment_id_t segment;
186
187 friend struct segment_id_le_t;
188 friend struct paddr_t;
189 };
190
191 std::ostream &operator<<(std::ostream &out, const segment_id_t&);
192
193 // ondisk type of segment_id_t
194 struct __attribute((packed)) segment_id_le_t {
195 ceph_le32 segment = ceph_le32(segment_id_t().segment);
196
197 segment_id_le_t(const segment_id_t id) :
198 segment(ceph_le32(id.segment)) {}
199
200 operator segment_id_t() const {
201 return segment_id_t(segment);
202 }
203 };
204
205 constexpr segment_id_t MIN_SEG_ID = segment_id_t::create_const(0, 0);
206 // segment_id_t() == MAX_SEG_ID == NULL_SEG_ID
207 constexpr segment_id_t MAX_SEG_ID =
208 segment_id_t::create_const(DEVICE_ID_MAX_VALID_SEGMENT, DEVICE_SEGMENT_ID_MAX);
209 constexpr segment_id_t NULL_SEG_ID = MAX_SEG_ID;
210
211 /* Monotonically increasing segment seq, uniquely identifies
212 * the incarnation of a segment */
213 using segment_seq_t = uint32_t;
214 static constexpr segment_seq_t MAX_SEG_SEQ =
215 std::numeric_limits<segment_seq_t>::max();
216 static constexpr segment_seq_t NULL_SEG_SEQ = MAX_SEG_SEQ;
217
/// Kind of a segment; stored in a single byte.
enum class segment_type_t : uint8_t {
  JOURNAL = 0,
  OOL = 1,
  NULL_SEG = 2,
};
223
224 std::ostream& operator<<(std::ostream& out, segment_type_t t);
225
226 struct segment_seq_printer_t {
227 segment_seq_t seq;
228 };
229
230 std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq);
231
232 /**
233 * segment_map_t
234 *
235 * Compact templated mapping from a segment_id_t to a value type.
236 */
237 template <typename T>
238 class segment_map_t {
239 public:
240 segment_map_t() {
241 // initializes top vector with 0 length vectors to indicate that they
242 // are not yet present
243 device_to_segments.resize(DEVICE_ID_MAX_VALID);
244 }
245 void add_device(device_id_t device, std::size_t segments, const T& init) {
246 ceph_assert(device <= DEVICE_ID_MAX_VALID);
247 ceph_assert(device_to_segments[device].size() == 0);
248 ceph_assert(segments > 0);
249 device_to_segments[device].resize(segments, init);
250 total_segments += segments;
251 }
252 void clear() {
253 device_to_segments.clear();
254 device_to_segments.resize(DEVICE_ID_MAX_VALID);
255 total_segments = 0;
256 }
257
258 T& operator[](segment_id_t id) {
259 assert(id.device_segment_id() < device_to_segments[id.device_id()].size());
260 return device_to_segments[id.device_id()][id.device_segment_id()];
261 }
262 const T& operator[](segment_id_t id) const {
263 assert(id.device_segment_id() < device_to_segments[id.device_id()].size());
264 return device_to_segments[id.device_id()][id.device_segment_id()];
265 }
266
267 bool contains(segment_id_t id) {
268 bool b = id.device_id() < device_to_segments.size();
269 if (!b) {
270 return b;
271 }
272 b = id.device_segment_id() < device_to_segments[id.device_id()].size();
273 return b;
274 }
275
276 auto begin() {
277 return iterator<false>::lower_bound(*this, 0, 0);
278 }
279 auto begin() const {
280 return iterator<true>::lower_bound(*this, 0, 0);
281 }
282
283 auto end() {
284 return iterator<false>::end_iterator(*this);
285 }
286 auto end() const {
287 return iterator<true>::end_iterator(*this);
288 }
289
290 auto device_begin(device_id_t id) {
291 auto ret = iterator<false>::lower_bound(*this, id, 0);
292 assert(ret->first.device_id() == id);
293 return ret;
294 }
295 auto device_end(device_id_t id) {
296 return iterator<false>::lower_bound(*this, id + 1, 0);
297 }
298
299 size_t size() const {
300 return total_segments;
301 }
302
303 private:
304 template <bool is_const = false>
305 class iterator {
306 /// points at set being iterated over
307 std::conditional_t<
308 is_const,
309 const segment_map_t &,
310 segment_map_t &> parent;
311
312 /// points at current device, or DEVICE_ID_MAX_VALID if is_end()
313 device_id_t device_id;
314
315 /// segment at which we are pointing, 0 if is_end()
316 device_segment_id_t device_segment_id;
317
318 /// holds referent for operator* and operator-> when !is_end()
319 std::optional<
320 std::pair<
321 const segment_id_t,
322 std::conditional_t<is_const, const T&, T&>
323 >> current;
324
325 bool is_end() const {
326 return device_id == DEVICE_ID_MAX_VALID;
327 }
328
329 void find_valid() {
330 assert(!is_end());
331 auto &device_vec = parent.device_to_segments[device_id];
332 if (device_vec.size() == 0 ||
333 device_segment_id == device_vec.size()) {
334 while (++device_id < DEVICE_ID_MAX_VALID &&
335 parent.device_to_segments[device_id].size() == 0);
336 device_segment_id = 0;
337 }
338 if (is_end()) {
339 current = std::nullopt;
340 } else {
341 current.emplace(
342 segment_id_t{device_id, device_segment_id},
343 parent.device_to_segments[device_id][device_segment_id]
344 );
345 }
346 }
347
348 iterator(
349 decltype(parent) &parent,
350 device_id_t device_id,
351 device_segment_id_t device_segment_id)
352 : parent(parent), device_id(device_id),
353 device_segment_id(device_segment_id) {}
354
355 public:
356 static iterator lower_bound(
357 decltype(parent) &parent,
358 device_id_t device_id,
359 device_segment_id_t device_segment_id) {
360 if (device_id == DEVICE_ID_MAX_VALID) {
361 return end_iterator(parent);
362 } else {
363 auto ret = iterator{parent, device_id, device_segment_id};
364 ret.find_valid();
365 return ret;
366 }
367 }
368
369 static iterator end_iterator(
370 decltype(parent) &parent) {
371 return iterator{parent, DEVICE_ID_MAX_VALID, 0};
372 }
373
374 iterator<is_const>& operator++() {
375 assert(!is_end());
376 ++device_segment_id;
377 find_valid();
378 return *this;
379 }
380
381 bool operator==(iterator<is_const> rit) {
382 return (device_id == rit.device_id &&
383 device_segment_id == rit.device_segment_id);
384 }
385
386 bool operator!=(iterator<is_const> rit) {
387 return !(*this == rit);
388 }
389
390 template <bool c = is_const, std::enable_if_t<c, int> = 0>
391 const std::pair<const segment_id_t, const T&> *operator->() {
392 assert(!is_end());
393 return &*current;
394 }
395 template <bool c = is_const, std::enable_if_t<!c, int> = 0>
396 std::pair<const segment_id_t, T&> *operator->() {
397 assert(!is_end());
398 return &*current;
399 }
400
401 using reference = std::conditional_t<
402 is_const, const std::pair<const segment_id_t, const T&>&,
403 std::pair<const segment_id_t, T&>&>;
404 reference operator*() {
405 assert(!is_end());
406 return *current;
407 }
408 };
409
410 /**
411 * device_to_segments
412 *
413 * device -> segment -> T mapping. device_to_segments[d].size() > 0 iff
414 * device <d> has been added.
415 */
416 std::vector<std::vector<T>> device_to_segments;
417
418 /// total number of added segments
419 size_t total_segments = 0;
420 };
421
422 /**
423 * paddr_t
424 *
425 * <segment, offset> offset on disk, see SegmentManager
426 *
427 * May be absolute, record_relative, or block_relative.
428 *
429 * Blocks get read independently of the surrounding record,
430 * so paddrs embedded directly within a block need to refer
431 * to other blocks within the same record by a block_relative
432 * addr relative to the block's own offset. By contrast,
433 * deltas to existing blocks need to use record_relative
434 * addrs relative to the first block of the record.
435 *
436 * Fresh extents during a transaction are referred to by
437 * record_relative paddrs.
438 */
439
440 using internal_paddr_t = uint64_t;
441 constexpr auto PADDR_BITS = std::numeric_limits<internal_paddr_t>::digits;
442
443 /**
444 * device_off_t
445 *
446 * Offset within a device, may be negative for relative offsets.
447 */
448 using device_off_t = int64_t;
449 using u_device_off_t = uint64_t;
450 constexpr auto DEVICE_OFF_BITS = PADDR_BITS - DEVICE_ID_BITS;
451 constexpr auto DEVICE_OFF_MAX =
452 std::numeric_limits<device_off_t>::max() >> DEVICE_ID_BITS;
453 constexpr auto DEVICE_OFF_MIN = -(DEVICE_OFF_MAX + 1);
454
455 /**
456 * segment_off_t
457 *
458 * Offset within a segment on disk, may be negative for relative offsets.
459 */
460 using segment_off_t = int32_t;
461 using u_segment_off_t = uint32_t;
462 constexpr auto SEGMENT_OFF_MAX = std::numeric_limits<segment_off_t>::max();
463 constexpr auto SEGMENT_OFF_MIN = std::numeric_limits<segment_off_t>::min();
464 constexpr auto SEGMENT_OFF_BITS = std::numeric_limits<u_segment_off_t>::digits;
465 static_assert(PADDR_BITS == SEGMENT_ID_BITS + SEGMENT_OFF_BITS);
466
467 constexpr auto DEVICE_ID_MASK =
468 ((internal_paddr_t(1) << DEVICE_ID_BITS) - 1) << DEVICE_OFF_BITS;
469 constexpr auto DEVICE_OFF_MASK =
470 std::numeric_limits<u_device_off_t>::max() >> DEVICE_ID_BITS;
471 constexpr auto SEGMENT_ID_MASK =
472 ((internal_paddr_t(1) << SEGMENT_ID_BITS) - 1) << SEGMENT_OFF_BITS;
473 constexpr auto SEGMENT_OFF_MASK =
474 (internal_paddr_t(1) << SEGMENT_OFF_BITS) - 1;
475
476 constexpr internal_paddr_t encode_device_off(device_off_t off) {
477 return static_cast<internal_paddr_t>(off) & DEVICE_OFF_MASK;
478 }
479
480 constexpr device_off_t decode_device_off(internal_paddr_t addr) {
481 if (addr & (1ull << (DEVICE_OFF_BITS - 1))) {
482 return static_cast<device_off_t>(addr | DEVICE_ID_MASK);
483 } else {
484 return static_cast<device_off_t>(addr & DEVICE_OFF_MASK);
485 }
486 }
487
488 struct seg_paddr_t;
489 struct blk_paddr_t;
490 struct res_paddr_t;
491 struct paddr_t {
492 public:
493 // P_ADDR_MAX == P_ADDR_NULL == paddr_t{}
494 paddr_t() : paddr_t(DEVICE_ID_MAX, device_off_t(0)) {}
495
496 static paddr_t make_seg_paddr(
497 segment_id_t seg,
498 segment_off_t offset) {
499 return paddr_t(seg, offset);
500 }
501
502 static paddr_t make_seg_paddr(
503 device_id_t device,
504 device_segment_id_t seg,
505 segment_off_t offset) {
506 return paddr_t(segment_id_t(device, seg), offset);
507 }
508
509 static paddr_t make_blk_paddr(
510 device_id_t device,
511 device_off_t offset) {
512 assert(device_id_to_paddr_type(device) == paddr_types_t::RANDOM_BLOCK);
513 return paddr_t(device, offset);
514 }
515
516 static paddr_t make_res_paddr(
517 device_id_t device,
518 device_off_t offset) {
519 assert(device_id_to_paddr_type(device) == paddr_types_t::RESERVED);
520 return paddr_t(device, offset);
521 }
522
523 void swap(paddr_t &other) {
524 std::swap(internal_paddr, other.internal_paddr);
525 }
526
527 device_id_t get_device_id() const {
528 return static_cast<device_id_t>(internal_paddr >> DEVICE_OFF_BITS);
529 }
530
531 paddr_types_t get_addr_type() const {
532 return device_id_to_paddr_type(get_device_id());
533 }
534
535 paddr_t add_offset(device_off_t o) const;
536
537 paddr_t add_relative(paddr_t o) const;
538
539 paddr_t add_block_relative(paddr_t o) const {
540 // special version mainly for documentation purposes
541 assert(o.is_block_relative());
542 return add_relative(o);
543 }
544
545 paddr_t add_record_relative(paddr_t o) const {
546 // special version mainly for documentation purposes
547 assert(o.is_record_relative());
548 return add_relative(o);
549 }
550
551 /**
552 * maybe_relative_to
553 *
554 * Helper for the case where an in-memory paddr_t may be
555 * either block_relative or absolute (not record_relative).
556 *
557 * base must be either absolute or record_relative.
558 */
559 paddr_t maybe_relative_to(paddr_t base) const {
560 assert(!base.is_block_relative());
561 if (is_block_relative()) {
562 return base.add_block_relative(*this);
563 } else {
564 return *this;
565 }
566 }
567
568 /**
569 * block_relative_to
570 *
571 * Only defined for record_relative paddr_ts. Yields a
572 * block_relative address.
573 */
574 paddr_t block_relative_to(paddr_t rhs) const;
575
576 // To be compatible with laddr_t operator+
577 paddr_t operator+(device_off_t o) const {
578 return add_offset(o);
579 }
580
581 seg_paddr_t& as_seg_paddr();
582 const seg_paddr_t& as_seg_paddr() const;
583 blk_paddr_t& as_blk_paddr();
584 const blk_paddr_t& as_blk_paddr() const;
585 res_paddr_t& as_res_paddr();
586 const res_paddr_t& as_res_paddr() const;
587
588 bool is_delayed() const {
589 return get_device_id() == DEVICE_ID_DELAYED;
590 }
591 bool is_block_relative() const {
592 return get_device_id() == DEVICE_ID_BLOCK_RELATIVE;
593 }
594 bool is_record_relative() const {
595 return get_device_id() == DEVICE_ID_RECORD_RELATIVE;
596 }
597 bool is_relative() const {
598 return is_block_relative() || is_record_relative();
599 }
600 /// Denotes special null addr
601 bool is_null() const {
602 return get_device_id() == DEVICE_ID_NULL;
603 }
604 /// Denotes special zero addr
605 bool is_zero() const {
606 return get_device_id() == DEVICE_ID_ZERO;
607 }
608 /// Denotes the root addr
609 bool is_root() const {
610 return get_device_id() == DEVICE_ID_ROOT;
611 }
612
613 /**
614 * is_real
615 *
616 * indicates whether addr reflects a physical location, absolute, relative,
617 * or delayed. FAKE segments also count as real so as to reflect the way in
618 * which unit tests use them.
619 */
620 bool is_real() const {
621 return !is_zero() && !is_null() && !is_root();
622 }
623
624 bool is_absolute() const {
625 return get_addr_type() != paddr_types_t::RESERVED;
626 }
627
628 bool is_fake() const {
629 return get_device_id() == DEVICE_ID_FAKE;
630 }
631
632 auto operator<=>(const paddr_t &) const = default;
633
634 DENC(paddr_t, v, p) {
635 DENC_START(1, 1, p);
636 denc(v.internal_paddr, p);
637 DENC_FINISH(p);
638 }
639
640 constexpr static paddr_t create_const(
641 device_id_t d_id, device_off_t offset) {
642 return paddr_t(d_id, offset, const_construct_t());
643 }
644
645 protected:
646 internal_paddr_t internal_paddr;
647
648 private:
649 // as seg
650 paddr_t(segment_id_t seg, segment_off_t offset)
651 : paddr_t((static_cast<internal_paddr_t>(seg.segment) << SEGMENT_OFF_BITS) |
652 static_cast<u_segment_off_t>(offset)) {}
653
654 // as blk or res
655 paddr_t(device_id_t d_id, device_off_t offset)
656 : paddr_t((static_cast<internal_paddr_t>(d_id) << DEVICE_OFF_BITS) |
657 encode_device_off(offset)) {
658 assert(offset >= DEVICE_OFF_MIN);
659 assert(offset <= DEVICE_OFF_MAX);
660 assert(get_addr_type() != paddr_types_t::SEGMENT);
661 }
662
663 paddr_t(internal_paddr_t val);
664
665 struct const_construct_t {};
666 constexpr paddr_t(device_id_t d_id, device_off_t offset, const_construct_t)
667 : internal_paddr((static_cast<internal_paddr_t>(d_id) << DEVICE_OFF_BITS) |
668 static_cast<u_device_off_t>(offset)) {}
669
670 friend struct paddr_le_t;
671 };
672
673 std::ostream &operator<<(std::ostream &out, const paddr_t &rhs);
674
675 struct seg_paddr_t : public paddr_t {
676 seg_paddr_t(const seg_paddr_t&) = delete;
677 seg_paddr_t(seg_paddr_t&) = delete;
678 seg_paddr_t& operator=(const seg_paddr_t&) = delete;
679 seg_paddr_t& operator=(seg_paddr_t&) = delete;
680
681 segment_id_t get_segment_id() const {
682 return segment_id_t(static_cast<internal_segment_id_t>(
683 internal_paddr >> SEGMENT_OFF_BITS));
684 }
685
686 segment_off_t get_segment_off() const {
687 return segment_off_t(internal_paddr & SEGMENT_OFF_MASK);
688 }
689
690 void set_segment_off(segment_off_t off) {
691 assert(off >= 0);
692 internal_paddr = (internal_paddr & SEGMENT_ID_MASK);
693 internal_paddr |= static_cast<u_segment_off_t>(off);
694 }
695
696 paddr_t add_offset(device_off_t o) const {
697 device_off_t off = get_segment_off() + o;
698 assert(off >= 0);
699 assert(off <= SEGMENT_OFF_MAX);
700 return paddr_t::make_seg_paddr(
701 get_segment_id(), static_cast<segment_off_t>(off));
702 }
703 };
704
705 struct blk_paddr_t : public paddr_t {
706 blk_paddr_t(const blk_paddr_t&) = delete;
707 blk_paddr_t(blk_paddr_t&) = delete;
708 blk_paddr_t& operator=(const blk_paddr_t&) = delete;
709 blk_paddr_t& operator=(blk_paddr_t&) = delete;
710
711 device_off_t get_device_off() const {
712 return decode_device_off(internal_paddr);
713 }
714
715 void set_device_off(device_off_t off) {
716 assert(off >= 0);
717 assert(off <= DEVICE_OFF_MAX);
718 internal_paddr = (internal_paddr & DEVICE_ID_MASK);
719 internal_paddr |= encode_device_off(off);
720 }
721
722 paddr_t add_offset(device_off_t o) const {
723 assert(o >= DEVICE_OFF_MIN);
724 assert(o <= DEVICE_OFF_MAX);
725 auto off = get_device_off() + o;
726 return paddr_t::make_blk_paddr(get_device_id(), off);
727 }
728 };
729
730 struct res_paddr_t : public paddr_t {
731 res_paddr_t(const res_paddr_t&) = delete;
732 res_paddr_t(res_paddr_t&) = delete;
733 res_paddr_t& operator=(const res_paddr_t&) = delete;
734 res_paddr_t& operator=(res_paddr_t&) = delete;
735
736 device_off_t get_device_off() const {
737 return decode_device_off(internal_paddr);
738 }
739
740 void set_device_off(device_off_t off) {
741 assert(has_device_off(get_device_id()));
742 assert(off >= DEVICE_OFF_MIN);
743 assert(off <= DEVICE_OFF_MAX);
744 internal_paddr = (internal_paddr & DEVICE_ID_MASK);
745 internal_paddr |= encode_device_off(off);
746 }
747
748 paddr_t add_offset(device_off_t o) const {
749 assert(has_device_off(get_device_id()));
750 assert(o >= DEVICE_OFF_MIN);
751 assert(o <= DEVICE_OFF_MAX);
752 auto off = get_device_off() + o;
753 return paddr_t::make_res_paddr(get_device_id(), off);
754 }
755
756 paddr_t block_relative_to(const res_paddr_t &rhs) const {
757 assert(rhs.is_record_relative() && is_record_relative());
758 auto off = get_device_off() - rhs.get_device_off();
759 return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE, off);
760 }
761 };
762
763 constexpr paddr_t P_ADDR_MIN = paddr_t::create_const(0, 0);
764 // P_ADDR_MAX == P_ADDR_NULL == paddr_t{}
765 constexpr paddr_t P_ADDR_MAX = paddr_t::create_const(DEVICE_ID_MAX, 0);
766 constexpr paddr_t P_ADDR_NULL = P_ADDR_MAX;
767 constexpr paddr_t P_ADDR_ZERO = paddr_t::create_const(DEVICE_ID_ZERO, 0);
768 constexpr paddr_t P_ADDR_ROOT = paddr_t::create_const(DEVICE_ID_ROOT, 0);
769
770 inline paddr_t make_record_relative_paddr(device_off_t off) {
771 return paddr_t::make_res_paddr(DEVICE_ID_RECORD_RELATIVE, off);
772 }
773 inline paddr_t make_block_relative_paddr(device_off_t off) {
774 return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE, off);
775 }
776 inline paddr_t make_fake_paddr(device_off_t off) {
777 return paddr_t::make_res_paddr(DEVICE_ID_FAKE, off);
778 }
779 inline paddr_t make_delayed_temp_paddr(device_off_t off) {
780 return paddr_t::make_res_paddr(DEVICE_ID_DELAYED, off);
781 }
782
783 inline const seg_paddr_t& paddr_t::as_seg_paddr() const {
784 assert(get_addr_type() == paddr_types_t::SEGMENT);
785 return *static_cast<const seg_paddr_t*>(this);
786 }
787
788 inline seg_paddr_t& paddr_t::as_seg_paddr() {
789 assert(get_addr_type() == paddr_types_t::SEGMENT);
790 return *static_cast<seg_paddr_t*>(this);
791 }
792
793 inline const blk_paddr_t& paddr_t::as_blk_paddr() const {
794 assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK);
795 return *static_cast<const blk_paddr_t*>(this);
796 }
797
798 inline blk_paddr_t& paddr_t::as_blk_paddr() {
799 assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK);
800 return *static_cast<blk_paddr_t*>(this);
801 }
802
803 inline const res_paddr_t& paddr_t::as_res_paddr() const {
804 assert(get_addr_type() == paddr_types_t::RESERVED);
805 return *static_cast<const res_paddr_t*>(this);
806 }
807
808 inline res_paddr_t& paddr_t::as_res_paddr() {
809 assert(get_addr_type() == paddr_types_t::RESERVED);
810 return *static_cast<res_paddr_t*>(this);
811 }
812
813 inline paddr_t::paddr_t(internal_paddr_t val) : internal_paddr(val) {
814 #ifndef NDEBUG
815 auto type = get_addr_type();
816 if (type == paddr_types_t::SEGMENT) {
817 assert(as_seg_paddr().get_segment_off() >= 0);
818 } else if (type == paddr_types_t::RANDOM_BLOCK) {
819 assert(as_blk_paddr().get_device_off() >= 0);
820 } else {
821 assert(type == paddr_types_t::RESERVED);
822 if (!has_device_off(get_device_id())) {
823 assert(as_res_paddr().get_device_off() == 0);
824 }
825 }
826 #endif
827 }
828
829 #define PADDR_OPERATION(a_type, base, func) \
830 if (get_addr_type() == a_type) { \
831 return static_cast<const base*>(this)->func; \
832 }
833
834 inline paddr_t paddr_t::add_offset(device_off_t o) const {
835 PADDR_OPERATION(paddr_types_t::SEGMENT, seg_paddr_t, add_offset(o))
836 PADDR_OPERATION(paddr_types_t::RANDOM_BLOCK, blk_paddr_t, add_offset(o))
837 PADDR_OPERATION(paddr_types_t::RESERVED, res_paddr_t, add_offset(o))
838 ceph_assert(0 == "not supported type");
839 return P_ADDR_NULL;
840 }
841
842 inline paddr_t paddr_t::add_relative(paddr_t o) const {
843 assert(o.is_relative());
844 auto &res_o = o.as_res_paddr();
845 return add_offset(res_o.get_device_off());
846 }
847
848 inline paddr_t paddr_t::block_relative_to(paddr_t rhs) const {
849 return as_res_paddr().block_relative_to(rhs.as_res_paddr());
850 }
851
852 struct __attribute((packed)) paddr_le_t {
853 ceph_le64 internal_paddr =
854 ceph_le64(P_ADDR_NULL.internal_paddr);
855
856 using orig_type = paddr_t;
857
858 paddr_le_t() = default;
859 paddr_le_t(const paddr_t &addr) : internal_paddr(ceph_le64(addr.internal_paddr)) {}
860
861 operator paddr_t() const {
862 return paddr_t{internal_paddr};
863 }
864 };
865
866 using objaddr_t = uint32_t;
867 constexpr objaddr_t OBJ_ADDR_MAX = std::numeric_limits<objaddr_t>::max();
868 constexpr objaddr_t OBJ_ADDR_NULL = OBJ_ADDR_MAX;
869
/// Placement hints for extents.
enum class placement_hint_t {
  // The default user hint that expects mutations or retirement
  HOT = 0,
  // Expect no mutations and no retirement in the near future
  COLD = 1,
  // Hint for the internal rewrites
  REWRITE = 2,
  // Constant for number of hints or as NULL
  NUM_HINTS = 3
};
876
877 constexpr auto PLACEMENT_HINT_NULL = placement_hint_t::NUM_HINTS;
878
879 std::ostream& operator<<(std::ostream& out, placement_hint_t h);
880
/// Kind of device; stored in a single byte. NUM_TYPES counts the entries
/// before it.
enum class device_type_t : uint8_t {
  NONE = 0,
  HDD = 1,
  SSD = 2,
  ZNS = 3,
  EPHEMERAL_COLD = 4,
  EPHEMERAL_MAIN = 5,
  RANDOM_BLOCK_SSD = 6,
  RANDOM_BLOCK_EPHEMERAL = 7,
  NUM_TYPES = 8
};
892
893 std::ostream& operator<<(std::ostream& out, device_type_t t);
894
895 bool can_delay_allocation(device_type_t type);
896 device_type_t string_to_device_type(std::string type);
897
/// Backend flavour used to manage a device.
enum class backend_type_t {
  // SegmentManager: SSD, ZNS, HDD
  SEGMENTED = 0,
  // RBMDevice: RANDOM_BLOCK_SSD
  RANDOM_BLOCK = 1
};
902
903 std::ostream& operator<<(std::ostream& out, backend_type_t);
904 using journal_type_t = backend_type_t;
905
906 constexpr backend_type_t get_default_backend_of_device(device_type_t dtype) {
907 assert(dtype != device_type_t::NONE &&
908 dtype != device_type_t::NUM_TYPES);
909 if (dtype >= device_type_t::HDD &&
910 dtype <= device_type_t::EPHEMERAL_MAIN) {
911 return backend_type_t::SEGMENTED;
912 } else {
913 return backend_type_t::RANDOM_BLOCK;
914 }
915 }
916
917 /**
918 * Monotonically increasing identifier for the location of a
919 * journal_record.
920 */
921 // JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{}
922 struct journal_seq_t {
923 segment_seq_t segment_seq = NULL_SEG_SEQ;
924 paddr_t offset = P_ADDR_NULL;
925
926 void swap(journal_seq_t &other) {
927 std::swap(segment_seq, other.segment_seq);
928 std::swap(offset, other.offset);
929 }
930
931 // produces a pseudo journal_seq_t relative to this by offset
932 journal_seq_t add_offset(
933 journal_type_t type,
934 device_off_t off,
935 device_off_t roll_start,
936 device_off_t roll_size) const;
937
938 device_off_t relative_to(
939 journal_type_t type,
940 const journal_seq_t& r,
941 device_off_t roll_start,
942 device_off_t roll_size) const;
943
944 DENC(journal_seq_t, v, p) {
945 DENC_START(1, 1, p);
946 denc(v.segment_seq, p);
947 denc(v.offset, p);
948 DENC_FINISH(p);
949 }
950
951 bool operator==(const journal_seq_t &o) const { return cmp(o) == 0; }
952 bool operator!=(const journal_seq_t &o) const { return cmp(o) != 0; }
953 bool operator<(const journal_seq_t &o) const { return cmp(o) < 0; }
954 bool operator<=(const journal_seq_t &o) const { return cmp(o) <= 0; }
955 bool operator>(const journal_seq_t &o) const { return cmp(o) > 0; }
956 bool operator>=(const journal_seq_t &o) const { return cmp(o) >= 0; }
957
958 private:
959 int cmp(const journal_seq_t &other) const {
960 if (segment_seq > other.segment_seq) {
961 return 1;
962 } else if (segment_seq < other.segment_seq) {
963 return -1;
964 }
965 using ret_t = std::pair<device_off_t, segment_id_t>;
966 auto to_pair = [](const paddr_t &addr) -> ret_t {
967 if (addr.get_addr_type() == paddr_types_t::SEGMENT) {
968 auto &seg_addr = addr.as_seg_paddr();
969 return ret_t(seg_addr.get_segment_off(), seg_addr.get_segment_id());
970 } else if (addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK) {
971 auto &blk_addr = addr.as_blk_paddr();
972 return ret_t(blk_addr.get_device_off(), MAX_SEG_ID);
973 } else if (addr.get_addr_type() == paddr_types_t::RESERVED) {
974 auto &res_addr = addr.as_res_paddr();
975 return ret_t(res_addr.get_device_off(), MAX_SEG_ID);
976 } else {
977 assert(0 == "impossible");
978 return ret_t(0, MAX_SEG_ID);
979 }
980 };
981 auto left = to_pair(offset);
982 auto right = to_pair(other.offset);
983 if (left > right) {
984 return 1;
985 } else if (left < right) {
986 return -1;
987 } else {
988 return 0;
989 }
990 }
991 };
992
993 std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq);
994
995 constexpr journal_seq_t JOURNAL_SEQ_MIN{
996 0,
997 P_ADDR_MIN
998 };
999 constexpr journal_seq_t JOURNAL_SEQ_MAX{
1000 MAX_SEG_SEQ,
1001 P_ADDR_MAX
1002 };
1003 // JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{}
1004 constexpr journal_seq_t JOURNAL_SEQ_NULL = JOURNAL_SEQ_MAX;
1005
1006 // logical addr, see LBAManager, TransactionManager
1007 using laddr_t = uint64_t;
1008 constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min();
1009 constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max();
1010 constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX;
1011 constexpr laddr_t L_ADDR_ROOT = L_ADDR_MAX - 1;
1012 constexpr laddr_t L_ADDR_LBAT = L_ADDR_MAX - 2;
1013
1014 struct __attribute((packed)) laddr_le_t {
1015 ceph_le64 laddr = ceph_le64(L_ADDR_NULL);
1016
1017 using orig_type = laddr_t;
1018
1019 laddr_le_t() = default;
1020 laddr_le_t(const laddr_le_t &) = default;
1021 explicit laddr_le_t(const laddr_t &addr)
1022 : laddr(ceph_le64(addr)) {}
1023
1024 operator laddr_t() const {
1025 return laddr_t(laddr);
1026 }
1027 laddr_le_t& operator=(laddr_t addr) {
1028 ceph_le64 val;
1029 val = addr;
1030 laddr = val;
1031 return *this;
1032 }
1033 };
1034
1035 // logical offset, see LBAManager, TransactionManager
1036 using extent_len_t = uint32_t;
1037 constexpr extent_len_t EXTENT_LEN_MAX =
1038 std::numeric_limits<extent_len_t>::max();
1039
1040 using extent_len_le_t = ceph_le32;
1041 inline extent_len_le_t init_extent_len_le(extent_len_t len) {
1042 return ceph_le32(len);
1043 }
1044
1045 struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> {
1046 template <typename... T>
1047 laddr_list_t(T&&... args)
1048 : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {}
1049 };
1050 struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> {
1051 template <typename... T>
1052 paddr_list_t(T&&... args)
1053 : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {}
1054 };
1055
1056 std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs);
1057 std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs);
1058
/* identifies type of extent, used for interpreting deltas, managing
 * writeback.
 *
 * Note that any new extent type needs to be added to
 * Cache::get_extent_by_type in cache.cc
 */
enum class extent_types_t : uint8_t {
  ROOT = 0,
  LADDR_INTERNAL = 1,
  LADDR_LEAF = 2,
  DINK_LADDR_LEAF = 3,   // should only be used for unittests
  OMAP_INNER = 4,
  OMAP_LEAF = 5,
  ONODE_BLOCK_STAGED = 6,
  COLL_BLOCK = 7,
  OBJECT_DATA_BLOCK = 8,
  RETIRED_PLACEHOLDER = 9,
  // the following two types are not extent types,
  // they are just used to indicate paddr allocation deltas
  ALLOC_INFO = 10,
  JOURNAL_TAIL = 11,
  // Test Block Types
  TEST_BLOCK = 12,
  TEST_BLOCK_PHYSICAL = 13,
  // Backref tree node types
  BACKREF_INTERNAL = 14,
  BACKREF_LEAF = 15,
  // None and the number of valid extent_types_t
  NONE = 16,
};

// Single-byte raw form of extent_types_t.
using extent_types_le_t = uint8_t;

// Numeric value of NONE, i.e. the count of valid extent types.
constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE);
1090
1091 constexpr size_t BACKREF_NODE_SIZE = 4096;
1092
1093 std::ostream &operator<<(std::ostream &out, extent_types_t t);
1094
1095 constexpr bool is_logical_type(extent_types_t type) {
1096 switch (type) {
1097 case extent_types_t::ROOT:
1098 case extent_types_t::LADDR_INTERNAL:
1099 case extent_types_t::LADDR_LEAF:
1100 case extent_types_t::BACKREF_INTERNAL:
1101 case extent_types_t::BACKREF_LEAF:
1102 return false;
1103 default:
1104 return true;
1105 }
1106 }
1107
1108 constexpr bool is_retired_placeholder(extent_types_t type)
1109 {
1110 return type == extent_types_t::RETIRED_PLACEHOLDER;
1111 }
1112
1113 constexpr bool is_lba_node(extent_types_t type)
1114 {
1115 return type == extent_types_t::LADDR_INTERNAL ||
1116 type == extent_types_t::LADDR_LEAF ||
1117 type == extent_types_t::DINK_LADDR_LEAF;
1118 }
1119
1120 constexpr bool is_backref_node(extent_types_t type)
1121 {
1122 return type == extent_types_t::BACKREF_INTERNAL ||
1123 type == extent_types_t::BACKREF_LEAF;
1124 }
1125
1126 constexpr bool is_lba_backref_node(extent_types_t type)
1127 {
1128 return is_lba_node(type) || is_backref_node(type);
1129 }
1130
1131 std::ostream &operator<<(std::ostream &out, extent_types_t t);
1132
/**
 * rewrite_gen_t
 *
 * The goal is to group the similar aged extents in the same segment for better
 * bimodal utilization distribution, and also to the same device tier. For EPM,
 * it has the flexibility to make placement decisions by re-assigning the
 * generation. And each non-inline generation will be statically mapped to a
 * writer in EPM.
 *
 * All the fresh and dirty extents start with INIT_GENERATION upon allocation,
 * and they will be assigned to INLINE/OOL generation by EPM before the initial
 * writes. After that, the generation can only be increased upon rewrite.
 *
 * Note, although EPM can re-assign the generations according to the tiering
 * status, it cannot decrease the generation for the correctness of space
 * reservation. It may choose to assign a larger generation if the extent is
 * hinted cold, or if it wants to evict extents to the cold tier. And it may
 * choose to not increase the generation if it wants to keep the hot tier as
 * filled as possible.
 */
using rewrite_gen_t = uint8_t;

// INIT_GENERATION requires EPM decision to INLINE/OOL_GENERATION
constexpr rewrite_gen_t INIT_GENERATION = 0;
constexpr rewrite_gen_t INLINE_GENERATION = 1; // to the journal
constexpr rewrite_gen_t OOL_GENERATION = 2;

// All the rewritten extents start with MIN_REWRITE_GENERATION
constexpr rewrite_gen_t MIN_REWRITE_GENERATION = 3;
// without cold tier, the largest generation is less than MIN_COLD_GENERATION
constexpr rewrite_gen_t MIN_COLD_GENERATION = 5;
constexpr rewrite_gen_t MAX_REWRITE_GENERATION = 7;
// one past the largest generation
constexpr rewrite_gen_t REWRITE_GENERATIONS = MAX_REWRITE_GENERATION + 1;
// the null generation is the largest value rewrite_gen_t can hold
constexpr rewrite_gen_t NULL_GENERATION =
  std::numeric_limits<rewrite_gen_t>::max();
1168
1169 struct rewrite_gen_printer_t {
1170 rewrite_gen_t gen;
1171 };
1172
1173 std::ostream &operator<<(std::ostream &out, rewrite_gen_printer_t gen);
1174
1175 constexpr std::size_t generation_to_writer(rewrite_gen_t gen) {
1176 // caller to assert the gen is in the reasonable range
1177 return gen - OOL_GENERATION;
1178 }
1179
1180 // before EPM decision
1181 constexpr bool is_target_rewrite_generation(rewrite_gen_t gen) {
1182 return gen == INIT_GENERATION ||
1183 (gen >= MIN_REWRITE_GENERATION &&
1184 gen <= REWRITE_GENERATIONS);
1185 }
1186
1187 // after EPM decision
1188 constexpr bool is_rewrite_generation(rewrite_gen_t gen) {
1189 return gen >= INLINE_GENERATION &&
1190 gen < REWRITE_GENERATIONS;
1191 }
1192
/// Coarse classification of an extent as metadata or data.  NUM is the
/// number of real categories.
enum class data_category_t : uint8_t {
  METADATA = 0,
  DATA,
  NUM
};

std::ostream &operator<<(std::ostream &out, data_category_t c);
1200
1201 constexpr data_category_t get_extent_category(extent_types_t type) {
1202 if (type == extent_types_t::OBJECT_DATA_BLOCK ||
1203 type == extent_types_t::TEST_BLOCK) {
1204 return data_category_t::DATA;
1205 } else {
1206 return data_category_t::METADATA;
1207 }
1208 }
1209
1210 // type for extent modification time, milliseconds since the epoch
1211 using sea_time_point = seastar::lowres_system_clock::time_point;
1212 using sea_duration = seastar::lowres_system_clock::duration;
1213 using mod_time_point_t = int64_t;
1214
1215 constexpr mod_time_point_t
1216 timepoint_to_mod(const sea_time_point &t) {
1217 return std::chrono::duration_cast<std::chrono::milliseconds>(
1218 t.time_since_epoch()).count();
1219 }
1220
1221 constexpr sea_time_point
1222 mod_to_timepoint(mod_time_point_t t) {
1223 return sea_time_point(std::chrono::duration_cast<sea_duration>(
1224 std::chrono::milliseconds(t)));
1225 }
1226
1227 constexpr auto NULL_TIME = sea_time_point();
1228 constexpr auto NULL_MOD_TIME = timepoint_to_mod(NULL_TIME);
1229
1230 struct sea_time_point_printer_t {
1231 sea_time_point tp;
1232 };
1233 std::ostream &operator<<(std::ostream &out, sea_time_point_printer_t tp);
1234
1235 struct mod_time_point_printer_t {
1236 mod_time_point_t tp;
1237 };
1238 std::ostream &operator<<(std::ostream &out, mod_time_point_printer_t tp);
1239
1240 constexpr sea_time_point
1241 get_average_time(const sea_time_point& t1, std::size_t n1,
1242 const sea_time_point& t2, std::size_t n2) {
1243 assert(t1 != NULL_TIME);
1244 assert(t2 != NULL_TIME);
1245 auto new_size = n1 + n2;
1246 assert(new_size > 0);
1247 auto c1 = t1.time_since_epoch().count();
1248 auto c2 = t2.time_since_epoch().count();
1249 auto c_ret = c1 / new_size * n1 + c2 / new_size * n2;
1250 return sea_time_point(sea_duration(c_ret));
1251 }
1252
1253 /* description of a new physical extent */
1254 struct extent_t {
1255 extent_types_t type; ///< type of extent
1256 laddr_t addr; ///< laddr of extent (L_ADDR_NULL for non-logical)
1257 ceph::bufferlist bl; ///< payload, bl.length() == length, aligned
1258 };
1259
1260 using extent_version_t = uint32_t;
1261
1262 /* description of a mutation to a physical extent */
1263 struct delta_info_t {
1264 extent_types_t type = extent_types_t::NONE; ///< delta type
1265 paddr_t paddr; ///< physical address
1266 laddr_t laddr = L_ADDR_NULL; ///< logical address
1267 uint32_t prev_crc = 0;
1268 uint32_t final_crc = 0;
1269 extent_len_t length = 0; ///< extent length
1270 extent_version_t pversion; ///< prior version
1271 segment_seq_t ext_seq; ///< seq of the extent's segment
1272 segment_type_t seg_type;
1273 ceph::bufferlist bl; ///< payload
1274
1275 DENC(delta_info_t, v, p) {
1276 DENC_START(1, 1, p);
1277 denc(v.type, p);
1278 denc(v.paddr, p);
1279 denc(v.laddr, p);
1280 denc(v.prev_crc, p);
1281 denc(v.final_crc, p);
1282 denc(v.length, p);
1283 denc(v.pversion, p);
1284 denc(v.ext_seq, p);
1285 denc(v.seg_type, p);
1286 denc(v.bl, p);
1287 DENC_FINISH(p);
1288 }
1289
1290 bool operator==(const delta_info_t &rhs) const {
1291 return (
1292 type == rhs.type &&
1293 paddr == rhs.paddr &&
1294 laddr == rhs.laddr &&
1295 prev_crc == rhs.prev_crc &&
1296 final_crc == rhs.final_crc &&
1297 length == rhs.length &&
1298 pversion == rhs.pversion &&
1299 ext_seq == rhs.ext_seq &&
1300 bl == rhs.bl
1301 );
1302 }
1303 };
1304
1305 std::ostream &operator<<(std::ostream &out, const delta_info_t &delta);
1306
1307 /* contains the latest journal tail information */
1308 struct journal_tail_delta_t {
1309 journal_seq_t alloc_tail;
1310 journal_seq_t dirty_tail;
1311
1312 DENC(journal_tail_delta_t, v, p) {
1313 DENC_START(1, 1, p);
1314 denc(v.alloc_tail, p);
1315 denc(v.dirty_tail, p);
1316 DENC_FINISH(p);
1317 }
1318 };
1319
1320 std::ostream &operator<<(std::ostream &out, const journal_tail_delta_t &delta);
1321
1322 class object_data_t {
1323 laddr_t reserved_data_base = L_ADDR_NULL;
1324 extent_len_t reserved_data_len = 0;
1325
1326 bool dirty = false;
1327 public:
1328 object_data_t(
1329 laddr_t reserved_data_base,
1330 extent_len_t reserved_data_len)
1331 : reserved_data_base(reserved_data_base),
1332 reserved_data_len(reserved_data_len) {}
1333
1334 laddr_t get_reserved_data_base() const {
1335 return reserved_data_base;
1336 }
1337
1338 extent_len_t get_reserved_data_len() const {
1339 return reserved_data_len;
1340 }
1341
1342 bool is_null() const {
1343 return reserved_data_base == L_ADDR_NULL;
1344 }
1345
1346 bool must_update() const {
1347 return dirty;
1348 }
1349
1350 void update_reserved(
1351 laddr_t base,
1352 extent_len_t len) {
1353 dirty = true;
1354 reserved_data_base = base;
1355 reserved_data_len = len;
1356 }
1357
1358 void update_len(
1359 extent_len_t len) {
1360 dirty = true;
1361 reserved_data_len = len;
1362 }
1363
1364 void clear() {
1365 dirty = true;
1366 reserved_data_base = L_ADDR_NULL;
1367 reserved_data_len = 0;
1368 }
1369 };
1370
1371 struct __attribute__((packed)) object_data_le_t {
1372 laddr_le_t reserved_data_base = laddr_le_t(L_ADDR_NULL);
1373 extent_len_le_t reserved_data_len = init_extent_len_le(0);
1374
1375 void update(const object_data_t &nroot) {
1376 reserved_data_base = nroot.get_reserved_data_base();
1377 reserved_data_len = init_extent_len_le(nroot.get_reserved_data_len());
1378 }
1379
1380 object_data_t get() const {
1381 return object_data_t(
1382 reserved_data_base,
1383 reserved_data_len);
1384 }
1385 };
1386
1387 struct omap_root_t {
1388 laddr_t addr = L_ADDR_NULL;
1389 depth_t depth = 0;
1390 laddr_t hint = L_ADDR_MIN;
1391 bool mutated = false;
1392
1393 omap_root_t() = default;
1394 omap_root_t(laddr_t addr, depth_t depth, laddr_t addr_min)
1395 : addr(addr),
1396 depth(depth),
1397 hint(addr_min) {}
1398
1399 omap_root_t(const omap_root_t &o) = default;
1400 omap_root_t(omap_root_t &&o) = default;
1401 omap_root_t &operator=(const omap_root_t &o) = default;
1402 omap_root_t &operator=(omap_root_t &&o) = default;
1403
1404 bool is_null() const {
1405 return addr == L_ADDR_NULL;
1406 }
1407
1408 bool must_update() const {
1409 return mutated;
1410 }
1411
1412 void update(laddr_t _addr, depth_t _depth, laddr_t _hint) {
1413 mutated = true;
1414 addr = _addr;
1415 depth = _depth;
1416 hint = _hint;
1417 }
1418
1419 laddr_t get_location() const {
1420 return addr;
1421 }
1422
1423 depth_t get_depth() const {
1424 return depth;
1425 }
1426
1427 laddr_t get_hint() const {
1428 return hint;
1429 }
1430 };
1431 std::ostream &operator<<(std::ostream &out, const omap_root_t &root);
1432
1433 class __attribute__((packed)) omap_root_le_t {
1434 laddr_le_t addr = laddr_le_t(L_ADDR_NULL);
1435 depth_le_t depth = init_depth_le(0);
1436
1437 public:
1438 omap_root_le_t() = default;
1439
1440 omap_root_le_t(laddr_t addr, depth_t depth)
1441 : addr(addr), depth(init_depth_le(depth)) {}
1442
1443 omap_root_le_t(const omap_root_le_t &o) = default;
1444 omap_root_le_t(omap_root_le_t &&o) = default;
1445 omap_root_le_t &operator=(const omap_root_le_t &o) = default;
1446 omap_root_le_t &operator=(omap_root_le_t &&o) = default;
1447
1448 void update(const omap_root_t &nroot) {
1449 addr = nroot.get_location();
1450 depth = init_depth_le(nroot.get_depth());
1451 }
1452
1453 omap_root_t get(laddr_t hint) const {
1454 return omap_root_t(addr, depth, hint);
1455 }
1456 };
1457
1458 /**
1459 * phy_tree_root_t
1460 */
1461 class __attribute__((packed)) phy_tree_root_t {
1462 paddr_le_t root_addr;
1463 depth_le_t depth = init_extent_len_le(0);
1464
1465 public:
1466 phy_tree_root_t() = default;
1467
1468 phy_tree_root_t(paddr_t addr, depth_t depth)
1469 : root_addr(addr), depth(init_depth_le(depth)) {}
1470
1471 phy_tree_root_t(const phy_tree_root_t &o) = default;
1472 phy_tree_root_t(phy_tree_root_t &&o) = default;
1473 phy_tree_root_t &operator=(const phy_tree_root_t &o) = default;
1474 phy_tree_root_t &operator=(phy_tree_root_t &&o) = default;
1475
1476 paddr_t get_location() const {
1477 return root_addr;
1478 }
1479
1480 void set_location(paddr_t location) {
1481 root_addr = location;
1482 }
1483
1484 depth_t get_depth() const {
1485 return depth;
1486 }
1487
1488 void set_depth(depth_t ndepth) {
1489 depth = ndepth;
1490 }
1491
1492 void adjust_addrs_from_base(paddr_t base) {
1493 paddr_t _root_addr = root_addr;
1494 if (_root_addr.is_relative()) {
1495 root_addr = base.add_record_relative(_root_addr);
1496 }
1497 }
1498 };
1499
1500 class coll_root_t {
1501 laddr_t addr = L_ADDR_NULL;
1502 extent_len_t size = 0;
1503
1504 bool mutated = false;
1505
1506 public:
1507 coll_root_t() = default;
1508 coll_root_t(laddr_t addr, extent_len_t size) : addr(addr), size(size) {}
1509
1510 coll_root_t(const coll_root_t &o) = default;
1511 coll_root_t(coll_root_t &&o) = default;
1512 coll_root_t &operator=(const coll_root_t &o) = default;
1513 coll_root_t &operator=(coll_root_t &&o) = default;
1514
1515 bool must_update() const {
1516 return mutated;
1517 }
1518
1519 void update(laddr_t _addr, extent_len_t _s) {
1520 mutated = true;
1521 addr = _addr;
1522 size = _s;
1523 }
1524
1525 laddr_t get_location() const {
1526 return addr;
1527 }
1528
1529 extent_len_t get_size() const {
1530 return size;
1531 }
1532 };
1533
1534 /**
1535 * coll_root_le_t
1536 *
1537 * Information for locating CollectionManager information, to be embedded
1538 * in root block.
1539 */
1540 class __attribute__((packed)) coll_root_le_t {
1541 laddr_le_t addr;
1542 extent_len_le_t size = init_extent_len_le(0);
1543
1544 public:
1545 coll_root_le_t() = default;
1546
1547 coll_root_le_t(laddr_t laddr, extent_len_t size)
1548 : addr(laddr), size(init_extent_len_le(size)) {}
1549
1550
1551 coll_root_le_t(const coll_root_le_t &o) = default;
1552 coll_root_le_t(coll_root_le_t &&o) = default;
1553 coll_root_le_t &operator=(const coll_root_le_t &o) = default;
1554 coll_root_le_t &operator=(coll_root_le_t &&o) = default;
1555
1556 void update(const coll_root_t &nroot) {
1557 addr = nroot.get_location();
1558 size = init_extent_len_le(nroot.get_size());
1559 }
1560
1561 coll_root_t get() const {
1562 return coll_root_t(addr, size);
1563 }
1564 };
1565
1566 using lba_root_t = phy_tree_root_t;
1567 using backref_root_t = phy_tree_root_t;
1568
1569 /**
1570 * root_t
1571 *
1572 * Contains information required to find metadata roots.
1573 * TODO: generalize this to permit more than one lba_manager implementation
1574 */
1575 struct __attribute__((packed)) root_t {
1576 using meta_t = std::map<std::string, std::string>;
1577
1578 static constexpr int MAX_META_LENGTH = 1024;
1579
1580 backref_root_t backref_root;
1581 lba_root_t lba_root;
1582 laddr_le_t onode_root;
1583 coll_root_le_t collection_root;
1584
1585 char meta[MAX_META_LENGTH];
1586
1587 root_t() {
1588 set_meta(meta_t{});
1589 }
1590
1591 void adjust_addrs_from_base(paddr_t base) {
1592 lba_root.adjust_addrs_from_base(base);
1593 backref_root.adjust_addrs_from_base(base);
1594 }
1595
1596 meta_t get_meta() {
1597 bufferlist bl;
1598 bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta));
1599 meta_t ret;
1600 auto iter = bl.cbegin();
1601 decode(ret, iter);
1602 return ret;
1603 }
1604
1605 void set_meta(const meta_t &m) {
1606 ceph::bufferlist bl;
1607 encode(m, bl);
1608 ceph_assert(bl.length() < MAX_META_LENGTH);
1609 bl.rebuild();
1610 auto &bptr = bl.front();
1611 ::memset(meta, 0, MAX_META_LENGTH);
1612 ::memcpy(meta, bptr.c_str(), bl.length());
1613 }
1614 };
1615
1616 struct alloc_blk_t {
1617 alloc_blk_t(
1618 paddr_t paddr,
1619 laddr_t laddr,
1620 extent_len_t len,
1621 extent_types_t type)
1622 : paddr(paddr), laddr(laddr), len(len), type(type)
1623 {}
1624
1625 explicit alloc_blk_t() = default;
1626
1627 paddr_t paddr = P_ADDR_NULL;
1628 laddr_t laddr = L_ADDR_NULL;
1629 extent_len_t len = 0;
1630 extent_types_t type = extent_types_t::ROOT;
1631 DENC(alloc_blk_t, v, p) {
1632 DENC_START(1, 1, p);
1633 denc(v.paddr, p);
1634 denc(v.laddr, p);
1635 denc(v.len, p);
1636 denc(v.type, p);
1637 DENC_FINISH(p);
1638 }
1639 };
1640
1641 // use absolute address
1642 struct alloc_delta_t {
1643 enum class op_types_t : uint8_t {
1644 NONE = 0,
1645 SET = 1,
1646 CLEAR = 2
1647 };
1648 std::vector<alloc_blk_t> alloc_blk_ranges;
1649 op_types_t op = op_types_t::NONE;
1650
1651 alloc_delta_t() = default;
1652
1653 DENC(alloc_delta_t, v, p) {
1654 DENC_START(1, 1, p);
1655 denc(v.alloc_blk_ranges, p);
1656 denc(v.op, p);
1657 DENC_FINISH(p);
1658 }
1659 };
1660
1661 struct extent_info_t {
1662 extent_types_t type = extent_types_t::NONE;
1663 laddr_t addr = L_ADDR_NULL;
1664 extent_len_t len = 0;
1665
1666 extent_info_t() = default;
1667 extent_info_t(const extent_t &et)
1668 : type(et.type), addr(et.addr),
1669 len(et.bl.length())
1670 {}
1671
1672 DENC(extent_info_t, v, p) {
1673 DENC_START(1, 1, p);
1674 denc(v.type, p);
1675 denc(v.addr, p);
1676 denc(v.len, p);
1677 DENC_FINISH(p);
1678 }
1679 };
1680 std::ostream &operator<<(std::ostream &out, const extent_info_t &header);
1681
1682 using segment_nonce_t = uint32_t;
1683
1684 /**
1685 * Segment header
1686 *
1687 * Every segment contains and encode segment_header_t in the first block.
1688 * Our strategy for finding the journal replay point is:
1689 * 1) Find the segment with the highest journal_segment_seq
1690 * 2) Get dirty_tail and alloc_tail from the segment header
1691 * 3) Scan forward to update tails from journal_tail_delta_t
1692 * 4) Replay from the latest tails
1693 */
1694 struct segment_header_t {
1695 segment_seq_t segment_seq;
1696 segment_id_t physical_segment_id; // debugging
1697
1698 journal_seq_t dirty_tail;
1699 journal_seq_t alloc_tail;
1700 segment_nonce_t segment_nonce;
1701
1702 segment_type_t type;
1703
1704 data_category_t category;
1705 rewrite_gen_t generation;
1706
1707 segment_type_t get_type() const {
1708 return type;
1709 }
1710
1711 DENC(segment_header_t, v, p) {
1712 DENC_START(1, 1, p);
1713 denc(v.segment_seq, p);
1714 denc(v.physical_segment_id, p);
1715 denc(v.dirty_tail, p);
1716 denc(v.alloc_tail, p);
1717 denc(v.segment_nonce, p);
1718 denc(v.type, p);
1719 denc(v.category, p);
1720 denc(v.generation, p);
1721 DENC_FINISH(p);
1722 }
1723 };
1724 std::ostream &operator<<(std::ostream &out, const segment_header_t &header);
1725
1726 struct segment_tail_t {
1727 segment_seq_t segment_seq;
1728 segment_id_t physical_segment_id; // debugging
1729
1730 segment_nonce_t segment_nonce;
1731
1732 segment_type_t type;
1733
1734 mod_time_point_t modify_time;
1735 std::size_t num_extents;
1736
1737 segment_type_t get_type() const {
1738 return type;
1739 }
1740
1741 DENC(segment_tail_t, v, p) {
1742 DENC_START(1, 1, p);
1743 denc(v.segment_seq, p);
1744 denc(v.physical_segment_id, p);
1745 denc(v.segment_nonce, p);
1746 denc(v.type, p);
1747 denc(v.modify_time, p);
1748 denc(v.num_extents, p);
1749 DENC_FINISH(p);
1750 }
1751 };
1752 std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail);
1753
/// Kinds of transaction.  MAX is not a real kind: it doubles as the null
/// value and as the number of real kinds.
enum class transaction_type_t : uint8_t {
  MUTATE = 0,
  READ, // including weak and non-weak read transactions
  TRIM_DIRTY,
  TRIM_ALLOC,
  CLEANER_MAIN,
  CLEANER_COLD,
  MAX
};

// null transaction type
static constexpr auto TRANSACTION_TYPE_NULL = transaction_type_t::MAX;

// number of real transaction types, as a std::size_t
static constexpr auto TRANSACTION_TYPE_MAX =
  static_cast<std::size_t>(transaction_type_t::MAX);

std::ostream &operator<<(std::ostream &os, transaction_type_t type);
1770
1771 constexpr bool is_valid_transaction(transaction_type_t type) {
1772 return type < transaction_type_t::MAX;
1773 }
1774
1775 constexpr bool is_background_transaction(transaction_type_t type) {
1776 return (type >= transaction_type_t::TRIM_DIRTY &&
1777 type < transaction_type_t::MAX);
1778 }
1779
1780 constexpr bool is_trim_transaction(transaction_type_t type) {
1781 return (type == transaction_type_t::TRIM_DIRTY ||
1782 type == transaction_type_t::TRIM_ALLOC);
1783 }
1784
1785 struct record_size_t {
1786 extent_len_t plain_mdlength = 0; // mdlength without the record header
1787 extent_len_t dlength = 0;
1788
1789 extent_len_t get_raw_mdlength() const;
1790
1791 bool is_empty() const {
1792 return plain_mdlength == 0 &&
1793 dlength == 0;
1794 }
1795
1796 void account_extent(extent_len_t extent_len);
1797
1798 void account(const extent_t& extent) {
1799 account_extent(extent.bl.length());
1800 }
1801
1802 void account(const delta_info_t& delta);
1803
1804 bool operator==(const record_size_t &) const = default;
1805 };
1806 std::ostream &operator<<(std::ostream&, const record_size_t&);
1807
1808 struct record_t {
1809 transaction_type_t type = TRANSACTION_TYPE_NULL;
1810 std::vector<extent_t> extents;
1811 std::vector<delta_info_t> deltas;
1812 record_size_t size;
1813 sea_time_point modify_time = NULL_TIME;
1814
1815 record_t(transaction_type_t type) : type{type} { }
1816
1817 // unit test only
1818 record_t() {
1819 type = transaction_type_t::MUTATE;
1820 }
1821
1822 // unit test only
1823 record_t(std::vector<extent_t>&& _extents,
1824 std::vector<delta_info_t>&& _deltas) {
1825 auto modify_time = seastar::lowres_system_clock::now();
1826 for (auto& e: _extents) {
1827 push_back(std::move(e), modify_time);
1828 }
1829 for (auto& d: _deltas) {
1830 push_back(std::move(d));
1831 }
1832 type = transaction_type_t::MUTATE;
1833 }
1834
1835 bool is_empty() const {
1836 return extents.size() == 0 &&
1837 deltas.size() == 0;
1838 }
1839
1840 std::size_t get_delta_size() const {
1841 auto delta_size = std::accumulate(
1842 deltas.begin(), deltas.end(), 0,
1843 [](uint64_t sum, auto& delta) {
1844 return sum + delta.bl.length();
1845 }
1846 );
1847 return delta_size;
1848 }
1849
1850 void push_back(extent_t&& extent, sea_time_point &t) {
1851 ceph_assert(t != NULL_TIME);
1852 if (extents.size() == 0) {
1853 assert(modify_time == NULL_TIME);
1854 modify_time = t;
1855 } else {
1856 modify_time = get_average_time(modify_time, extents.size(), t, 1);
1857 }
1858 size.account(extent);
1859 extents.push_back(std::move(extent));
1860 }
1861
1862 void push_back(delta_info_t&& delta) {
1863 size.account(delta);
1864 deltas.push_back(std::move(delta));
1865 }
1866 };
1867 std::ostream &operator<<(std::ostream&, const record_t&);
1868
1869 struct record_header_t {
1870 transaction_type_t type;
1871 uint32_t deltas; // number of deltas
1872 uint32_t extents; // number of extents
1873 mod_time_point_t modify_time;
1874
1875 DENC(record_header_t, v, p) {
1876 DENC_START(1, 1, p);
1877 denc(v.type, p);
1878 denc(v.deltas, p);
1879 denc(v.extents, p);
1880 denc(v.modify_time, p);
1881 DENC_FINISH(p);
1882 }
1883 };
1884 std::ostream &operator<<(std::ostream&, const record_header_t&);
1885
1886 struct record_group_header_t {
1887 uint32_t records;
1888 extent_len_t mdlength; // block aligned, length of metadata
1889 extent_len_t dlength; // block aligned, length of data
1890 segment_nonce_t segment_nonce;// nonce of containing segment
1891 journal_seq_t committed_to; // records prior to committed_to have been
1892 // fully written, maybe in another segment.
1893 checksum_t data_crc; // crc of data payload
1894
1895
1896 DENC(record_group_header_t, v, p) {
1897 DENC_START(1, 1, p);
1898 denc(v.records, p);
1899 denc(v.mdlength, p);
1900 denc(v.dlength, p);
1901 denc(v.segment_nonce, p);
1902 denc(v.committed_to, p);
1903 denc(v.data_crc, p);
1904 DENC_FINISH(p);
1905 }
1906 };
1907 std::ostream& operator<<(std::ostream&, const record_group_header_t&);
1908
1909 struct record_group_size_t {
1910 extent_len_t plain_mdlength = 0; // mdlength without the group header
1911 extent_len_t dlength = 0;
1912 extent_len_t block_size = 0;
1913
1914 record_group_size_t() = default;
1915 record_group_size_t(
1916 const record_size_t& rsize,
1917 extent_len_t block_size) {
1918 account(rsize, block_size);
1919 }
1920
1921 extent_len_t get_raw_mdlength() const;
1922
1923 extent_len_t get_mdlength() const {
1924 assert(block_size > 0);
1925 return p2roundup(get_raw_mdlength(), block_size);
1926 }
1927
1928 extent_len_t get_encoded_length() const {
1929 assert(block_size > 0);
1930 assert(dlength % block_size == 0);
1931 return get_mdlength() + dlength;
1932 }
1933
1934 record_group_size_t get_encoded_length_after(
1935 const record_size_t& rsize,
1936 extent_len_t block_size) const {
1937 record_group_size_t tmp = *this;
1938 tmp.account(rsize, block_size);
1939 return tmp;
1940 }
1941
1942 double get_fullness() const {
1943 assert(block_size > 0);
1944 return ((double)(get_raw_mdlength() + dlength) /
1945 get_encoded_length());
1946 }
1947
1948 void account(const record_size_t& rsize,
1949 extent_len_t block_size);
1950
1951 bool operator==(const record_group_size_t &) const = default;
1952 };
1953 std::ostream& operator<<(std::ostream&, const record_group_size_t&);
1954
1955 struct record_group_t {
1956 std::vector<record_t> records;
1957 record_group_size_t size;
1958
1959 record_group_t() = default;
1960 record_group_t(
1961 record_t&& record,
1962 extent_len_t block_size) {
1963 push_back(std::move(record), block_size);
1964 }
1965
1966 std::size_t get_size() const {
1967 return records.size();
1968 }
1969
1970 void push_back(
1971 record_t&& record,
1972 extent_len_t block_size) {
1973 size.account(record.size, block_size);
1974 records.push_back(std::move(record));
1975 assert(size.get_encoded_length() < SEGMENT_OFF_MAX);
1976 }
1977
1978 void reserve(std::size_t limit) {
1979 records.reserve(limit);
1980 }
1981
1982 void clear() {
1983 records.clear();
1984 size = {};
1985 }
1986 };
1987 std::ostream& operator<<(std::ostream&, const record_group_t&);
1988
1989 ceph::bufferlist encode_record(
1990 record_t&& record,
1991 extent_len_t block_size,
1992 const journal_seq_t& committed_to,
1993 segment_nonce_t current_segment_nonce);
1994
1995 ceph::bufferlist encode_records(
1996 record_group_t& record_group,
1997 const journal_seq_t& committed_to,
1998 segment_nonce_t current_segment_nonce);
1999
2000 std::optional<record_group_header_t>
2001 try_decode_records_header(
2002 const ceph::bufferlist& header_bl,
2003 segment_nonce_t expected_nonce);
2004
2005 bool validate_records_metadata(
2006 const ceph::bufferlist& md_bl);
2007
2008 bool validate_records_data(
2009 const record_group_header_t& header,
2010 const ceph::bufferlist& data_bl);
2011
2012 struct record_extent_infos_t {
2013 record_header_t header;
2014 std::vector<extent_info_t> extent_infos;
2015 };
2016 std::optional<std::vector<record_extent_infos_t> >
2017 try_decode_extent_infos(
2018 const record_group_header_t& header,
2019 const ceph::bufferlist& md_bl);
2020 std::optional<std::vector<record_header_t>>
2021 try_decode_record_headers(
2022 const record_group_header_t& header,
2023 const ceph::bufferlist& md_bl);
2024
2025 struct record_deltas_t {
2026 paddr_t record_block_base;
2027 std::vector<std::pair<sea_time_point, delta_info_t>> deltas;
2028 };
2029 std::optional<std::vector<record_deltas_t> >
2030 try_decode_deltas(
2031 const record_group_header_t& header,
2032 const ceph::bufferlist& md_bl,
2033 paddr_t record_block_base);
2034
2035 struct write_result_t {
2036 journal_seq_t start_seq;
2037 extent_len_t length;
2038
2039 journal_seq_t get_end_seq() const {
2040 return journal_seq_t{
2041 start_seq.segment_seq,
2042 start_seq.offset.add_offset(length)};
2043 }
2044 };
2045 std::ostream& operator<<(std::ostream&, const write_result_t&);
2046
2047 struct record_locator_t {
2048 paddr_t record_block_base;
2049 write_result_t write_result;
2050 };
2051 std::ostream& operator<<(std::ostream&, const record_locator_t&);
2052
2053 /// scan segment for end incrementally
2054 struct scan_valid_records_cursor {
2055 bool last_valid_header_found = false;
2056 journal_seq_t seq;
2057 journal_seq_t last_committed;
2058 std::size_t num_consumed_records = 0;
2059
2060 struct found_record_group_t {
2061 paddr_t offset;
2062 record_group_header_t header;
2063 bufferlist mdbuffer;
2064
2065 found_record_group_t(
2066 paddr_t offset,
2067 const record_group_header_t &header,
2068 const bufferlist &mdbuffer)
2069 : offset(offset), header(header), mdbuffer(mdbuffer) {}
2070 };
2071 std::deque<found_record_group_t> pending_record_groups;
2072
2073 bool is_complete() const {
2074 return last_valid_header_found && pending_record_groups.empty();
2075 }
2076
2077 segment_id_t get_segment_id() const {
2078 return seq.offset.as_seg_paddr().get_segment_id();
2079 }
2080
2081 segment_off_t get_segment_offset() const {
2082 return seq.offset.as_seg_paddr().get_segment_off();
2083 }
2084
2085 void increment_seq(segment_off_t off) {
2086 auto& seg_addr = seq.offset.as_seg_paddr();
2087 seg_addr.set_segment_off(
2088 seg_addr.get_segment_off() + off);
2089 }
2090
2091 void emplace_record_group(const record_group_header_t&, ceph::bufferlist&&);
2092
2093 void pop_record_group() {
2094 assert(!pending_record_groups.empty());
2095 ++num_consumed_records;
2096 pending_record_groups.pop_front();
2097 }
2098
2099 scan_valid_records_cursor(
2100 journal_seq_t seq)
2101 : seq(seq) {}
2102 };
2103 std::ostream& operator<<(std::ostream&, const scan_valid_records_cursor&);
2104
2105 }
2106
// Apply WRITE_CLASS_DENC_BOUNDED to each seastore type listed below.
// NOTE(review): the macro presumably comes from include/denc.h and hooks up
// the types' DENC members; the invocations use fully qualified names because
// they sit after the namespace has been closed -- confirm the macro must be
// expanded at global scope.
2107 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t)
2108 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_id_t)
2109 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t)
2110 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t)
2111 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t)
2112 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_tail_delta_t)
2113 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t)
2114 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_group_header_t)
2115 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t)
2116 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t)
2117 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_blk_t)
2118 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_delta_t)
2119 WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_tail_t)
2120
// With fmt 9.0 and newer, a type that is only streamable via operator<< must
// opt in to formatting explicitly. Each specialization below does so by
// deriving the type's fmt::formatter from fmt::ostream_formatter. Older fmt
// versions skip this block.
// NOTE(review): FMT_VERSION and fmt::ostream_formatter are provided by fmt's
// headers, which are not among the includes visible at the top of this file
// -- presumably they arrive transitively; confirm.
2121 #if FMT_VERSION >= 90000
2122 template <> struct fmt::formatter<crimson::os::seastore::data_category_t> : fmt::ostream_formatter {};
2123 template <> struct fmt::formatter<crimson::os::seastore::delta_info_t> : fmt::ostream_formatter {};
2124 template <> struct fmt::formatter<crimson::os::seastore::device_id_printer_t> : fmt::ostream_formatter {};
2125 template <> struct fmt::formatter<crimson::os::seastore::extent_types_t> : fmt::ostream_formatter {};
2126 template <> struct fmt::formatter<crimson::os::seastore::journal_seq_t> : fmt::ostream_formatter {};
2127 template <> struct fmt::formatter<crimson::os::seastore::journal_tail_delta_t> : fmt::ostream_formatter {};
2128 template <> struct fmt::formatter<crimson::os::seastore::laddr_list_t> : fmt::ostream_formatter {};
2129 template <> struct fmt::formatter<crimson::os::seastore::omap_root_t> : fmt::ostream_formatter {};
2130 template <> struct fmt::formatter<crimson::os::seastore::paddr_list_t> : fmt::ostream_formatter {};
2131 template <> struct fmt::formatter<crimson::os::seastore::paddr_t> : fmt::ostream_formatter {};
2132 template <> struct fmt::formatter<crimson::os::seastore::placement_hint_t> : fmt::ostream_formatter {};
2133 template <> struct fmt::formatter<crimson::os::seastore::device_type_t> : fmt::ostream_formatter {};
2134 template <> struct fmt::formatter<crimson::os::seastore::record_group_header_t> : fmt::ostream_formatter {};
2135 template <> struct fmt::formatter<crimson::os::seastore::record_group_size_t> : fmt::ostream_formatter {};
2136 template <> struct fmt::formatter<crimson::os::seastore::record_header_t> : fmt::ostream_formatter {};
2137 template <> struct fmt::formatter<crimson::os::seastore::record_locator_t> : fmt::ostream_formatter {};
2138 template <> struct fmt::formatter<crimson::os::seastore::record_t> : fmt::ostream_formatter {};
2139 template <> struct fmt::formatter<crimson::os::seastore::rewrite_gen_printer_t> : fmt::ostream_formatter {};
2140 template <> struct fmt::formatter<crimson::os::seastore::scan_valid_records_cursor> : fmt::ostream_formatter {};
2141 template <> struct fmt::formatter<crimson::os::seastore::sea_time_point_printer_t> : fmt::ostream_formatter {};
2142 template <> struct fmt::formatter<crimson::os::seastore::segment_header_t> : fmt::ostream_formatter {};
2143 template <> struct fmt::formatter<crimson::os::seastore::segment_id_t> : fmt::ostream_formatter {};
2144 template <> struct fmt::formatter<crimson::os::seastore::segment_seq_printer_t> : fmt::ostream_formatter {};
2145 template <> struct fmt::formatter<crimson::os::seastore::segment_tail_t> : fmt::ostream_formatter {};
2146 template <> struct fmt::formatter<crimson::os::seastore::segment_type_t> : fmt::ostream_formatter {};
2147 template <> struct fmt::formatter<crimson::os::seastore::transaction_type_t> : fmt::ostream_formatter {};
2148 template <> struct fmt::formatter<crimson::os::seastore::write_result_t> : fmt::ostream_formatter {};
2149 template <> struct fmt::formatter<ceph::buffer::list> : fmt::ostream_formatter {};
2150 #endif