// ceph/src/crimson/os/seastore/seastore_types.h (ceph reef 18.2.1)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #pragma once
5
6 #include <limits>
7 #include <numeric>
8 #include <optional>
9 #include <iostream>
10 #include <vector>
11 #include <boost/core/ignore_unused.hpp>
12
13 #include <seastar/core/lowres_clock.hh>
14
15 #include "include/byteorder.h"
16 #include "include/denc.h"
17 #include "include/buffer.h"
18 #include "include/intarith.h"
19 #include "include/interval_set.h"
20 #include "include/uuid.h"
21
22 namespace crimson::os::seastore {
23
24 /* using a special xattr key "omap_header" to store omap header */
25 const std::string OMAP_HEADER_XATTR_KEY = "omap_header";
26
27 using transaction_id_t = uint64_t;
28 constexpr transaction_id_t TRANS_ID_NULL = 0;
29
30 /*
31 * Note: NULL value is usually the default and max value.
32 */
33
34 using depth_t = uint32_t;
35 using depth_le_t = ceph_le32;
36
37 inline depth_le_t init_depth_le(uint32_t i) {
38 return ceph_le32(i);
39 }
40
41 using checksum_t = uint32_t;
42
43 // Immutable metadata for seastore to set at mkfs time
struct seastore_meta_t {
  // unique identity of this seastore instance, assigned at mkfs
  uuid_d seastore_id;

  // denc-based serialization, versioned via DENC_START(1, 1, ...)
  DENC(seastore_meta_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.seastore_id, p);
    DENC_FINISH(p);
  }
};
53
54 std::ostream& operator<<(std::ostream& out, const seastore_meta_t& meta);
55
56 bool is_aligned(uint64_t offset, uint64_t alignment);
57
58 // identifies a specific physical device within seastore
using device_id_t = uint8_t;

// number of bits in a device_id_t (8)
constexpr auto DEVICE_ID_BITS = std::numeric_limits<device_id_t>::digits;

// The topmost ids are reserved for special (non-physical) addresses;
// see device_id_to_paddr_type() below.
constexpr device_id_t DEVICE_ID_MAX = std::numeric_limits<device_id_t>::max();
constexpr device_id_t DEVICE_ID_NULL = DEVICE_ID_MAX;                // null addr
constexpr device_id_t DEVICE_ID_RECORD_RELATIVE = DEVICE_ID_MAX - 1; // offset relative to a record
constexpr device_id_t DEVICE_ID_BLOCK_RELATIVE = DEVICE_ID_MAX - 2;  // offset relative to a block
constexpr device_id_t DEVICE_ID_DELAYED = DEVICE_ID_MAX - 3;         // allocation not yet decided
// for tests which generate fake paddrs
constexpr device_id_t DEVICE_ID_FAKE = DEVICE_ID_MAX - 4;
constexpr device_id_t DEVICE_ID_ZERO = DEVICE_ID_MAX - 5;            // zero addr
constexpr device_id_t DEVICE_ID_ROOT = DEVICE_ID_MAX - 6;            // root addr
constexpr device_id_t DEVICE_ID_MAX_VALID = DEVICE_ID_MAX - 7;       // largest real device id
// segment-backed device ids have the top bit clear ...
constexpr device_id_t DEVICE_ID_MAX_VALID_SEGMENT = DEVICE_ID_MAX >> 1;
constexpr device_id_t DEVICE_ID_SEGMENTED_MIN = 0;
// ... random-block device ids have the top bit set (== 128)
constexpr device_id_t DEVICE_ID_RANDOM_BLOCK_MIN =
  1 << (std::numeric_limits<device_id_t>::digits - 1);
77
// Wrapper so a device_id_t can be streamed through its dedicated
// operator<< (device_id_t is uint8_t, which would otherwise print as a char).
struct device_id_printer_t {
  device_id_t id;
};
81
82 std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id);
83
84 // 1 bit in paddr_t to identify the absolute physical address type
enum class paddr_types_t {
  SEGMENT = 0,      // address on a segment-backed device (top device-id bit clear)
  RANDOM_BLOCK = 1, // address on a random-block device (top device-id bit set)
  RESERVED = 2      // special ids: null/relative/delayed/fake/zero/root
};
90
91 constexpr paddr_types_t device_id_to_paddr_type(device_id_t id) {
92 if (id > DEVICE_ID_MAX_VALID) {
93 return paddr_types_t::RESERVED;
94 } else if ((id & 0x80) == 0) {
95 return paddr_types_t::SEGMENT;
96 } else {
97 return paddr_types_t::RANDOM_BLOCK;
98 }
99 }
100
101 constexpr bool has_device_off(device_id_t id) {
102 return id == DEVICE_ID_RECORD_RELATIVE ||
103 id == DEVICE_ID_BLOCK_RELATIVE ||
104 id == DEVICE_ID_DELAYED ||
105 id == DEVICE_ID_FAKE ||
106 id == DEVICE_ID_ROOT;
107 }
108
109 // internal segment id type of segment_id_t below, with the top
110 // "DEVICE_ID_BITS" bits representing the device id of the segment.
using internal_segment_id_t = uint32_t;
constexpr auto SEGMENT_ID_BITS = std::numeric_limits<internal_segment_id_t>::digits;

// segment ids without a device id encapsulated
using device_segment_id_t = uint32_t;
// 32 - 8 = 24 bits available for the per-device segment number
constexpr auto DEVICE_SEGMENT_ID_BITS = SEGMENT_ID_BITS - DEVICE_ID_BITS;
constexpr device_segment_id_t DEVICE_SEGMENT_ID_MAX = (1 << DEVICE_SEGMENT_ID_BITS) - 1;
118
119 // Identifies segment location on disk, see SegmentManager,
struct segment_id_t {
public:
  // segment_id_t() == MAX_SEG_ID == NULL_SEG_ID
  segment_id_t()
    : segment_id_t(DEVICE_ID_MAX_VALID_SEGMENT, DEVICE_SEGMENT_ID_MAX) {}

  // compose from device id (top 8 bits) and per-device segment number
  segment_id_t(device_id_t id, device_segment_id_t _segment)
    : segment_id_t(make_internal(id, _segment)) {}

  // from the packed internal representation; must name a segment-backed device
  segment_id_t(internal_segment_id_t _segment)
    : segment(_segment) {
    assert(device_id_to_paddr_type(device_id()) == paddr_types_t::SEGMENT);
  }

  /// device id encoded in the top DEVICE_ID_BITS bits
  [[gnu::always_inline]]
  constexpr device_id_t device_id() const {
    return static_cast<device_id_t>(segment >> DEVICE_SEGMENT_ID_BITS);
  }

  /// per-device segment number in the low DEVICE_SEGMENT_ID_BITS bits
  [[gnu::always_inline]]
  constexpr device_segment_id_t device_segment_id() const {
    constexpr internal_segment_id_t _SEGMENT_ID_MASK = (1u << DEVICE_SEGMENT_ID_BITS) - 1;
    return segment & _SEGMENT_ID_MASK;
  }

  // ordering/equality compare the packed value, i.e. by device id first,
  // then by segment number
  bool operator==(const segment_id_t& other) const {
    return segment == other.segment;
  }
  bool operator!=(const segment_id_t& other) const {
    return segment != other.segment;
  }
  bool operator<(const segment_id_t& other) const {
    return segment < other.segment;
  }
  bool operator<=(const segment_id_t& other) const {
    return segment <= other.segment;
  }
  bool operator>(const segment_id_t& other) const {
    return segment > other.segment;
  }
  bool operator>=(const segment_id_t& other) const {
    return segment >= other.segment;
  }

  DENC(segment_id_t, v, p) {
    denc(v.segment, p);
  }

  // constexpr factory; bypasses the runtime assert in the public ctor so
  // it can build compile-time constants (MIN/MAX/NULL_SEG_ID)
  static constexpr segment_id_t create_const(
    device_id_t id, device_segment_id_t segment) {
    return segment_id_t(id, segment, const_t{});
  }

private:
  struct const_t {};
  constexpr segment_id_t(device_id_t id, device_segment_id_t _segment, const_t)
    : segment(make_internal(id, _segment)) {}

  // pack device id and segment number into a single 32-bit value
  constexpr static inline internal_segment_id_t make_internal(
    device_id_t d_id,
    device_segment_id_t s_id) {
    return static_cast<internal_segment_id_t>(s_id) |
      (static_cast<internal_segment_id_t>(d_id) << DEVICE_SEGMENT_ID_BITS);
  }

  internal_segment_id_t segment;

  friend struct segment_id_le_t;
  friend struct paddr_t;
};
190
191 std::ostream &operator<<(std::ostream &out, const segment_id_t&);
192
193 // ondisk type of segment_id_t
struct __attribute((packed)) segment_id_le_t {
  // little-endian packed value; defaults to NULL_SEG_ID (segment_id_t{})
  ceph_le32 segment = ceph_le32(segment_id_t().segment);

  segment_id_le_t(const segment_id_t id) :
    segment(ceph_le32(id.segment)) {}

  // converts back through the asserting segment_id_t ctor
  operator segment_id_t() const {
    return segment_id_t(segment);
  }
};
204
205 constexpr segment_id_t MIN_SEG_ID = segment_id_t::create_const(0, 0);
206 // segment_id_t() == MAX_SEG_ID == NULL_SEG_ID
207 constexpr segment_id_t MAX_SEG_ID =
208 segment_id_t::create_const(DEVICE_ID_MAX_VALID_SEGMENT, DEVICE_SEGMENT_ID_MAX);
209 constexpr segment_id_t NULL_SEG_ID = MAX_SEG_ID;
210
211 /* Monotonically increasing segment seq, uniquely identifies
212 * the incarnation of a segment */
213 using segment_seq_t = uint64_t;
214 static constexpr segment_seq_t MAX_SEG_SEQ =
215 std::numeric_limits<segment_seq_t>::max();
216 static constexpr segment_seq_t NULL_SEG_SEQ = MAX_SEG_SEQ;
217
// Role of a segment; NULL_SEG serves as the null/unknown value.
enum class segment_type_t : uint8_t {
  JOURNAL = 0, // holds journal records
  OOL,         // holds out-of-line data
  NULL_SEG,
};

std::ostream& operator<<(std::ostream& out, segment_type_t t);

// Wrapper so a segment_seq_t prints through its dedicated operator<<.
struct segment_seq_printer_t {
  segment_seq_t seq;
};
229
230 std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq);
231
232 /**
233 * segment_map_t
234 *
235 * Compact templated mapping from a segment_id_t to a value type.
236 */
237 template <typename T>
238 class segment_map_t {
239 public:
240 segment_map_t() {
241 // initializes top vector with 0 length vectors to indicate that they
242 // are not yet present
243 device_to_segments.resize(DEVICE_ID_MAX_VALID);
244 }
245 void add_device(device_id_t device, std::size_t segments, const T& init) {
246 ceph_assert(device <= DEVICE_ID_MAX_VALID);
247 ceph_assert(device_to_segments[device].size() == 0);
248 ceph_assert(segments > 0);
249 device_to_segments[device].resize(segments, init);
250 total_segments += segments;
251 }
252 void clear() {
253 device_to_segments.clear();
254 device_to_segments.resize(DEVICE_ID_MAX_VALID);
255 total_segments = 0;
256 }
257
258 T& operator[](segment_id_t id) {
259 assert(id.device_segment_id() < device_to_segments[id.device_id()].size());
260 return device_to_segments[id.device_id()][id.device_segment_id()];
261 }
262 const T& operator[](segment_id_t id) const {
263 assert(id.device_segment_id() < device_to_segments[id.device_id()].size());
264 return device_to_segments[id.device_id()][id.device_segment_id()];
265 }
266
267 bool contains(segment_id_t id) {
268 bool b = id.device_id() < device_to_segments.size();
269 if (!b) {
270 return b;
271 }
272 b = id.device_segment_id() < device_to_segments[id.device_id()].size();
273 return b;
274 }
275
276 auto begin() {
277 return iterator<false>::lower_bound(*this, 0, 0);
278 }
279 auto begin() const {
280 return iterator<true>::lower_bound(*this, 0, 0);
281 }
282
283 auto end() {
284 return iterator<false>::end_iterator(*this);
285 }
286 auto end() const {
287 return iterator<true>::end_iterator(*this);
288 }
289
290 auto device_begin(device_id_t id) {
291 auto ret = iterator<false>::lower_bound(*this, id, 0);
292 assert(ret->first.device_id() == id);
293 return ret;
294 }
295 auto device_end(device_id_t id) {
296 return iterator<false>::lower_bound(*this, id + 1, 0);
297 }
298
299 size_t size() const {
300 return total_segments;
301 }
302
303 private:
304 template <bool is_const = false>
305 class iterator {
306 /// points at set being iterated over
307 std::conditional_t<
308 is_const,
309 const segment_map_t &,
310 segment_map_t &> parent;
311
312 /// points at current device, or DEVICE_ID_MAX_VALID if is_end()
313 device_id_t device_id;
314
315 /// segment at which we are pointing, 0 if is_end()
316 device_segment_id_t device_segment_id;
317
318 /// holds referent for operator* and operator-> when !is_end()
319 std::optional<
320 std::pair<
321 const segment_id_t,
322 std::conditional_t<is_const, const T&, T&>
323 >> current;
324
325 bool is_end() const {
326 return device_id == DEVICE_ID_MAX_VALID;
327 }
328
329 void find_valid() {
330 assert(!is_end());
331 auto &device_vec = parent.device_to_segments[device_id];
332 if (device_vec.size() == 0 ||
333 device_segment_id == device_vec.size()) {
334 while (++device_id < DEVICE_ID_MAX_VALID &&
335 parent.device_to_segments[device_id].size() == 0);
336 device_segment_id = 0;
337 }
338 if (is_end()) {
339 current = std::nullopt;
340 } else {
341 current.emplace(
342 segment_id_t{device_id, device_segment_id},
343 parent.device_to_segments[device_id][device_segment_id]
344 );
345 }
346 }
347
348 iterator(
349 decltype(parent) &parent,
350 device_id_t device_id,
351 device_segment_id_t device_segment_id)
352 : parent(parent), device_id(device_id),
353 device_segment_id(device_segment_id) {}
354
355 public:
356 static iterator lower_bound(
357 decltype(parent) &parent,
358 device_id_t device_id,
359 device_segment_id_t device_segment_id) {
360 if (device_id == DEVICE_ID_MAX_VALID) {
361 return end_iterator(parent);
362 } else {
363 auto ret = iterator{parent, device_id, device_segment_id};
364 ret.find_valid();
365 return ret;
366 }
367 }
368
369 static iterator end_iterator(
370 decltype(parent) &parent) {
371 return iterator{parent, DEVICE_ID_MAX_VALID, 0};
372 }
373
374 iterator<is_const>& operator++() {
375 assert(!is_end());
376 ++device_segment_id;
377 find_valid();
378 return *this;
379 }
380
381 bool operator==(iterator<is_const> rit) {
382 return (device_id == rit.device_id &&
383 device_segment_id == rit.device_segment_id);
384 }
385
386 bool operator!=(iterator<is_const> rit) {
387 return !(*this == rit);
388 }
389
390 template <bool c = is_const, std::enable_if_t<c, int> = 0>
391 const std::pair<const segment_id_t, const T&> *operator->() {
392 assert(!is_end());
393 return &*current;
394 }
395 template <bool c = is_const, std::enable_if_t<!c, int> = 0>
396 std::pair<const segment_id_t, T&> *operator->() {
397 assert(!is_end());
398 return &*current;
399 }
400
401 using reference = std::conditional_t<
402 is_const, const std::pair<const segment_id_t, const T&>&,
403 std::pair<const segment_id_t, T&>&>;
404 reference operator*() {
405 assert(!is_end());
406 return *current;
407 }
408 };
409
410 /**
411 * device_to_segments
412 *
413 * device -> segment -> T mapping. device_to_segments[d].size() > 0 iff
414 * device <d> has been added.
415 */
416 std::vector<std::vector<T>> device_to_segments;
417
418 /// total number of added segments
419 size_t total_segments = 0;
420 };
421
422 /**
423 * paddr_t
424 *
425 * <segment, offset> offset on disk, see SegmentManager
426 *
427 * May be absolute, record_relative, or block_relative.
428 *
429 * Blocks get read independently of the surrounding record,
430 * so paddrs embedded directly within a block need to refer
431 * to other blocks within the same record by a block_relative
432 * addr relative to the block's own offset. By contrast,
433 * deltas to existing blocks need to use record_relative
434 * addrs relative to the first block of the record.
435 *
436 * Fresh extents during a transaction are refered to by
437 * record_relative paddrs.
438 */
439
440 using internal_paddr_t = uint64_t;
441 constexpr auto PADDR_BITS = std::numeric_limits<internal_paddr_t>::digits;
442
443 /**
444 * device_off_t
445 *
446 * Offset within a device, may be negative for relative offsets.
447 */
448 using device_off_t = int64_t;
449 using u_device_off_t = uint64_t;
450 constexpr auto DEVICE_OFF_BITS = PADDR_BITS - DEVICE_ID_BITS;
451 constexpr auto DEVICE_OFF_MAX =
452 std::numeric_limits<device_off_t>::max() >> DEVICE_ID_BITS;
453 constexpr auto DEVICE_OFF_MIN = -(DEVICE_OFF_MAX + 1);
454
455 /**
456 * segment_off_t
457 *
458 * Offset within a segment on disk, may be negative for relative offsets.
459 */
460 using segment_off_t = int32_t;
461 using u_segment_off_t = uint32_t;
462 constexpr auto SEGMENT_OFF_MAX = std::numeric_limits<segment_off_t>::max();
463 constexpr auto SEGMENT_OFF_MIN = std::numeric_limits<segment_off_t>::min();
464 constexpr auto SEGMENT_OFF_BITS = std::numeric_limits<u_segment_off_t>::digits;
465 static_assert(PADDR_BITS == SEGMENT_ID_BITS + SEGMENT_OFF_BITS);
466
467 constexpr auto DEVICE_ID_MASK =
468 ((internal_paddr_t(1) << DEVICE_ID_BITS) - 1) << DEVICE_OFF_BITS;
469 constexpr auto DEVICE_OFF_MASK =
470 std::numeric_limits<u_device_off_t>::max() >> DEVICE_ID_BITS;
471 constexpr auto SEGMENT_ID_MASK =
472 ((internal_paddr_t(1) << SEGMENT_ID_BITS) - 1) << SEGMENT_OFF_BITS;
473 constexpr auto SEGMENT_OFF_MASK =
474 (internal_paddr_t(1) << SEGMENT_OFF_BITS) - 1;
475
// Pack a (possibly negative) device offset into the low DEVICE_OFF_BITS
// bits of an internal_paddr_t (two's-complement truncation).
constexpr internal_paddr_t encode_device_off(device_off_t off) {
  return static_cast<internal_paddr_t>(off) & DEVICE_OFF_MASK;
}
479
480 constexpr device_off_t decode_device_off(internal_paddr_t addr) {
481 if (addr & (1ull << (DEVICE_OFF_BITS - 1))) {
482 return static_cast<device_off_t>(addr | DEVICE_ID_MASK);
483 } else {
484 return static_cast<device_off_t>(addr & DEVICE_OFF_MASK);
485 }
486 }
487
488 struct seg_paddr_t;
489 struct blk_paddr_t;
490 struct res_paddr_t;
491 struct pladdr_t;
struct paddr_t {
public:
  // P_ADDR_MAX == P_ADDR_NULL == paddr_t{}
  paddr_t() : paddr_t(DEVICE_ID_MAX, device_off_t(0)) {}

  /// Build an absolute address on a segment-backed device.
  static paddr_t make_seg_paddr(
    segment_id_t seg,
    segment_off_t offset) {
    return paddr_t(seg, offset);
  }

  static paddr_t make_seg_paddr(
    device_id_t device,
    device_segment_id_t seg,
    segment_off_t offset) {
    return paddr_t(segment_id_t(device, seg), offset);
  }

  /// Build an absolute address on a random-block device.
  static paddr_t make_blk_paddr(
    device_id_t device,
    device_off_t offset) {
    assert(device_id_to_paddr_type(device) == paddr_types_t::RANDOM_BLOCK);
    return paddr_t(device, offset);
  }

  /// Build a reserved (relative/delayed/fake/...) address.
  static paddr_t make_res_paddr(
    device_id_t device,
    device_off_t offset) {
    assert(device_id_to_paddr_type(device) == paddr_types_t::RESERVED);
    return paddr_t(device, offset);
  }

  void swap(paddr_t &other) {
    std::swap(internal_paddr, other.internal_paddr);
  }

  /// device id stored in the top DEVICE_ID_BITS bits
  device_id_t get_device_id() const {
    return static_cast<device_id_t>(internal_paddr >> DEVICE_OFF_BITS);
  }

  paddr_types_t get_addr_type() const {
    return device_id_to_paddr_type(get_device_id());
  }

  // type-dispatched; defined below via PADDR_OPERATION
  paddr_t add_offset(device_off_t o) const;

  paddr_t add_relative(paddr_t o) const;

  paddr_t add_block_relative(paddr_t o) const {
    // special version mainly for documentation purposes
    assert(o.is_block_relative());
    return add_relative(o);
  }

  paddr_t add_record_relative(paddr_t o) const {
    // special version mainly for documentation purposes
    assert(o.is_record_relative());
    return add_relative(o);
  }

  /**
   * maybe_relative_to
   *
   * Helper for the case where an in-memory paddr_t may be
   * either block_relative or absolute (not record_relative).
   *
   * base must be either absolute or record_relative.
   */
  paddr_t maybe_relative_to(paddr_t base) const {
    assert(!base.is_block_relative());
    if (is_block_relative()) {
      return base.add_block_relative(*this);
    } else {
      return *this;
    }
  }

  /**
   * block_relative_to
   *
   * Only defined for record_relative paddr_ts.  Yields a
   * block_relative address.
   */
  paddr_t block_relative_to(paddr_t rhs) const;

  // To be compatible with laddr_t operator+
  paddr_t operator+(device_off_t o) const {
    return add_offset(o);
  }

  // checked downcasts to the type-specific views (defined below)
  seg_paddr_t& as_seg_paddr();
  const seg_paddr_t& as_seg_paddr() const;
  blk_paddr_t& as_blk_paddr();
  const blk_paddr_t& as_blk_paddr() const;
  res_paddr_t& as_res_paddr();
  const res_paddr_t& as_res_paddr() const;

  bool is_delayed() const {
    return get_device_id() == DEVICE_ID_DELAYED;
  }
  bool is_block_relative() const {
    return get_device_id() == DEVICE_ID_BLOCK_RELATIVE;
  }
  bool is_record_relative() const {
    return get_device_id() == DEVICE_ID_RECORD_RELATIVE;
  }
  bool is_relative() const {
    return is_block_relative() || is_record_relative();
  }
  /// Denotes special null addr
  bool is_null() const {
    return get_device_id() == DEVICE_ID_NULL;
  }
  /// Denotes special zero addr
  bool is_zero() const {
    return get_device_id() == DEVICE_ID_ZERO;
  }
  /// Denotes the root addr
  bool is_root() const {
    return get_device_id() == DEVICE_ID_ROOT;
  }

  /**
   * is_real
   *
   * indicates whether addr reflects a physical location, absolute, relative,
   * or delayed.  FAKE segments also count as real so as to reflect the way in
   * which unit tests use them.
   */
  bool is_real() const {
    return !is_zero() && !is_null() && !is_root();
  }

  bool is_absolute() const {
    return get_addr_type() != paddr_types_t::RESERVED;
  }

  bool is_fake() const {
    return get_device_id() == DEVICE_ID_FAKE;
  }

  // ordering follows the packed encoding: device id first, then offset
  auto operator<=>(const paddr_t &) const = default;

  DENC(paddr_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.internal_paddr, p);
    DENC_FINISH(p);
  }

  // constexpr factory bypassing runtime asserts, used for the P_ADDR_*
  // compile-time constants below
  constexpr static paddr_t create_const(
    device_id_t d_id, device_off_t offset) {
    return paddr_t(d_id, offset, const_construct_t());
  }

protected:
  internal_paddr_t internal_paddr;

private:
  // as seg
  paddr_t(segment_id_t seg, segment_off_t offset)
    : paddr_t((static_cast<internal_paddr_t>(seg.segment) << SEGMENT_OFF_BITS) |
              static_cast<u_segment_off_t>(offset)) {}

  // as blk or res
  paddr_t(device_id_t d_id, device_off_t offset)
    : paddr_t((static_cast<internal_paddr_t>(d_id) << DEVICE_OFF_BITS) |
              encode_device_off(offset)) {
    assert(offset >= DEVICE_OFF_MIN);
    assert(offset <= DEVICE_OFF_MAX);
    assert(get_addr_type() != paddr_types_t::SEGMENT);
  }

  // raw-value ctor; validates invariants in debug builds (defined below)
  paddr_t(internal_paddr_t val);

  struct const_construct_t {};
  constexpr paddr_t(device_id_t d_id, device_off_t offset, const_construct_t)
    : internal_paddr((static_cast<internal_paddr_t>(d_id) << DEVICE_OFF_BITS) |
                     static_cast<u_device_off_t>(offset)) {}

  friend struct paddr_le_t;
  friend struct pladdr_le_t;

};
675
676 std::ostream &operator<<(std::ostream &out, const paddr_t &rhs);
677
678 struct seg_paddr_t : public paddr_t {
679 seg_paddr_t(const seg_paddr_t&) = delete;
680 seg_paddr_t(seg_paddr_t&) = delete;
681 seg_paddr_t& operator=(const seg_paddr_t&) = delete;
682 seg_paddr_t& operator=(seg_paddr_t&) = delete;
683
684 segment_id_t get_segment_id() const {
685 return segment_id_t(static_cast<internal_segment_id_t>(
686 internal_paddr >> SEGMENT_OFF_BITS));
687 }
688
689 segment_off_t get_segment_off() const {
690 return segment_off_t(internal_paddr & SEGMENT_OFF_MASK);
691 }
692
693 void set_segment_off(segment_off_t off) {
694 assert(off >= 0);
695 internal_paddr = (internal_paddr & SEGMENT_ID_MASK);
696 internal_paddr |= static_cast<u_segment_off_t>(off);
697 }
698
699 paddr_t add_offset(device_off_t o) const {
700 device_off_t off = get_segment_off() + o;
701 assert(off >= 0);
702 assert(off <= SEGMENT_OFF_MAX);
703 return paddr_t::make_seg_paddr(
704 get_segment_id(), static_cast<segment_off_t>(off));
705 }
706 };
707
708 struct blk_paddr_t : public paddr_t {
709 blk_paddr_t(const blk_paddr_t&) = delete;
710 blk_paddr_t(blk_paddr_t&) = delete;
711 blk_paddr_t& operator=(const blk_paddr_t&) = delete;
712 blk_paddr_t& operator=(blk_paddr_t&) = delete;
713
714 device_off_t get_device_off() const {
715 return decode_device_off(internal_paddr);
716 }
717
718 void set_device_off(device_off_t off) {
719 assert(off >= 0);
720 assert(off <= DEVICE_OFF_MAX);
721 internal_paddr = (internal_paddr & DEVICE_ID_MASK);
722 internal_paddr |= encode_device_off(off);
723 }
724
725 paddr_t add_offset(device_off_t o) const {
726 assert(o >= DEVICE_OFF_MIN);
727 assert(o <= DEVICE_OFF_MAX);
728 auto off = get_device_off() + o;
729 return paddr_t::make_blk_paddr(get_device_id(), off);
730 }
731 };
732
733 struct res_paddr_t : public paddr_t {
734 res_paddr_t(const res_paddr_t&) = delete;
735 res_paddr_t(res_paddr_t&) = delete;
736 res_paddr_t& operator=(const res_paddr_t&) = delete;
737 res_paddr_t& operator=(res_paddr_t&) = delete;
738
739 device_off_t get_device_off() const {
740 return decode_device_off(internal_paddr);
741 }
742
743 void set_device_off(device_off_t off) {
744 assert(has_device_off(get_device_id()));
745 assert(off >= DEVICE_OFF_MIN);
746 assert(off <= DEVICE_OFF_MAX);
747 internal_paddr = (internal_paddr & DEVICE_ID_MASK);
748 internal_paddr |= encode_device_off(off);
749 }
750
751 paddr_t add_offset(device_off_t o) const {
752 assert(has_device_off(get_device_id()));
753 assert(o >= DEVICE_OFF_MIN);
754 assert(o <= DEVICE_OFF_MAX);
755 auto off = get_device_off() + o;
756 return paddr_t::make_res_paddr(get_device_id(), off);
757 }
758
759 paddr_t block_relative_to(const res_paddr_t &rhs) const {
760 assert(rhs.is_record_relative() && is_record_relative());
761 auto off = get_device_off() - rhs.get_device_off();
762 return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE, off);
763 }
764 };
765
766 constexpr paddr_t P_ADDR_MIN = paddr_t::create_const(0, 0);
767 // P_ADDR_MAX == P_ADDR_NULL == paddr_t{}
768 constexpr paddr_t P_ADDR_MAX = paddr_t::create_const(DEVICE_ID_MAX, 0);
769 constexpr paddr_t P_ADDR_NULL = P_ADDR_MAX;
770 constexpr paddr_t P_ADDR_ZERO = paddr_t::create_const(DEVICE_ID_ZERO, 0);
771 constexpr paddr_t P_ADDR_ROOT = paddr_t::create_const(DEVICE_ID_ROOT, 0);
772
// Convenience constructors for the reserved (non-physical) address kinds.
inline paddr_t make_record_relative_paddr(device_off_t off) {
  return paddr_t::make_res_paddr(DEVICE_ID_RECORD_RELATIVE, off);
}
inline paddr_t make_block_relative_paddr(device_off_t off) {
  return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE, off);
}
inline paddr_t make_fake_paddr(device_off_t off) {
  return paddr_t::make_res_paddr(DEVICE_ID_FAKE, off);
}
inline paddr_t make_delayed_temp_paddr(device_off_t off) {
  return paddr_t::make_res_paddr(DEVICE_ID_DELAYED, off);
}
785
// Checked downcasts to the type-specific views.  The static_casts are safe
// because the view types add no data members over paddr_t; the asserts
// verify that the stored address is of the requested type.
inline const seg_paddr_t& paddr_t::as_seg_paddr() const {
  assert(get_addr_type() == paddr_types_t::SEGMENT);
  return *static_cast<const seg_paddr_t*>(this);
}

inline seg_paddr_t& paddr_t::as_seg_paddr() {
  assert(get_addr_type() == paddr_types_t::SEGMENT);
  return *static_cast<seg_paddr_t*>(this);
}

inline const blk_paddr_t& paddr_t::as_blk_paddr() const {
  assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK);
  return *static_cast<const blk_paddr_t*>(this);
}

inline blk_paddr_t& paddr_t::as_blk_paddr() {
  assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK);
  return *static_cast<blk_paddr_t*>(this);
}

inline const res_paddr_t& paddr_t::as_res_paddr() const {
  assert(get_addr_type() == paddr_types_t::RESERVED);
  return *static_cast<const res_paddr_t*>(this);
}

inline res_paddr_t& paddr_t::as_res_paddr() {
  assert(get_addr_type() == paddr_types_t::RESERVED);
  return *static_cast<res_paddr_t*>(this);
}
815
// Raw-value constructor; debug builds verify the encoding invariants of
// the address for its type (non-negative offsets for physical addresses,
// zero offset for reserved ids that carry no offset).
inline paddr_t::paddr_t(internal_paddr_t val) : internal_paddr(val) {
#ifndef NDEBUG
  auto type = get_addr_type();
  if (type == paddr_types_t::SEGMENT) {
    assert(as_seg_paddr().get_segment_off() >= 0);
  } else if (type == paddr_types_t::RANDOM_BLOCK) {
    assert(as_blk_paddr().get_device_off() >= 0);
  } else {
    assert(type == paddr_types_t::RESERVED);
    if (!has_device_off(get_device_id())) {
      assert(as_res_paddr().get_device_off() == 0);
    }
  }
#endif
}
831
// Forward <func> to the view type <base> when the address is of <a_type>;
// otherwise falls through to the next statement.
#define PADDR_OPERATION(a_type, base, func)        \
  if (get_addr_type() == a_type) {                \
    return static_cast<const base*>(this)->func;  \
  }

// Type-dispatched offset addition; aborts on an unrecognized type.
inline paddr_t paddr_t::add_offset(device_off_t o) const {
  PADDR_OPERATION(paddr_types_t::SEGMENT, seg_paddr_t, add_offset(o))
  PADDR_OPERATION(paddr_types_t::RANDOM_BLOCK, blk_paddr_t, add_offset(o))
  PADDR_OPERATION(paddr_types_t::RESERVED, res_paddr_t, add_offset(o))
  ceph_assert(0 == "not supported type");
  return P_ADDR_NULL;
}
844
// Add a relative (block_relative or record_relative) address to this one
// by applying its signed offset.
inline paddr_t paddr_t::add_relative(paddr_t o) const {
  assert(o.is_relative());
  auto &res_o = o.as_res_paddr();
  return add_offset(res_o.get_device_off());
}

// Delegates to res_paddr_t::block_relative_to; both addresses must be
// record_relative (asserted there).
inline paddr_t paddr_t::block_relative_to(paddr_t rhs) const {
  return as_res_paddr().block_relative_to(rhs.as_res_paddr());
}
854
// ondisk (packed little-endian) representation of paddr_t
struct __attribute((packed)) paddr_le_t {
  // defaults to the null address
  ceph_le64 internal_paddr =
    ceph_le64(P_ADDR_NULL.internal_paddr);

  using orig_type = paddr_t;

  paddr_le_t() = default;
  paddr_le_t(const paddr_t &addr) : internal_paddr(ceph_le64(addr.internal_paddr)) {}

  // converts back through the asserting raw-value paddr_t ctor
  operator paddr_t() const {
    return paddr_t{internal_paddr};
  }
};
868
869 using objaddr_t = uint32_t;
870 constexpr objaddr_t OBJ_ADDR_MAX = std::numeric_limits<objaddr_t>::max();
871 constexpr objaddr_t OBJ_ADDR_NULL = OBJ_ADDR_MAX;
872
// Hint describing the expected lifecycle of data being written.
enum class placement_hint_t {
  HOT = 0,   // The default user hint that expects mutations or retirement
  COLD,      // Expect no mutations and no retirement in the near future
  REWRITE,   // Hint for the internal rewrites
  NUM_HINTS  // Constant for number of hints or as NULL
};
879
880 constexpr auto PLACEMENT_HINT_NULL = placement_hint_t::NUM_HINTS;
881
882 std::ostream& operator<<(std::ostream& out, placement_hint_t h);
883
// Concrete kinds of backing devices; see get_default_backend_of_device()
// for the segmented vs random-block split.
enum class device_type_t : uint8_t {
  NONE = 0,
  HDD,
  SSD,
  ZBD, // ZNS SSD or SMR HDD
  EPHEMERAL_COLD,
  EPHEMERAL_MAIN,
  RANDOM_BLOCK_SSD,
  RANDOM_BLOCK_EPHEMERAL,
  NUM_TYPES // count / null sentinel
};
895
896 std::ostream& operator<<(std::ostream& out, device_type_t t);
897
898 bool can_delay_allocation(device_type_t type);
899 device_type_t string_to_device_type(std::string type);
900
// The two storage-backend families behind seastore devices.
enum class backend_type_t {
  SEGMENTED,    // SegmentManager: SSD, ZBD, HDD
  RANDOM_BLOCK  // RBMDevice: RANDOM_BLOCK_SSD
};
905
906 std::ostream& operator<<(std::ostream& out, backend_type_t);
907 using journal_type_t = backend_type_t;
908
909 constexpr backend_type_t get_default_backend_of_device(device_type_t dtype) {
910 assert(dtype != device_type_t::NONE &&
911 dtype != device_type_t::NUM_TYPES);
912 if (dtype >= device_type_t::HDD &&
913 dtype <= device_type_t::EPHEMERAL_MAIN) {
914 return backend_type_t::SEGMENTED;
915 } else {
916 return backend_type_t::RANDOM_BLOCK;
917 }
918 }
919
920 /**
921 * Monotonically increasing identifier for the location of a
922 * journal_record.
923 */
924 // JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{}
struct journal_seq_t {
  // major component: the journal segment's incarnation sequence
  segment_seq_t segment_seq = NULL_SEG_SEQ;
  // minor component: physical position of the record
  paddr_t offset = P_ADDR_NULL;

  void swap(journal_seq_t &other) {
    std::swap(segment_seq, other.segment_seq);
    std::swap(offset, other.offset);
  }

  // produces a pseudo journal_seq_t relative to this by offset
  journal_seq_t add_offset(
      journal_type_t type,
      device_off_t off,
      device_off_t roll_start,
      device_off_t roll_size) const;

  device_off_t relative_to(
      journal_type_t type,
      const journal_seq_t& r,
      device_off_t roll_start,
      device_off_t roll_size) const;

  DENC(journal_seq_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.segment_seq, p);
    denc(v.offset, p);
    DENC_FINISH(p);
  }

  bool operator==(const journal_seq_t &o) const { return cmp(o) == 0; }
  bool operator!=(const journal_seq_t &o) const { return cmp(o) != 0; }
  bool operator<(const journal_seq_t &o) const { return cmp(o) < 0; }
  bool operator<=(const journal_seq_t &o) const { return cmp(o) <= 0; }
  bool operator>(const journal_seq_t &o) const { return cmp(o) > 0; }
  bool operator>=(const journal_seq_t &o) const { return cmp(o) >= 0; }

private:
  // Three-way comparison: first by segment_seq, then by offset.  Offsets
  // are compared as (off, segment_id) pairs — i.e. lexicographically by
  // the intra-device offset first — so that segment and random-block
  // addresses order consistently (non-segment addrs use MAX_SEG_ID).
  int cmp(const journal_seq_t &other) const {
    if (segment_seq > other.segment_seq) {
      return 1;
    } else if (segment_seq < other.segment_seq) {
      return -1;
    }
    using ret_t = std::pair<device_off_t, segment_id_t>;
    auto to_pair = [](const paddr_t &addr) -> ret_t {
      if (addr.get_addr_type() == paddr_types_t::SEGMENT) {
        auto &seg_addr = addr.as_seg_paddr();
        return ret_t(seg_addr.get_segment_off(), seg_addr.get_segment_id());
      } else if (addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK) {
        auto &blk_addr = addr.as_blk_paddr();
        return ret_t(blk_addr.get_device_off(), MAX_SEG_ID);
      } else if (addr.get_addr_type() == paddr_types_t::RESERVED) {
        auto &res_addr = addr.as_res_paddr();
        return ret_t(res_addr.get_device_off(), MAX_SEG_ID);
      } else {
        assert(0 == "impossible");
        return ret_t(0, MAX_SEG_ID);
      }
    };
    auto left = to_pair(offset);
    auto right = to_pair(other.offset);
    if (left > right) {
      return 1;
    } else if (left < right) {
      return -1;
    } else {
      return 0;
    }
  }
};
995
996 std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq);
997
998 constexpr journal_seq_t JOURNAL_SEQ_MIN{
999 0,
1000 P_ADDR_MIN
1001 };
1002 constexpr journal_seq_t JOURNAL_SEQ_MAX{
1003 MAX_SEG_SEQ,
1004 P_ADDR_MAX
1005 };
1006 // JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{}
1007 constexpr journal_seq_t JOURNAL_SEQ_NULL = JOURNAL_SEQ_MAX;
1008
1009 // logical addr, see LBAManager, TransactionManager
1010 using laddr_t = uint64_t;
1011 constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min();
1012 constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max();
1013 constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX;
1014 constexpr laddr_t L_ADDR_ROOT = L_ADDR_MAX - 1;
1015 constexpr laddr_t L_ADDR_LBAT = L_ADDR_MAX - 2;
1016
// ondisk (packed little-endian) representation of laddr_t
struct __attribute((packed)) laddr_le_t {
  // defaults to the null logical address
  ceph_le64 laddr = ceph_le64(L_ADDR_NULL);

  using orig_type = laddr_t;

  laddr_le_t() = default;
  laddr_le_t(const laddr_le_t &) = default;
  explicit laddr_le_t(const laddr_t &addr)
    : laddr(ceph_le64(addr)) {}

  operator laddr_t() const {
    return laddr_t(laddr);
  }
  laddr_le_t& operator=(laddr_t addr) {
    ceph_le64 val;
    val = addr;
    laddr = val;
    return *this;
  }
};
1037
// raw NULL value shared by both alternatives of pladdr_t (laddr or paddr)
constexpr uint64_t PL_ADDR_NULL = std::numeric_limits<uint64_t>::max();
1039
1040 struct pladdr_t {
1041 std::variant<laddr_t, paddr_t> pladdr;
1042
1043 pladdr_t() = default;
1044 pladdr_t(const pladdr_t &) = default;
1045 pladdr_t(laddr_t laddr)
1046 : pladdr(laddr) {}
1047 pladdr_t(paddr_t paddr)
1048 : pladdr(paddr) {}
1049
1050 bool is_laddr() const {
1051 return pladdr.index() == 0;
1052 }
1053
1054 bool is_paddr() const {
1055 return pladdr.index() == 1;
1056 }
1057
1058 pladdr_t& operator=(paddr_t paddr) {
1059 pladdr = paddr;
1060 return *this;
1061 }
1062
1063 pladdr_t& operator=(laddr_t laddr) {
1064 pladdr = laddr;
1065 return *this;
1066 }
1067
1068 bool operator==(const pladdr_t &) const = default;
1069
1070 paddr_t get_paddr() const {
1071 assert(pladdr.index() == 1);
1072 return paddr_t(std::get<1>(pladdr));
1073 }
1074
1075 laddr_t get_laddr() const {
1076 assert(pladdr.index() == 0);
1077 return laddr_t(std::get<0>(pladdr));
1078 }
1079
1080 };
1081
std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr);

// discriminator tag stored alongside the raw value in pladdr_le_t
enum class addr_type_t : uint8_t {
  PADDR=0,
  LADDR=1,
  MAX=2 // or NONE
};
1089
/**
 * pladdr_le_t
 *
 * Packed little-endian on-disk form of pladdr_t: the raw 64-bit value
 * plus an addr_type_t tag recording which alternative it encodes.
 */
struct __attribute((packed)) pladdr_le_t {
  ceph_le64 pladdr = ceph_le64(PL_ADDR_NULL);
  addr_type_t addr_type = addr_type_t::MAX;

  pladdr_le_t() = default;
  pladdr_le_t(const pladdr_le_t &) = default;
  explicit pladdr_le_t(const pladdr_t &addr)
    : pladdr(
        ceph_le64(
          addr.is_laddr() ?
            // index 0 is laddr_t (raw uint64), index 1 is paddr_t whose
            // on-disk form is its internal_paddr representation
            std::get<0>(addr.pladdr) :
            std::get<1>(addr.pladdr).internal_paddr)),
      addr_type(
        addr.is_laddr() ?
          addr_type_t::LADDR :
          addr_type_t::PADDR)
  {}

  // decode back into the tagged in-memory form; the tag must be valid
  operator pladdr_t() const {
    if (addr_type == addr_type_t::LADDR) {
      return pladdr_t(laddr_t(pladdr));
    } else {
      assert(addr_type == addr_type_t::PADDR);
      return pladdr_t(paddr_t(pladdr));
    }
  }
};
1117
// traits mapping an address type to its MIN/MAX/NULL sentinel values
template <typename T>
struct min_max_t {};

template <>
struct min_max_t<laddr_t> {
  static constexpr laddr_t max = L_ADDR_MAX;
  static constexpr laddr_t min = L_ADDR_MIN;
  static constexpr laddr_t null = L_ADDR_NULL;
};

template <>
struct min_max_t<paddr_t> {
  static constexpr paddr_t max = P_ADDR_MAX;
  static constexpr paddr_t min = P_ADDR_MIN;
  static constexpr paddr_t null = P_ADDR_NULL;
};

// logical offset, see LBAManager, TransactionManager
using extent_len_t = uint32_t;
constexpr extent_len_t EXTENT_LEN_MAX =
  std::numeric_limits<extent_len_t>::max();

// little-endian on-disk form of extent_len_t
using extent_len_le_t = ceph_le32;
inline extent_len_le_t init_extent_len_le(extent_len_t len) {
  return ceph_le32(len);
}

// list of (laddr, length) spans; inherits all std::list constructors
struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> {
  template <typename... T>
  laddr_list_t(T&&... args)
    : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {}
};
// list of (paddr, length) spans; inherits all std::list constructors
struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> {
  template <typename... T>
  paddr_list_t(T&&... args)
    : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {}
};

std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs);
std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs);
1158
/* identifies type of extent, used for interpreting deltas, managing
 * writeback.
 *
 * Note that any new extent type needs to be added to
 * Cache::get_extent_by_type in cache.cc.
 *
 * Values are encoded on disk (see delta_info_t::DENC below), so existing
 * entries must not be renumbered.
 */
enum class extent_types_t : uint8_t {
  ROOT = 0,
  LADDR_INTERNAL = 1,
  LADDR_LEAF = 2,
  DINK_LADDR_LEAF = 3, // should only be used for unitttests
  OMAP_INNER = 4,
  OMAP_LEAF = 5,
  ONODE_BLOCK_STAGED = 6,
  COLL_BLOCK = 7,
  OBJECT_DATA_BLOCK = 8,
  RETIRED_PLACEHOLDER = 9,
  // the following two types are not extent types,
  // they are just used to indicates paddr allocation deltas
  ALLOC_INFO = 10,
  JOURNAL_TAIL = 11,
  // Test Block Types
  TEST_BLOCK = 12,
  TEST_BLOCK_PHYSICAL = 13,
  BACKREF_INTERNAL = 14,
  BACKREF_LEAF = 15,
  // None and the number of valid extent_types_t
  NONE = 16,
};
// raw on-disk representation of extent_types_t
using extent_types_le_t = uint8_t;
// number of valid extent types (== NONE)
constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE);

// fixed node size of the backref tree
constexpr size_t BACKREF_NODE_SIZE = 4096;

std::ostream &operator<<(std::ostream &out, extent_types_t t);
1194
1195 constexpr bool is_logical_type(extent_types_t type) {
1196 switch (type) {
1197 case extent_types_t::ROOT:
1198 case extent_types_t::LADDR_INTERNAL:
1199 case extent_types_t::LADDR_LEAF:
1200 case extent_types_t::BACKREF_INTERNAL:
1201 case extent_types_t::BACKREF_LEAF:
1202 return false;
1203 default:
1204 return true;
1205 }
1206 }
1207
1208 constexpr bool is_retired_placeholder(extent_types_t type)
1209 {
1210 return type == extent_types_t::RETIRED_PLACEHOLDER;
1211 }
1212
1213 constexpr bool is_lba_node(extent_types_t type)
1214 {
1215 return type == extent_types_t::LADDR_INTERNAL ||
1216 type == extent_types_t::LADDR_LEAF ||
1217 type == extent_types_t::DINK_LADDR_LEAF;
1218 }
1219
1220 constexpr bool is_backref_node(extent_types_t type)
1221 {
1222 return type == extent_types_t::BACKREF_INTERNAL ||
1223 type == extent_types_t::BACKREF_LEAF;
1224 }
1225
1226 constexpr bool is_lba_backref_node(extent_types_t type)
1227 {
1228 return is_lba_node(type) || is_backref_node(type);
1229 }
1230
// NOTE(review): duplicate redeclaration -- operator<< for extent_types_t
// is already declared above; harmless, but could be dropped.
std::ostream &operator<<(std::ostream &out, extent_types_t t);

/**
 * rewrite_gen_t
 *
 * The goal is to group the similar aged extents in the same segment for better
 * bimodal utilization distribution, and also to the same device tier. For EPM,
 * it has the flexibility to make placement decisions by re-assigning the
 * generation. And each non-inline generation will be statically mapped to a
 * writer in EPM.
 *
 * All the fresh and dirty extents start with INIT_GENERATION upon allocation,
 * and they will be assigned to INLINE/OOL generation by EPM before the initial
 * writes. After that, the generation can only be increased upon rewrite.
 *
 * Note, although EPM can re-assign the generations according to the tiering
 * status, it cannot decrease the generation for the correctness of space
 * reservation. It may choose to assign a larger generation if the extent is
 * hinted cold, or if want to evict extents to the cold tier. And it may choose
 * to not increase the generation if want to keep the hot tier as filled as
 * possible.
 */
using rewrite_gen_t = uint8_t;

// INIT_GENERATION requires EPM decision to INLINE/OOL_GENERATION
constexpr rewrite_gen_t INIT_GENERATION = 0;
constexpr rewrite_gen_t INLINE_GENERATION = 1; // to the journal
constexpr rewrite_gen_t OOL_GENERATION = 2;

// All the rewritten extents start with MIN_REWRITE_GENERATION
constexpr rewrite_gen_t MIN_REWRITE_GENERATION = 3;
// without cold tier, the largest generation is less than MIN_COLD_GENERATION
constexpr rewrite_gen_t MIN_COLD_GENERATION = 5;
constexpr rewrite_gen_t MAX_REWRITE_GENERATION = 7;
// total number of generations, i.e. one past the maximum
constexpr rewrite_gen_t REWRITE_GENERATIONS = MAX_REWRITE_GENERATION + 1;
// NULL sentinel (max value of the underlying type)
constexpr rewrite_gen_t NULL_GENERATION =
  std::numeric_limits<rewrite_gen_t>::max();
1268
// wrapper so a rewrite generation can be streamed with symbolic names
struct rewrite_gen_printer_t {
  rewrite_gen_t gen;
};

std::ostream &operator<<(std::ostream &out, rewrite_gen_printer_t gen);
1274
// Maps a generation to the index of its statically-assigned EPM writer;
// only meaningful for gen >= OOL_GENERATION (would wrap otherwise).
constexpr std::size_t generation_to_writer(rewrite_gen_t gen) {
  // caller to assert the gen is in the reasonable range
  return gen - OOL_GENERATION;
}
1279
// before EPM decision
constexpr bool is_target_rewrite_generation(rewrite_gen_t gen) {
  // NOTE(review): the inclusive upper bound (<= REWRITE_GENERATIONS, one
  // past MAX_REWRITE_GENERATION) differs from is_rewrite_generation()'s
  // exclusive bound below -- presumably EPM clamps an over-the-top target;
  // confirm against ExtentPlacementManager before tightening.
  return gen == INIT_GENERATION ||
    (gen >= MIN_REWRITE_GENERATION &&
     gen <= REWRITE_GENERATIONS);
}
1286
1287 // after EPM decision
1288 constexpr bool is_rewrite_generation(rewrite_gen_t gen) {
1289 return gen >= INLINE_GENERATION &&
1290 gen < REWRITE_GENERATIONS;
1291 }
1292
// coarse classification used to place data and metadata separately
enum class data_category_t : uint8_t {
  METADATA = 0,
  DATA,
  NUM
};

std::ostream &operator<<(std::ostream &out, data_category_t c);
1300
1301 constexpr data_category_t get_extent_category(extent_types_t type) {
1302 if (type == extent_types_t::OBJECT_DATA_BLOCK ||
1303 type == extent_types_t::TEST_BLOCK) {
1304 return data_category_t::DATA;
1305 } else {
1306 return data_category_t::METADATA;
1307 }
1308 }
1309
// type for extent modification time, milliseconds since the epoch
using sea_time_point = seastar::lowres_system_clock::time_point;
using sea_duration = seastar::lowres_system_clock::duration;
using mod_time_point_t = int64_t;

// convert a time point to its on-disk millisecond representation
constexpr mod_time_point_t
timepoint_to_mod(const sea_time_point &t) {
  return std::chrono::duration_cast<std::chrono::milliseconds>(
      t.time_since_epoch()).count();
}

// inverse of timepoint_to_mod (up to millisecond truncation)
constexpr sea_time_point
mod_to_timepoint(mod_time_point_t t) {
  return sea_time_point(std::chrono::duration_cast<sea_duration>(
      std::chrono::milliseconds(t)));
}

// epoch time point / 0 serve as the "unknown time" sentinels
constexpr auto NULL_TIME = sea_time_point();
constexpr auto NULL_MOD_TIME = timepoint_to_mod(NULL_TIME);

// stream wrapper printing a time point human-readably
struct sea_time_point_printer_t {
  sea_time_point tp;
};
std::ostream &operator<<(std::ostream &out, sea_time_point_printer_t tp);

// stream wrapper printing an on-disk mod time human-readably
struct mod_time_point_printer_t {
  mod_time_point_t tp;
};
std::ostream &operator<<(std::ostream &out, mod_time_point_printer_t tp);
1339
// Weighted average of two (valid, non-NULL) modification times with
// weights n1 and n2.  Divides before multiplying to avoid overflowing
// the tick counter, at the cost of a little precision.
constexpr sea_time_point
get_average_time(const sea_time_point& t1, std::size_t n1,
                 const sea_time_point& t2, std::size_t n2) {
  assert(t1 != NULL_TIME);
  assert(t2 != NULL_TIME);
  auto new_size = n1 + n2;
  assert(new_size > 0);
  auto c1 = t1.time_since_epoch().count();
  auto c2 = t2.time_since_epoch().count();
  // ~= (c1*n1 + c2*n2) / new_size, but overflow-safe
  auto c_ret = c1 / new_size * n1 + c2 / new_size * n2;
  return sea_time_point(sea_duration(c_ret));
}
1352
/* description of a new physical extent */
struct extent_t {
  extent_types_t type;  ///< type of extent
  laddr_t addr;         ///< laddr of extent (L_ADDR_NULL for non-logical)
  ceph::bufferlist bl;  ///< payload, bl.length() == length, aligned
};

// per-extent mutation counter, incremented on each delta
using extent_version_t = uint32_t;
1361
1362 /* description of a mutation to a physical extent */
1363 struct delta_info_t {
1364 extent_types_t type = extent_types_t::NONE; ///< delta type
1365 paddr_t paddr; ///< physical address
1366 laddr_t laddr = L_ADDR_NULL; ///< logical address
1367 uint32_t prev_crc = 0;
1368 uint32_t final_crc = 0;
1369 extent_len_t length = 0; ///< extent length
1370 extent_version_t pversion; ///< prior version
1371 segment_seq_t ext_seq; ///< seq of the extent's segment
1372 segment_type_t seg_type;
1373 ceph::bufferlist bl; ///< payload
1374
1375 DENC(delta_info_t, v, p) {
1376 DENC_START(1, 1, p);
1377 denc(v.type, p);
1378 denc(v.paddr, p);
1379 denc(v.laddr, p);
1380 denc(v.prev_crc, p);
1381 denc(v.final_crc, p);
1382 denc(v.length, p);
1383 denc(v.pversion, p);
1384 denc(v.ext_seq, p);
1385 denc(v.seg_type, p);
1386 denc(v.bl, p);
1387 DENC_FINISH(p);
1388 }
1389
1390 bool operator==(const delta_info_t &rhs) const {
1391 return (
1392 type == rhs.type &&
1393 paddr == rhs.paddr &&
1394 laddr == rhs.laddr &&
1395 prev_crc == rhs.prev_crc &&
1396 final_crc == rhs.final_crc &&
1397 length == rhs.length &&
1398 pversion == rhs.pversion &&
1399 ext_seq == rhs.ext_seq &&
1400 bl == rhs.bl
1401 );
1402 }
1403 };
1404
std::ostream &operator<<(std::ostream &out, const delta_info_t &delta);

/* contains the latest journal tail information */
struct journal_tail_delta_t {
  journal_seq_t alloc_tail;  ///< latest tail for allocation-info replay
  journal_seq_t dirty_tail;  ///< latest tail for dirty-extent replay

  DENC(journal_tail_delta_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.alloc_tail, p);
    denc(v.dirty_tail, p);
    DENC_FINISH(p);
  }
};

std::ostream &operator<<(std::ostream &out, const journal_tail_delta_t &delta);
1421
1422 class object_data_t {
1423 laddr_t reserved_data_base = L_ADDR_NULL;
1424 extent_len_t reserved_data_len = 0;
1425
1426 bool dirty = false;
1427 public:
1428 object_data_t(
1429 laddr_t reserved_data_base,
1430 extent_len_t reserved_data_len)
1431 : reserved_data_base(reserved_data_base),
1432 reserved_data_len(reserved_data_len) {}
1433
1434 laddr_t get_reserved_data_base() const {
1435 return reserved_data_base;
1436 }
1437
1438 extent_len_t get_reserved_data_len() const {
1439 return reserved_data_len;
1440 }
1441
1442 bool is_null() const {
1443 return reserved_data_base == L_ADDR_NULL;
1444 }
1445
1446 bool must_update() const {
1447 return dirty;
1448 }
1449
1450 void update_reserved(
1451 laddr_t base,
1452 extent_len_t len) {
1453 dirty = true;
1454 reserved_data_base = base;
1455 reserved_data_len = len;
1456 }
1457
1458 void update_len(
1459 extent_len_t len) {
1460 dirty = true;
1461 reserved_data_len = len;
1462 }
1463
1464 void clear() {
1465 dirty = true;
1466 reserved_data_base = L_ADDR_NULL;
1467 reserved_data_len = 0;
1468 }
1469 };
1470
// packed little-endian on-disk form of object_data_t (dirty flag is
// in-memory only and not persisted)
struct __attribute__((packed)) object_data_le_t {
  laddr_le_t reserved_data_base = laddr_le_t(L_ADDR_NULL);
  extent_len_le_t reserved_data_len = init_extent_len_le(0);

  // persist the current reservation
  void update(const object_data_t &nroot) {
    reserved_data_base = nroot.get_reserved_data_base();
    reserved_data_len = init_extent_len_le(nroot.get_reserved_data_len());
  }

  // load into the in-memory form (clean, i.e. must_update() == false)
  object_data_t get() const {
    return object_data_t(
      reserved_data_base,
      reserved_data_len);
  }
};
1486
1487 struct omap_root_t {
1488 laddr_t addr = L_ADDR_NULL;
1489 depth_t depth = 0;
1490 laddr_t hint = L_ADDR_MIN;
1491 bool mutated = false;
1492
1493 omap_root_t() = default;
1494 omap_root_t(laddr_t addr, depth_t depth, laddr_t addr_min)
1495 : addr(addr),
1496 depth(depth),
1497 hint(addr_min) {}
1498
1499 omap_root_t(const omap_root_t &o) = default;
1500 omap_root_t(omap_root_t &&o) = default;
1501 omap_root_t &operator=(const omap_root_t &o) = default;
1502 omap_root_t &operator=(omap_root_t &&o) = default;
1503
1504 bool is_null() const {
1505 return addr == L_ADDR_NULL;
1506 }
1507
1508 bool must_update() const {
1509 return mutated;
1510 }
1511
1512 void update(laddr_t _addr, depth_t _depth, laddr_t _hint) {
1513 mutated = true;
1514 addr = _addr;
1515 depth = _depth;
1516 hint = _hint;
1517 }
1518
1519 laddr_t get_location() const {
1520 return addr;
1521 }
1522
1523 depth_t get_depth() const {
1524 return depth;
1525 }
1526
1527 laddr_t get_hint() const {
1528 return hint;
1529 }
1530 };
1531 std::ostream &operator<<(std::ostream &out, const omap_root_t &root);
1532
// packed little-endian on-disk form of omap_root_t (the hint and the
// mutation flag are in-memory only and not persisted)
class __attribute__((packed)) omap_root_le_t {
  laddr_le_t addr = laddr_le_t(L_ADDR_NULL);
  depth_le_t depth = init_depth_le(0);

public:
  omap_root_le_t() = default;

  omap_root_le_t(laddr_t addr, depth_t depth)
    : addr(addr), depth(init_depth_le(depth)) {}

  omap_root_le_t(const omap_root_le_t &o) = default;
  omap_root_le_t(omap_root_le_t &&o) = default;
  omap_root_le_t &operator=(const omap_root_le_t &o) = default;
  omap_root_le_t &operator=(omap_root_le_t &&o) = default;

  // persist the current root location and depth
  void update(const omap_root_t &nroot) {
    addr = nroot.get_location();
    depth = init_depth_le(nroot.get_depth());
  }

  // load into the in-memory form, re-supplying the allocation hint
  omap_root_t get(laddr_t hint) const {
    return omap_root_t(addr, depth, hint);
  }
};
1557
1558 /**
1559 * phy_tree_root_t
1560 */
1561 class __attribute__((packed)) phy_tree_root_t {
1562 paddr_le_t root_addr;
1563 depth_le_t depth = init_extent_len_le(0);
1564
1565 public:
1566 phy_tree_root_t() = default;
1567
1568 phy_tree_root_t(paddr_t addr, depth_t depth)
1569 : root_addr(addr), depth(init_depth_le(depth)) {}
1570
1571 phy_tree_root_t(const phy_tree_root_t &o) = default;
1572 phy_tree_root_t(phy_tree_root_t &&o) = default;
1573 phy_tree_root_t &operator=(const phy_tree_root_t &o) = default;
1574 phy_tree_root_t &operator=(phy_tree_root_t &&o) = default;
1575
1576 paddr_t get_location() const {
1577 return root_addr;
1578 }
1579
1580 void set_location(paddr_t location) {
1581 root_addr = location;
1582 }
1583
1584 depth_t get_depth() const {
1585 return depth;
1586 }
1587
1588 void set_depth(depth_t ndepth) {
1589 depth = ndepth;
1590 }
1591
1592 void adjust_addrs_from_base(paddr_t base) {
1593 paddr_t _root_addr = root_addr;
1594 if (_root_addr.is_relative()) {
1595 root_addr = base.add_record_relative(_root_addr);
1596 }
1597 }
1598 };
1599
1600 class coll_root_t {
1601 laddr_t addr = L_ADDR_NULL;
1602 extent_len_t size = 0;
1603
1604 bool mutated = false;
1605
1606 public:
1607 coll_root_t() = default;
1608 coll_root_t(laddr_t addr, extent_len_t size) : addr(addr), size(size) {}
1609
1610 coll_root_t(const coll_root_t &o) = default;
1611 coll_root_t(coll_root_t &&o) = default;
1612 coll_root_t &operator=(const coll_root_t &o) = default;
1613 coll_root_t &operator=(coll_root_t &&o) = default;
1614
1615 bool must_update() const {
1616 return mutated;
1617 }
1618
1619 void update(laddr_t _addr, extent_len_t _s) {
1620 mutated = true;
1621 addr = _addr;
1622 size = _s;
1623 }
1624
1625 laddr_t get_location() const {
1626 return addr;
1627 }
1628
1629 extent_len_t get_size() const {
1630 return size;
1631 }
1632 };
1633
1634 /**
1635 * coll_root_le_t
1636 *
1637 * Information for locating CollectionManager information, to be embedded
1638 * in root block.
1639 */
1640 class __attribute__((packed)) coll_root_le_t {
1641 laddr_le_t addr;
1642 extent_len_le_t size = init_extent_len_le(0);
1643
1644 public:
1645 coll_root_le_t() = default;
1646
1647 coll_root_le_t(laddr_t laddr, extent_len_t size)
1648 : addr(laddr), size(init_extent_len_le(size)) {}
1649
1650
1651 coll_root_le_t(const coll_root_le_t &o) = default;
1652 coll_root_le_t(coll_root_le_t &&o) = default;
1653 coll_root_le_t &operator=(const coll_root_le_t &o) = default;
1654 coll_root_le_t &operator=(coll_root_le_t &&o) = default;
1655
1656 void update(const coll_root_t &nroot) {
1657 addr = nroot.get_location();
1658 size = init_extent_len_le(nroot.get_size());
1659 }
1660
1661 coll_root_t get() const {
1662 return coll_root_t(addr, size);
1663 }
1664 };
1665
using lba_root_t = phy_tree_root_t;
using backref_root_t = phy_tree_root_t;

/**
 * root_t
 *
 * Contains information required to find metadata roots.
 * TODO: generalize this to permit more than one lba_manager implementation
 */
struct __attribute__((packed)) root_t {
  using meta_t = std::map<std::string, std::string>;

  // fixed budget for the encoded meta map embedded in the root block
  static constexpr int MAX_META_LENGTH = 1024;

  backref_root_t backref_root;
  lba_root_t lba_root;
  laddr_le_t onode_root;
  coll_root_le_t collection_root;

  // encoded meta_t, zero-padded to MAX_META_LENGTH
  char meta[MAX_META_LENGTH];

  root_t() {
    set_meta(meta_t{});
  }

  // rebase any record-relative tree roots onto their absolute base
  void adjust_addrs_from_base(paddr_t base) {
    lba_root.adjust_addrs_from_base(base);
    backref_root.adjust_addrs_from_base(base);
  }

  // decode the embedded meta map; non-const because the bufferlist wraps
  // the meta buffer without copying it
  meta_t get_meta() {
    bufferlist bl;
    bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta));
    meta_t ret;
    auto iter = bl.cbegin();
    decode(ret, iter);
    return ret;
  }

  // encode m into the fixed buffer; asserts it fits within the budget
  void set_meta(const meta_t &m) {
    ceph::bufferlist bl;
    encode(m, bl);
    ceph_assert(bl.length() < MAX_META_LENGTH);
    bl.rebuild();
    auto &bptr = bl.front();
    ::memset(meta, 0, MAX_META_LENGTH);
    ::memcpy(meta, bptr.c_str(), bl.length());
  }
};
1715
// one allocated (or released) block recorded in an alloc_delta_t
struct alloc_blk_t {
  alloc_blk_t(
    paddr_t paddr,
    laddr_t laddr,
    extent_len_t len,
    extent_types_t type)
    : paddr(paddr), laddr(laddr), len(len), type(type)
  {}

  explicit alloc_blk_t() = default;

  paddr_t paddr = P_ADDR_NULL;               // physical address of the block
  laddr_t laddr = L_ADDR_NULL;               // logical address, if any
  extent_len_t len = 0;                      // length of the block
  extent_types_t type = extent_types_t::ROOT;// extent type of the block
  DENC(alloc_blk_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.paddr, p);
    denc(v.laddr, p);
    denc(v.len, p);
    denc(v.type, p);
    DENC_FINISH(p);
  }
};
1740
// use absolute address
// batch of allocation (SET) or release (CLEAR) records
struct alloc_delta_t {
  enum class op_types_t : uint8_t {
    NONE = 0,
    SET = 1,
    CLEAR = 2
  };
  std::vector<alloc_blk_t> alloc_blk_ranges;
  op_types_t op = op_types_t::NONE;

  alloc_delta_t() = default;

  DENC(alloc_delta_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.alloc_blk_ranges, p);
    denc(v.op, p);
    DENC_FINISH(p);
  }
};
1760
// summary of a fresh extent as recorded in the record metadata;
// the length is taken from the extent's payload
struct extent_info_t {
  extent_types_t type = extent_types_t::NONE;
  laddr_t addr = L_ADDR_NULL;
  extent_len_t len = 0;

  extent_info_t() = default;
  extent_info_t(const extent_t &et)
    : type(et.type), addr(et.addr),
      len(et.bl.length())
  {}

  DENC(extent_info_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.type, p);
    denc(v.addr, p);
    denc(v.len, p);
    DENC_FINISH(p);
  }
};
std::ostream &operator<<(std::ostream &out, const extent_info_t &header);
1781
// per-segment nonce mixed into record headers to detect stale records
using segment_nonce_t = uint32_t;

/**
 * Segment header
 *
 * Every segment contains an encoded segment_header_t in the first block.
 * Our strategy for finding the journal replay point is:
 * 1) Find the segment with the highest journal_segment_seq
 * 2) Get dirty_tail and alloc_tail from the segment header
 * 3) Scan forward to update tails from journal_tail_delta_t
 * 4) Replay from the latest tails
 */
struct segment_header_t {
  segment_seq_t segment_seq;
  segment_id_t physical_segment_id; // debugging

  journal_seq_t dirty_tail;
  journal_seq_t alloc_tail;
  segment_nonce_t segment_nonce;

  segment_type_t type;

  data_category_t category;
  rewrite_gen_t generation;

  segment_type_t get_type() const {
    return type;
  }

  DENC(segment_header_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.segment_seq, p);
    denc(v.physical_segment_id, p);
    denc(v.dirty_tail, p);
    denc(v.alloc_tail, p);
    denc(v.segment_nonce, p);
    denc(v.type, p);
    denc(v.category, p);
    denc(v.generation, p);
    DENC_FINISH(p);
  }
};
std::ostream &operator<<(std::ostream &out, const segment_header_t &header);
1825
// written at the end of a closed segment; records aggregate stats used
// by cleaning (modify_time / num_extents)
struct segment_tail_t {
  segment_seq_t segment_seq;
  segment_id_t physical_segment_id; // debugging

  segment_nonce_t segment_nonce;

  segment_type_t type;

  mod_time_point_t modify_time; // average modify time of the extents
  std::size_t num_extents;

  segment_type_t get_type() const {
    return type;
  }

  DENC(segment_tail_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.segment_seq, p);
    denc(v.physical_segment_id, p);
    denc(v.segment_nonce, p);
    denc(v.type, p);
    denc(v.modify_time, p);
    denc(v.num_extents, p);
    DENC_FINISH(p);
  }
};
std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail);
1853
// kinds of seastore transactions; ordering matters: everything from
// TRIM_DIRTY onwards is considered a background transaction
enum class transaction_type_t : uint8_t {
  MUTATE = 0,
  READ, // including weak and non-weak read transactions
  TRIM_DIRTY,
  TRIM_ALLOC,
  CLEANER_MAIN,
  CLEANER_COLD,
  MAX
};

static constexpr auto TRANSACTION_TYPE_NULL = transaction_type_t::MAX;

// number of valid transaction types
static constexpr auto TRANSACTION_TYPE_MAX = static_cast<std::size_t>(
    transaction_type_t::MAX);

std::ostream &operator<<(std::ostream &os, transaction_type_t type);
1870
1871 constexpr bool is_valid_transaction(transaction_type_t type) {
1872 return type < transaction_type_t::MAX;
1873 }
1874
1875 constexpr bool is_background_transaction(transaction_type_t type) {
1876 return (type >= transaction_type_t::TRIM_DIRTY &&
1877 type < transaction_type_t::MAX);
1878 }
1879
1880 constexpr bool is_trim_transaction(transaction_type_t type) {
1881 return (type == transaction_type_t::TRIM_DIRTY ||
1882 type == transaction_type_t::TRIM_ALLOC);
1883 }
1884
// running size accounting for a single record under construction
struct record_size_t {
  extent_len_t plain_mdlength = 0; // mdlength without the record header
  extent_len_t dlength = 0;        // total data (extent payload) length

  // metadata length including the record header
  extent_len_t get_raw_mdlength() const;

  bool is_empty() const {
    return plain_mdlength == 0 &&
           dlength == 0;
  }

  // account a fresh extent of the given payload length
  void account_extent(extent_len_t extent_len);

  void account(const extent_t& extent) {
    account_extent(extent.bl.length());
  }

  // account a delta's metadata contribution
  void account(const delta_info_t& delta);

  bool operator==(const record_size_t &) const = default;
};
std::ostream &operator<<(std::ostream&, const record_size_t&);
1907
1908 struct record_t {
1909 transaction_type_t type = TRANSACTION_TYPE_NULL;
1910 std::vector<extent_t> extents;
1911 std::vector<delta_info_t> deltas;
1912 record_size_t size;
1913 sea_time_point modify_time = NULL_TIME;
1914
1915 record_t(transaction_type_t type) : type{type} { }
1916
1917 // unit test only
1918 record_t() {
1919 type = transaction_type_t::MUTATE;
1920 }
1921
1922 // unit test only
1923 record_t(std::vector<extent_t>&& _extents,
1924 std::vector<delta_info_t>&& _deltas) {
1925 auto modify_time = seastar::lowres_system_clock::now();
1926 for (auto& e: _extents) {
1927 push_back(std::move(e), modify_time);
1928 }
1929 for (auto& d: _deltas) {
1930 push_back(std::move(d));
1931 }
1932 type = transaction_type_t::MUTATE;
1933 }
1934
1935 bool is_empty() const {
1936 return extents.size() == 0 &&
1937 deltas.size() == 0;
1938 }
1939
1940 std::size_t get_delta_size() const {
1941 auto delta_size = std::accumulate(
1942 deltas.begin(), deltas.end(), 0,
1943 [](uint64_t sum, auto& delta) {
1944 return sum + delta.bl.length();
1945 }
1946 );
1947 return delta_size;
1948 }
1949
1950 void push_back(extent_t&& extent, sea_time_point &t) {
1951 ceph_assert(t != NULL_TIME);
1952 if (extents.size() == 0) {
1953 assert(modify_time == NULL_TIME);
1954 modify_time = t;
1955 } else {
1956 modify_time = get_average_time(modify_time, extents.size(), t, 1);
1957 }
1958 size.account(extent);
1959 extents.push_back(std::move(extent));
1960 }
1961
1962 void push_back(delta_info_t&& delta) {
1963 size.account(delta);
1964 deltas.push_back(std::move(delta));
1965 }
1966 };
std::ostream &operator<<(std::ostream&, const record_t&);

// encoded per-record header within a record group
struct record_header_t {
  transaction_type_t type;
  uint32_t deltas;       // number of deltas
  uint32_t extents;      // number of extents
  mod_time_point_t modify_time;

  DENC(record_header_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.type, p);
    denc(v.deltas, p);
    denc(v.extents, p);
    denc(v.modify_time, p);
    DENC_FINISH(p);
  }
};
std::ostream &operator<<(std::ostream&, const record_header_t&);
1985
// header written at the start of each record group in the journal
struct record_group_header_t {
  uint32_t records;             // number of records in the group
  extent_len_t mdlength;        // block aligned, length of metadata
  extent_len_t dlength;         // block aligned, length of data
  segment_nonce_t segment_nonce;// nonce of containing segment
  journal_seq_t committed_to;   // records prior to committed_to have been
                                // fully written, maybe in another segment.
  checksum_t data_crc;          // crc of data payload


  DENC(record_group_header_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.records, p);
    denc(v.mdlength, p);
    denc(v.dlength, p);
    denc(v.segment_nonce, p);
    denc(v.committed_to, p);
    denc(v.data_crc, p);
    DENC_FINISH(p);
  }
};
std::ostream& operator<<(std::ostream&, const record_group_header_t&);
2008
// running size accounting for a record group; lengths are rounded up to
// the device block size
struct record_group_size_t {
  extent_len_t plain_mdlength = 0; // mdlength without the group header
  extent_len_t dlength = 0;        // total data length
  extent_len_t block_size = 0;     // device block size used for rounding

  record_group_size_t() = default;
  record_group_size_t(
      const record_size_t& rsize,
      extent_len_t block_size) {
    account(rsize, block_size);
  }

  // metadata length including the group header, before block alignment
  extent_len_t get_raw_mdlength() const;

  // block-aligned metadata length
  extent_len_t get_mdlength() const {
    assert(block_size > 0);
    return p2roundup(get_raw_mdlength(), block_size);
  }

  // total encoded length (aligned metadata + aligned data)
  extent_len_t get_encoded_length() const {
    assert(block_size > 0);
    assert(dlength % block_size == 0);
    return get_mdlength() + dlength;
  }

  // size the group would have after also accounting rsize
  record_group_size_t get_encoded_length_after(
      const record_size_t& rsize,
      extent_len_t block_size) const {
    record_group_size_t tmp = *this;
    tmp.account(rsize, block_size);
    return tmp;
  }

  // fraction of the encoded length that is payload (vs alignment padding)
  double get_fullness() const {
    assert(block_size > 0);
    return ((double)(get_raw_mdlength() + dlength) /
            get_encoded_length());
  }

  void account(const record_size_t& rsize,
               extent_len_t block_size);

  bool operator==(const record_group_size_t &) const = default;
};
std::ostream& operator<<(std::ostream&, const record_group_size_t&);
2054
// a batch of records encoded and submitted to the journal together
struct record_group_t {
  std::vector<record_t> records;
  record_group_size_t size;

  record_group_t() = default;
  record_group_t(
      record_t&& record,
      extent_len_t block_size) {
    push_back(std::move(record), block_size);
  }

  // number of records in the group
  std::size_t get_size() const {
    return records.size();
  }

  // append a record and fold its size into the group accounting
  void push_back(
      record_t&& record,
      extent_len_t block_size) {
    size.account(record.size, block_size);
    records.push_back(std::move(record));
    // a group must stay addressable within a single segment
    assert(size.get_encoded_length() < SEGMENT_OFF_MAX);
  }

  void reserve(std::size_t limit) {
    records.reserve(limit);
  }

  // reset to an empty group
  void clear() {
    records.clear();
    size = {};
  }
};
std::ostream& operator<<(std::ostream&, const record_group_t&);
2088
// encode a single record as a one-record group
ceph::bufferlist encode_record(
  record_t&& record,
  extent_len_t block_size,
  const journal_seq_t& committed_to,
  segment_nonce_t current_segment_nonce);

// encode a whole record group (header + metadata + data)
ceph::bufferlist encode_records(
  record_group_t& record_group,
  const journal_seq_t& committed_to,
  segment_nonce_t current_segment_nonce);

// decode and validate a group header; nullopt if invalid or nonce
// mismatch
std::optional<record_group_header_t>
try_decode_records_header(
    const ceph::bufferlist& header_bl,
    segment_nonce_t expected_nonce);

// verify the crc of the metadata portion
bool validate_records_metadata(
    const ceph::bufferlist& md_bl);

// verify the data payload against header.data_crc
bool validate_records_data(
    const record_group_header_t& header,
    const ceph::bufferlist& data_bl);

// per-record header plus the fresh-extent summaries that follow it
struct record_extent_infos_t {
  record_header_t header;
  std::vector<extent_info_t> extent_infos;
};
std::optional<std::vector<record_extent_infos_t> >
try_decode_extent_infos(
    const record_group_header_t& header,
    const ceph::bufferlist& md_bl);
std::optional<std::vector<record_header_t>>
try_decode_record_headers(
    const record_group_header_t& header,
    const ceph::bufferlist& md_bl);

// deltas of one record, paired with their modify time, rebased onto
// record_block_base
struct record_deltas_t {
  paddr_t record_block_base;
  std::vector<std::pair<sea_time_point, delta_info_t>> deltas;
};
std::optional<std::vector<record_deltas_t> >
try_decode_deltas(
    const record_group_header_t& header,
    const ceph::bufferlist& md_bl,
    paddr_t record_block_base);
2134
// where a journal write landed: its start sequence and length
struct write_result_t {
  journal_seq_t start_seq;
  extent_len_t length;

  // sequence immediately after the written region
  journal_seq_t get_end_seq() const {
    return journal_seq_t{
      start_seq.segment_seq,
      start_seq.offset.add_offset(length)};
  }
};
std::ostream& operator<<(std::ostream&, const write_result_t&);
2146
/// Where a record landed in the journal: the base address of its data
/// blocks plus the result of the write that carried it.
struct record_locator_t {
  paddr_t record_block_base;
  write_result_t write_result;
};
std::ostream& operator<<(std::ostream&, const record_locator_t&);
2152
2153 /// scan segment for end incrementally
2154 struct scan_valid_records_cursor {
2155 bool last_valid_header_found = false;
2156 journal_seq_t seq;
2157 journal_seq_t last_committed;
2158 std::size_t num_consumed_records = 0;
2159 extent_len_t block_size = 0;
2160
2161 struct found_record_group_t {
2162 paddr_t offset;
2163 record_group_header_t header;
2164 bufferlist mdbuffer;
2165
2166 found_record_group_t(
2167 paddr_t offset,
2168 const record_group_header_t &header,
2169 const bufferlist &mdbuffer)
2170 : offset(offset), header(header), mdbuffer(mdbuffer) {}
2171 };
2172 std::deque<found_record_group_t> pending_record_groups;
2173
2174 bool is_complete() const {
2175 return last_valid_header_found && pending_record_groups.empty();
2176 }
2177
2178 segment_id_t get_segment_id() const {
2179 return seq.offset.as_seg_paddr().get_segment_id();
2180 }
2181
2182 segment_off_t get_segment_offset() const {
2183 return seq.offset.as_seg_paddr().get_segment_off();
2184 }
2185
2186 extent_len_t get_block_size() const {
2187 return block_size;
2188 }
2189
2190 void increment_seq(segment_off_t off) {
2191 seq.offset = seq.offset.add_offset(off);
2192 }
2193
2194 void emplace_record_group(const record_group_header_t&, ceph::bufferlist&&);
2195
2196 void pop_record_group() {
2197 assert(!pending_record_groups.empty());
2198 ++num_consumed_records;
2199 pending_record_groups.pop_front();
2200 }
2201
2202 scan_valid_records_cursor(
2203 journal_seq_t seq)
2204 : seq(seq) {}
2205 };
2206 std::ostream& operator<<(std::ostream&, const scan_valid_records_cursor&);
2207
2208 }
2209
// Register bounded denc (encode/decode) traits for the on-disk seastore
// types declared above so they can be used with ceph's denc machinery.
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_id_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_tail_delta_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_group_header_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_blk_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_delta_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_tail_t)
2223
// fmt v9+ no longer falls back to operator<< implicitly; explicitly opt
// each stream-printable seastore type into fmt via ostream_formatter.
#if FMT_VERSION >= 90000
template <> struct fmt::formatter<crimson::os::seastore::data_category_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::delta_info_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::device_id_printer_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::extent_types_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::journal_seq_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::journal_tail_delta_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::laddr_list_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::omap_root_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::paddr_list_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::paddr_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::pladdr_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::placement_hint_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::device_type_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::record_group_header_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::record_group_size_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::record_header_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::record_locator_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::record_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::rewrite_gen_printer_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::scan_valid_records_cursor> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::sea_time_point_printer_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::segment_header_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::segment_id_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::segment_seq_printer_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::segment_tail_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::segment_type_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::transaction_type_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::write_result_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<ceph::buffer::list> : fmt::ostream_formatter {};
#endif