]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include <limits> | |
20effc67 TL |
7 | #include <numeric> |
8 | #include <optional> | |
f67539c2 | 9 | #include <iostream> |
20effc67 | 10 | #include <vector> |
f67539c2 TL |
11 | |
12 | #include "include/byteorder.h" | |
13 | #include "include/denc.h" | |
14 | #include "include/buffer.h" | |
15 | #include "include/cmp.h" | |
16 | #include "include/uuid.h" | |
20effc67 | 17 | #include "include/interval_set.h" |
f67539c2 TL |
18 | |
19 | namespace crimson::os::seastore { | |
20 | ||
20effc67 TL |
21 | using depth_t = uint32_t; |
22 | using depth_le_t = ceph_le32; | |
23 | ||
24 | inline depth_le_t init_depth_le(uint32_t i) { | |
25 | return ceph_le32(i); | |
26 | } | |
f67539c2 TL |
27 | |
28 | using checksum_t = uint32_t; | |
29 | ||
// Immutable metadata for seastore to set at mkfs time
struct seastore_meta_t {
  // uuid of this seastore instance; the only field that gets encoded
  uuid_d seastore_id;

  // Encode/decode via the denc framework.
  // NOTE(review): DENC_START(1, 1, p) presumably declares struct
  // version 1 / compat version 1 -- confirm against include/denc.h.
  DENC(seastore_meta_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.seastore_id, p);
    DENC_FINISH(p);
  }
};
40 | ||
20effc67 TL |
41 | std::ostream& operator<<(std::ostream& out, const seastore_meta_t& meta); |
42 | ||
43 | // identifies a specific physical device within seastore | |
44 | using device_id_t = uint8_t; | |
45 | ||
46 | constexpr uint16_t SEGMENT_ID_LEN_BITS = 24; | |
47 | ||
48 | // order of device_id_t | |
49 | constexpr uint16_t DEVICE_ID_LEN_BITS = 8; | |
50 | ||
51 | // 1 bit to identify address type | |
52 | ||
53 | // segment ids without a device id encapsulated | |
54 | using device_segment_id_t = uint32_t; | |
55 | ||
// Largest encodable device id. The extra "+ 1" in the shift drops one bit
// of device_id_t; with the 8 bit device_id_t and DEVICE_ID_LEN_BITS == 8
// this evaluates to 255 >> 1 == 127.
constexpr device_id_t DEVICE_ID_MAX =
  (std::numeric_limits<device_id_t>::max() >>
   (std::numeric_limits<device_id_t>::digits - DEVICE_ID_LEN_BITS + 1));
// Reserved device ids, counted down from DEVICE_ID_MAX.
// marks record relative paddrs, see paddr_t::is_record_relative()
constexpr device_id_t DEVICE_ID_RECORD_RELATIVE = DEVICE_ID_MAX - 1;
// marks block relative paddrs, see paddr_t::is_block_relative()
constexpr device_id_t DEVICE_ID_BLOCK_RELATIVE = DEVICE_ID_MAX - 2;
// used by delayed_temp_paddr()
constexpr device_id_t DEVICE_ID_DELAYED = DEVICE_ID_MAX - 3;
// marks the null paddr, see paddr_t::is_null()
constexpr device_id_t DEVICE_ID_NULL = DEVICE_ID_MAX - 4;
// device of FAKE_SEG_ID / make_fake_paddr()
constexpr device_id_t DEVICE_ID_FAKE = DEVICE_ID_MAX - 5;
// marks the zero paddr, see paddr_t::is_zero()
constexpr device_id_t DEVICE_ID_ZERO = DEVICE_ID_MAX - 6;
// bound for ordinary device ids; segment_map_t sizes its per-device
// table with it and uses it as the end-of-iteration sentinel
constexpr device_id_t DEVICE_ID_MAX_VALID = DEVICE_ID_MAX - 7;

// largest per-device segment id: all SEGMENT_ID_LEN_BITS bits set
constexpr device_segment_id_t DEVICE_SEGMENT_ID_MAX =
  (1 << SEGMENT_ID_LEN_BITS) - 1;
69 | ||
f67539c2 | 70 | // Identifies segment location on disk, see SegmentManager, |
20effc67 TL |
71 | struct segment_id_t { |
72 | private: | |
73 | // internal segment id type of segment_id_t, basically | |
74 | // this is a unsigned int with the top "DEVICE_ID_LEN_BITS" | |
75 | // bits representing the id of the device on which the | |
76 | // segment resides | |
77 | using internal_segment_id_t = uint32_t; | |
78 | ||
79 | // mask for segment manager id | |
80 | static constexpr internal_segment_id_t SM_ID_MASK = | |
81 | 0xF << (std::numeric_limits<internal_segment_id_t>::digits - DEVICE_ID_LEN_BITS); | |
82 | // default internal segment id | |
83 | static constexpr internal_segment_id_t DEFAULT_INTERNAL_SEG_ID = | |
84 | (std::numeric_limits<internal_segment_id_t>::max() >> 1) - 1; | |
85 | ||
86 | internal_segment_id_t segment = DEFAULT_INTERNAL_SEG_ID; | |
87 | ||
88 | constexpr segment_id_t(uint32_t encoded) : segment(encoded) {} | |
89 | ||
90 | public: | |
91 | segment_id_t() = default; | |
92 | constexpr segment_id_t(device_id_t id, device_segment_id_t segment) | |
93 | : segment(make_internal(segment, id)) {} | |
94 | ||
95 | [[gnu::always_inline]] | |
96 | device_id_t device_id() const { | |
97 | return internal_to_device(segment); | |
98 | } | |
99 | ||
100 | [[gnu::always_inline]] | |
101 | constexpr device_segment_id_t device_segment_id() const { | |
102 | return internal_to_segment(segment); | |
103 | } | |
104 | ||
105 | bool operator==(const segment_id_t& other) const { | |
106 | return segment == other.segment; | |
107 | } | |
108 | bool operator!=(const segment_id_t& other) const { | |
109 | return segment != other.segment; | |
110 | } | |
111 | bool operator<(const segment_id_t& other) const { | |
112 | return segment < other.segment; | |
113 | } | |
114 | bool operator<=(const segment_id_t& other) const { | |
115 | return segment <= other.segment; | |
116 | } | |
117 | bool operator>(const segment_id_t& other) const { | |
118 | return segment > other.segment; | |
119 | } | |
120 | bool operator>=(const segment_id_t& other) const { | |
121 | return segment >= other.segment; | |
122 | } | |
123 | ||
124 | DENC(segment_id_t, v, p) { | |
125 | denc(v.segment, p); | |
126 | } | |
127 | private: | |
128 | static constexpr unsigned segment_bits = ( | |
129 | std::numeric_limits<internal_segment_id_t>::digits - DEVICE_ID_LEN_BITS | |
130 | ); | |
131 | ||
132 | static inline device_id_t internal_to_device(internal_segment_id_t id) { | |
133 | return (static_cast<device_id_t>(id) & SM_ID_MASK) >> segment_bits; | |
134 | } | |
135 | ||
136 | constexpr static inline device_segment_id_t internal_to_segment( | |
137 | internal_segment_id_t id) { | |
138 | return id & (~SM_ID_MASK); | |
139 | } | |
140 | ||
141 | constexpr static inline internal_segment_id_t make_internal( | |
142 | device_segment_id_t id, | |
143 | device_id_t sm_id) { | |
144 | return static_cast<internal_segment_id_t>(id) | | |
145 | (static_cast<internal_segment_id_t>(sm_id) << segment_bits); | |
146 | } | |
147 | ||
148 | friend struct segment_id_le_t; | |
149 | friend struct seg_paddr_t; | |
150 | friend struct paddr_t; | |
151 | friend struct paddr_le_t; | |
152 | }; | |
f67539c2 | 153 | |
20effc67 TL |
// ondisk type of segment_id_t: the packed internal id kept as a
// little-endian 32 bit value in a packed struct
struct __attribute((packed)) segment_id_le_t {
  // defaults to the same internal value as a default segment_id_t
  ceph_le32 segment = ceph_le32(segment_id_t::DEFAULT_INTERNAL_SEG_ID);

  // encode an in-memory segment_id_t
  segment_id_le_t(const segment_id_t id) :
    segment(ceph_le32(id.segment)) {}

  // decode back into an in-memory segment_id_t
  operator segment_id_t() const {
    return segment_id_t(segment);
  }
};
165 | ||
166 | constexpr segment_id_t MAX_SEG_ID = segment_id_t( | |
167 | DEVICE_ID_MAX, | |
168 | DEVICE_SEGMENT_ID_MAX | |
169 | ); | |
f67539c2 | 170 | // for tests which generate fake paddrs |
20effc67 TL |
171 | constexpr segment_id_t NULL_SEG_ID = segment_id_t(DEVICE_ID_NULL, 0); |
172 | constexpr segment_id_t FAKE_SEG_ID = segment_id_t(DEVICE_ID_FAKE, 0); | |
173 | ||
174 | std::ostream &operator<<(std::ostream &out, const segment_id_t&); | |
175 | ||
f67539c2 TL |
176 | |
177 | std::ostream &segment_to_stream(std::ostream &, const segment_id_t &t); | |
178 | ||
179 | // Offset within a segment on disk, see SegmentManager | |
180 | // may be negative for relative offsets | |
181 | using segment_off_t = int32_t; | |
182 | constexpr segment_off_t NULL_SEG_OFF = | |
20effc67 TL |
183 | std::numeric_limits<segment_off_t>::max(); |
184 | constexpr segment_off_t MAX_SEG_OFF = | |
185 | std::numeric_limits<segment_off_t>::max(); | |
f67539c2 TL |
186 | |
187 | std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t); | |
188 | ||
189 | /* Monotonically increasing segment seq, uniquely identifies | |
190 | * the incarnation of a segment */ | |
191 | using segment_seq_t = uint32_t; | |
192 | static constexpr segment_seq_t NULL_SEG_SEQ = | |
193 | std::numeric_limits<segment_seq_t>::max(); | |
20effc67 TL |
194 | static constexpr segment_seq_t MAX_SEG_SEQ = |
195 | std::numeric_limits<segment_seq_t>::max(); | |
f67539c2 TL |
196 | |
197 | // Offset of delta within a record | |
198 | using record_delta_idx_t = uint32_t; | |
199 | constexpr record_delta_idx_t NULL_DELTA_IDX = | |
200 | std::numeric_limits<record_delta_idx_t>::max(); | |
201 | ||
20effc67 TL |
202 | /** |
203 | * segment_map_t | |
204 | * | |
205 | * Compact templated mapping from a segment_id_t to a value type. | |
206 | */ | |
207 | template <typename T> | |
208 | class segment_map_t { | |
209 | public: | |
210 | segment_map_t() { | |
211 | // initializes top vector with 0 length vectors to indicate that they | |
212 | // are not yet present | |
213 | device_to_segments.resize(DEVICE_ID_MAX_VALID); | |
214 | } | |
215 | void add_device(device_id_t device, size_t segments, const T& init) { | |
216 | assert(device <= DEVICE_ID_MAX_VALID); | |
217 | assert(device_to_segments[device].size() == 0); | |
218 | device_to_segments[device].resize(segments, init); | |
219 | total_segments += segments; | |
220 | } | |
221 | void clear() { | |
222 | device_to_segments.clear(); | |
223 | device_to_segments.resize(DEVICE_ID_MAX_VALID); | |
224 | total_segments = 0; | |
225 | } | |
226 | ||
227 | T& operator[](segment_id_t id) { | |
228 | assert(id.device_segment_id() < device_to_segments[id.device_id()].size()); | |
229 | return device_to_segments[id.device_id()][id.device_segment_id()]; | |
230 | } | |
231 | const T& operator[](segment_id_t id) const { | |
232 | assert(id.device_segment_id() < device_to_segments[id.device_id()].size()); | |
233 | return device_to_segments[id.device_id()][id.device_segment_id()]; | |
234 | } | |
235 | ||
236 | bool contains(segment_id_t id) { | |
237 | bool b = id.device_id() < device_to_segments.size(); | |
238 | if (!b) { | |
239 | return b; | |
240 | } | |
241 | b = id.device_segment_id() < device_to_segments[id.device_id()].size(); | |
242 | return b; | |
243 | } | |
244 | ||
245 | auto begin() { | |
246 | return iterator<false>::lower_bound(*this, 0, 0); | |
247 | } | |
248 | auto begin() const { | |
249 | return iterator<true>::lower_bound(*this, 0, 0); | |
250 | } | |
251 | ||
252 | auto end() { | |
253 | return iterator<false>::end_iterator(*this); | |
254 | } | |
255 | auto end() const { | |
256 | return iterator<true>::end_iterator(*this); | |
257 | } | |
258 | ||
259 | auto device_begin(device_id_t id) { | |
260 | auto ret = iterator<false>::lower_bound(*this, id, 0); | |
261 | assert(ret->first.device_id() == id); | |
262 | return ret; | |
263 | } | |
264 | auto device_end(device_id_t id) { | |
265 | return iterator<false>::lower_bound(*this, id + 1, 0); | |
266 | } | |
267 | ||
268 | size_t size() const { | |
269 | return total_segments; | |
270 | } | |
271 | ||
272 | private: | |
273 | template <bool is_const = false> | |
274 | class iterator { | |
275 | /// points at set being iterated over | |
276 | std::conditional_t< | |
277 | is_const, | |
278 | const segment_map_t &, | |
279 | segment_map_t &> parent; | |
280 | ||
281 | /// points at current device, or DEVICE_ID_MAX_VALID if is_end() | |
282 | device_id_t device_id; | |
283 | ||
284 | /// segment at which we are pointing, 0 if is_end() | |
285 | device_segment_id_t device_segment_id; | |
286 | ||
287 | /// holds referent for operator* and operator-> when !is_end() | |
288 | std::optional< | |
289 | std::pair< | |
290 | const segment_id_t, | |
291 | std::conditional_t<is_const, const T&, T&> | |
292 | >> current; | |
293 | ||
294 | bool is_end() const { | |
295 | return device_id == DEVICE_ID_MAX_VALID; | |
296 | } | |
297 | ||
298 | void find_valid() { | |
299 | assert(!is_end()); | |
300 | auto &device_vec = parent.device_to_segments[device_id]; | |
301 | if (device_vec.size() == 0 || | |
302 | device_segment_id == device_vec.size()) { | |
303 | while (++device_id < DEVICE_ID_MAX_VALID && | |
304 | parent.device_to_segments[device_id].size() == 0); | |
305 | device_segment_id = 0; | |
306 | } | |
307 | if (is_end()) { | |
308 | current = std::nullopt; | |
309 | } else { | |
310 | current.emplace( | |
311 | segment_id_t{device_id, device_segment_id}, | |
312 | parent.device_to_segments[device_id][device_segment_id] | |
313 | ); | |
314 | } | |
315 | } | |
316 | ||
317 | iterator( | |
318 | decltype(parent) &parent, | |
319 | device_id_t device_id, | |
320 | device_segment_id_t device_segment_id) | |
321 | : parent(parent), device_id(device_id), | |
322 | device_segment_id(device_segment_id) {} | |
323 | ||
324 | public: | |
325 | static iterator lower_bound( | |
326 | decltype(parent) &parent, | |
327 | device_id_t device_id, | |
328 | device_segment_id_t device_segment_id) { | |
329 | if (device_id == DEVICE_ID_MAX_VALID) { | |
330 | return end_iterator(parent); | |
331 | } else { | |
332 | auto ret = iterator{parent, device_id, device_segment_id}; | |
333 | ret.find_valid(); | |
334 | return ret; | |
335 | } | |
336 | } | |
337 | ||
338 | static iterator end_iterator( | |
339 | decltype(parent) &parent) { | |
340 | return iterator{parent, DEVICE_ID_MAX_VALID, 0}; | |
341 | } | |
342 | ||
343 | iterator<is_const>& operator++() { | |
344 | assert(!is_end()); | |
345 | ++device_segment_id; | |
346 | find_valid(); | |
347 | return *this; | |
348 | } | |
349 | ||
350 | bool operator==(iterator<is_const> rit) { | |
351 | return (device_id == rit.device_id && | |
352 | device_segment_id == rit.device_segment_id); | |
353 | } | |
354 | ||
355 | bool operator!=(iterator<is_const> rit) { | |
356 | return !(*this == rit); | |
357 | } | |
358 | ||
359 | template <bool c = is_const, std::enable_if_t<c, int> = 0> | |
360 | const std::pair<const segment_id_t, const T&> *operator->() { | |
361 | assert(!is_end()); | |
362 | return &*current; | |
363 | } | |
364 | template <bool c = is_const, std::enable_if_t<!c, int> = 0> | |
365 | std::pair<const segment_id_t, T&> *operator->() { | |
366 | assert(!is_end()); | |
367 | return &*current; | |
368 | } | |
369 | template <bool c = is_const, std::enable_if_t<c, int> = 0> | |
370 | const std::pair<const segment_id_t, const T&> &operator*() { | |
371 | assert(!is_end()); | |
372 | return *current; | |
373 | } | |
374 | template <bool c = is_const, std::enable_if_t<!c, int> = 0> | |
375 | std::pair<const segment_id_t, T&> &operator*() { | |
376 | assert(!is_end()); | |
377 | return *current; | |
378 | } | |
379 | }; | |
380 | ||
381 | /** | |
382 | * device_to_segments | |
383 | * | |
384 | * device -> segment -> T mapping. device_to_segments[d].size() > 0 iff | |
385 | * device <d> has been added. | |
386 | */ | |
387 | std::vector<std::vector<T>> device_to_segments; | |
388 | ||
389 | /// total number of added segments | |
390 | size_t total_segments = 0; | |
391 | }; | |
392 | ||
f67539c2 TL |
393 | /** |
394 | * paddr_t | |
395 | * | |
396 | * <segment, offset> offset on disk, see SegmentManager | |
397 | * | |
398 | * May be absolute, record_relative, or block_relative. | |
399 | * | |
400 | * Blocks get read independently of the surrounding record, | |
401 | * so paddrs embedded directly within a block need to refer | |
402 | * to other blocks within the same record by a block_relative | |
403 | * addr relative to the block's own offset. By contrast, | |
404 | * deltas to existing blocks need to use record_relative | |
405 | * addrs relative to the first block of the record. | |
406 | * | |
407 | * Fresh extents during a transaction are referred to by | |
408 | * record_relative paddrs. | |
409 | */ | |
20effc67 TL |
// number of paddr_t bits below the device id field
constexpr uint16_t DEV_ADDR_LEN_BITS = 64 - DEVICE_ID_LEN_BITS;
// number of low paddr_t bits holding the offset of a segment address
static constexpr uint16_t SEG_OFF_LEN_BITS = 32;
// address type of a paddr_t; paddr_t keeps it in the top bit of its
// 64 bit value (see paddr_t::get_addr_type)
enum class addr_types_t : uint8_t {
  SEGMENT = 0,
  RANDOM_BLOCK = 1
};
416 | struct seg_paddr_t; | |
f67539c2 | 417 | struct paddr_t { |
20effc67 TL |
418 | protected: |
419 | using common_addr_t = uint64_t; | |
420 | common_addr_t dev_addr; | |
421 | private: | |
422 | constexpr paddr_t(segment_id_t seg, segment_off_t offset) | |
423 | : dev_addr((static_cast<common_addr_t>(seg.segment) | |
424 | << SEG_OFF_LEN_BITS) | static_cast<uint32_t>(offset)) {} | |
425 | constexpr paddr_t(common_addr_t val) : dev_addr(val) {} | |
426 | public: | |
427 | static constexpr paddr_t make_seg_paddr( | |
428 | segment_id_t seg, segment_off_t offset) { | |
429 | return paddr_t(seg, offset); | |
430 | } | |
431 | static constexpr paddr_t make_seg_paddr( | |
432 | device_id_t device, | |
433 | device_segment_id_t seg, | |
434 | segment_off_t offset) { | |
435 | return paddr_t(segment_id_t(device, seg), offset); | |
436 | } | |
437 | constexpr paddr_t() : paddr_t(NULL_SEG_ID, 0) {} | |
f67539c2 | 438 | |
20effc67 TL |
439 | // use 1bit in device_id_t for address type |
440 | void set_device_id(device_id_t id, addr_types_t type = addr_types_t::SEGMENT) { | |
441 | dev_addr &= static_cast<common_addr_t>( | |
442 | std::numeric_limits<device_segment_id_t>::max()); | |
443 | dev_addr |= static_cast<common_addr_t>(id & 0x8) << DEV_ADDR_LEN_BITS; | |
444 | dev_addr |= static_cast<common_addr_t>(type) | |
445 | << (std::numeric_limits<common_addr_t>::digits - 1); | |
f67539c2 TL |
446 | } |
447 | ||
20effc67 TL |
448 | device_id_t get_device_id() const { |
449 | return static_cast<device_id_t>(dev_addr >> DEV_ADDR_LEN_BITS); | |
450 | } | |
451 | addr_types_t get_addr_type() const { | |
452 | return (addr_types_t)((dev_addr | |
453 | >> (std::numeric_limits<common_addr_t>::digits - 1)) & 1); | |
f67539c2 TL |
454 | } |
455 | ||
20effc67 TL |
456 | paddr_t add_offset(int32_t o) const; |
457 | paddr_t add_relative(paddr_t o) const; | |
458 | paddr_t add_block_relative(paddr_t o) const; | |
459 | paddr_t add_record_relative(paddr_t o) const; | |
460 | paddr_t maybe_relative_to(paddr_t base) const; | |
461 | ||
462 | seg_paddr_t& as_seg_paddr(); | |
463 | const seg_paddr_t& as_seg_paddr() const; | |
464 | ||
465 | paddr_t operator-(paddr_t rhs) const; | |
466 | ||
f67539c2 | 467 | bool is_block_relative() const { |
20effc67 TL |
468 | return get_device_id() == DEVICE_ID_BLOCK_RELATIVE; |
469 | } | |
470 | bool is_record_relative() const { | |
471 | return get_device_id() == DEVICE_ID_RECORD_RELATIVE; | |
472 | } | |
473 | bool is_relative() const { | |
474 | return is_block_relative() || is_record_relative(); | |
475 | } | |
476 | /// Denotes special null addr | |
477 | bool is_null() const { | |
478 | return get_device_id() == DEVICE_ID_NULL; | |
479 | } | |
480 | /// Denotes special zero addr | |
481 | bool is_zero() const { | |
482 | return get_device_id() == DEVICE_ID_ZERO; | |
483 | } | |
484 | ||
485 | /** | |
486 | * is_real | |
487 | * | |
488 | * indicates whether addr reflects a physical location, absolute | |
489 | * or relative. FAKE segments also count as real so as to reflect | |
490 | * the way in which unit tests use them. | |
491 | */ | |
492 | bool is_real() const { | |
493 | return !is_zero() && !is_null(); | |
494 | } | |
495 | ||
496 | DENC(paddr_t, v, p) { | |
497 | DENC_START(1, 1, p); | |
498 | denc(v.dev_addr, p); | |
499 | DENC_FINISH(p); | |
500 | } | |
501 | friend struct paddr_le_t; | |
502 | friend struct seg_paddr_t; | |
503 | ||
504 | friend bool operator==(const paddr_t &, const paddr_t&); | |
505 | friend bool operator!=(const paddr_t &, const paddr_t&); | |
506 | friend bool operator<=(const paddr_t &, const paddr_t&); | |
507 | friend bool operator<(const paddr_t &, const paddr_t&); | |
508 | friend bool operator>=(const paddr_t &, const paddr_t&); | |
509 | friend bool operator>(const paddr_t &, const paddr_t&); | |
510 | }; | |
511 | WRITE_EQ_OPERATORS_1(paddr_t, dev_addr); | |
512 | WRITE_CMP_OPERATORS_1(paddr_t, dev_addr); | |
513 | ||
514 | struct seg_paddr_t : public paddr_t { | |
515 | static constexpr uint64_t SEG_OFF_MASK = std::numeric_limits<uint32_t>::max(); | |
516 | // mask for segment manager id | |
517 | static constexpr uint64_t SEG_ID_MASK = | |
518 | static_cast<common_addr_t>(0xFFFFFFFF) << SEG_OFF_LEN_BITS; | |
519 | ||
520 | seg_paddr_t(const seg_paddr_t&) = delete; | |
521 | seg_paddr_t(seg_paddr_t&) = delete; | |
522 | seg_paddr_t& operator=(const seg_paddr_t&) = delete; | |
523 | seg_paddr_t& operator=(seg_paddr_t&) = delete; | |
524 | segment_id_t get_segment_id() const { | |
525 | return segment_id_t((dev_addr & SEG_ID_MASK) >> SEG_OFF_LEN_BITS); | |
526 | } | |
527 | segment_off_t get_segment_off() const { | |
528 | return segment_off_t(dev_addr & SEG_OFF_MASK); | |
529 | } | |
530 | void set_segment_id(const segment_id_t id) { | |
531 | dev_addr &= static_cast<common_addr_t>( | |
532 | std::numeric_limits<device_segment_id_t>::max()); | |
533 | dev_addr |= static_cast<common_addr_t>(id.segment) << SEG_OFF_LEN_BITS; | |
534 | } | |
535 | void set_segment_off(const segment_off_t off) { | |
536 | dev_addr &= static_cast<common_addr_t>( | |
537 | std::numeric_limits<device_segment_id_t>::max()) << SEG_OFF_LEN_BITS; | |
538 | dev_addr |= (uint32_t)off; | |
f67539c2 TL |
539 | } |
540 | ||
541 | paddr_t add_offset(segment_off_t o) const { | |
20effc67 | 542 | return paddr_t::make_seg_paddr(get_segment_id(), get_segment_off() + o); |
f67539c2 TL |
543 | } |
544 | ||
545 | paddr_t add_relative(paddr_t o) const { | |
546 | assert(o.is_relative()); | |
20effc67 TL |
547 | seg_paddr_t& s = o.as_seg_paddr(); |
548 | return paddr_t::make_seg_paddr(get_segment_id(), | |
549 | get_segment_off() + s.get_segment_off()); | |
f67539c2 TL |
550 | } |
551 | ||
552 | paddr_t add_block_relative(paddr_t o) const { | |
553 | // special version mainly for documentation purposes | |
554 | assert(o.is_block_relative()); | |
555 | return add_relative(o); | |
556 | } | |
557 | ||
558 | paddr_t add_record_relative(paddr_t o) const { | |
559 | // special version mainly for documentation purposes | |
560 | assert(o.is_record_relative()); | |
561 | return add_relative(o); | |
562 | } | |
563 | ||
564 | /** | |
565 | * paddr_t::operator- | |
566 | * | |
567 | * Only defined for record_relative paddr_ts. Yields a | |
568 | * block_relative address. | |
569 | */ | |
570 | paddr_t operator-(paddr_t rhs) const { | |
20effc67 | 571 | seg_paddr_t& r = rhs.as_seg_paddr(); |
f67539c2 | 572 | assert(rhs.is_relative() && is_relative()); |
20effc67 TL |
573 | assert(r.get_segment_id() == get_segment_id()); |
574 | return paddr_t::make_seg_paddr( | |
575 | segment_id_t{DEVICE_ID_BLOCK_RELATIVE, 0}, | |
576 | get_segment_off() - r.get_segment_off() | |
577 | ); | |
f67539c2 TL |
578 | } |
579 | ||
580 | /** | |
581 | * maybe_relative_to | |
582 | * | |
583 | * Helper for the case where an in-memory paddr_t may be | |
584 | * either block_relative or absolute (not record_relative). | |
585 | * | |
586 | * base must be either absolute or record_relative. | |
587 | */ | |
588 | paddr_t maybe_relative_to(paddr_t base) const { | |
589 | assert(!base.is_block_relative()); | |
20effc67 | 590 | seg_paddr_t& s = base.as_seg_paddr(); |
f67539c2 | 591 | if (is_block_relative()) |
20effc67 | 592 | return s.add_block_relative(*this); |
f67539c2 TL |
593 | else |
594 | return *this; | |
595 | } | |
f67539c2 | 596 | }; |
f67539c2 | 597 | constexpr paddr_t P_ADDR_NULL = paddr_t{}; |
20effc67 TL |
598 | constexpr paddr_t P_ADDR_MIN = paddr_t::make_seg_paddr(segment_id_t(0, 0), 0); |
599 | constexpr paddr_t P_ADDR_MAX = paddr_t::make_seg_paddr( | |
600 | segment_id_t(DEVICE_ID_MAX, DEVICE_SEGMENT_ID_MAX), | |
601 | std::numeric_limits<segment_off_t>::max()); | |
602 | constexpr paddr_t P_ADDR_ZERO = paddr_t::make_seg_paddr( | |
603 | DEVICE_ID_ZERO, 0, 0); | |
604 | ||
f67539c2 | 605 | constexpr paddr_t make_record_relative_paddr(segment_off_t off) { |
20effc67 TL |
606 | return paddr_t::make_seg_paddr( |
607 | segment_id_t{DEVICE_ID_RECORD_RELATIVE, 0}, | |
608 | off); | |
f67539c2 TL |
609 | } |
610 | constexpr paddr_t make_block_relative_paddr(segment_off_t off) { | |
20effc67 TL |
611 | return paddr_t::make_seg_paddr( |
612 | segment_id_t{DEVICE_ID_BLOCK_RELATIVE, 0}, | |
613 | off); | |
f67539c2 TL |
614 | } |
615 | constexpr paddr_t make_fake_paddr(segment_off_t off) { | |
20effc67 TL |
616 | return paddr_t::make_seg_paddr(FAKE_SEG_ID, off); |
617 | } | |
618 | constexpr paddr_t delayed_temp_paddr(segment_off_t off) { | |
619 | return paddr_t::make_seg_paddr( | |
620 | segment_id_t{DEVICE_ID_DELAYED, 0}, | |
621 | off); | |
f67539c2 TL |
622 | } |
623 | ||
20effc67 TL |
// ondisk type of paddr_t: the packed 64 bit address kept as a
// little-endian value in a packed struct
struct __attribute((packed)) paddr_le_t {
  // defaults to the encoding of P_ADDR_NULL
  ceph_le64 dev_addr =
    ceph_le64(P_ADDR_NULL.dev_addr);

  paddr_le_t() = default;
  // encode an in-memory paddr_t
  paddr_le_t(const paddr_t &addr) : dev_addr(ceph_le64(addr.dev_addr)) {}

  // decode back into an in-memory paddr_t
  operator paddr_t() const {
    return paddr_t{dev_addr};
  }
};
635 | ||
636 | std::ostream &operator<<(std::ostream &out, const paddr_t &rhs); | |
637 | ||
638 | using objaddr_t = uint32_t; | |
20effc67 TL |
639 | constexpr objaddr_t OBJ_ADDR_MAX = std::numeric_limits<objaddr_t>::max(); |
640 | constexpr objaddr_t OBJ_ADDR_NULL = OBJ_ADDR_MAX - 1; | |
641 | ||
// Placement hint for an extent. NUM_HINTS is not a hint itself; it is
// the number of enumerators that precede it.
enum class placement_hint_t {
  HOT = 0,   // Most of the metadata
  COLD,      // Object data
  REWRITE,   // Cold metadata and data (probably need further splits)
  NUM_HINTS  // Constant for number of hints
};
648 | ||
// Kind of backing device. NUM_TYPES is not a device type itself; it is
// the number of enumerators that precede it (NONE included).
enum class device_type_t {
  NONE = 0,
  SEGMENTED, // e.g. Hard_Disk, SATA_SSD, NAND_NVME
  RANDOM_BLOCK, // e.g. RANDOM_BD
  PMEM, // e.g. NVDIMM, PMEM
  NUM_TYPES
};
656 | ||
657 | std::ostream& operator<<(std::ostream& out, device_type_t t); | |
658 | ||
659 | bool can_delay_allocation(device_type_t type); | |
660 | device_type_t string_to_device_type(std::string type); | |
f67539c2 TL |
661 | |
662 | /* Monotonically increasing identifier for the location of a | |
663 | * journal_record. | |
664 | */ | |
struct journal_seq_t {
  // sequence number of the segment holding the record
  segment_seq_t segment_seq = 0;
  // address of the record; default constructed paddr_t is the null addr
  paddr_t offset;

  // same segment_seq, with <o> added to the address
  journal_seq_t add_offset(segment_off_t o) const {
    return {segment_seq, offset.add_offset(o)};
  }

  // encodes segment_seq followed by offset
  DENC(journal_seq_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.segment_seq, p);
    denc(v.offset, p);
    DENC_FINISH(p);
  }
};
680 | WRITE_CMP_OPERATORS_2(journal_seq_t, segment_seq, offset) | |
681 | WRITE_EQ_OPERATORS_2(journal_seq_t, segment_seq, offset) | |
20effc67 TL |
682 | constexpr journal_seq_t JOURNAL_SEQ_MIN{ |
683 | 0, | |
684 | paddr_t::make_seg_paddr(NULL_SEG_ID, 0) | |
685 | }; | |
686 | constexpr journal_seq_t JOURNAL_SEQ_MAX{ | |
687 | MAX_SEG_SEQ, | |
688 | P_ADDR_MAX | |
689 | }; | |
f67539c2 TL |
690 | |
691 | std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq); | |
692 | ||
693 | static constexpr journal_seq_t NO_DELTAS = journal_seq_t{ | |
694 | NULL_SEG_SEQ, | |
695 | P_ADDR_NULL | |
696 | }; | |
697 | ||
698 | // logical addr, see LBAManager, TransactionManager | |
699 | using laddr_t = uint64_t; | |
700 | constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min(); | |
701 | constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max(); | |
702 | constexpr laddr_t L_ADDR_NULL = std::numeric_limits<laddr_t>::max(); | |
703 | constexpr laddr_t L_ADDR_ROOT = std::numeric_limits<laddr_t>::max() - 1; | |
704 | constexpr laddr_t L_ADDR_LBAT = std::numeric_limits<laddr_t>::max() - 2; | |
705 | ||
20effc67 TL |
706 | struct __attribute((packed)) laddr_le_t { |
707 | ceph_le64 laddr = ceph_le64(L_ADDR_NULL); | |
f67539c2 TL |
708 | |
709 | laddr_le_t() = default; | |
710 | laddr_le_t(const laddr_le_t &) = default; | |
711 | explicit laddr_le_t(const laddr_t &addr) | |
20effc67 | 712 | : laddr(ceph_le64(addr)) {} |
f67539c2 TL |
713 | |
714 | operator laddr_t() const { | |
715 | return laddr_t(laddr); | |
716 | } | |
717 | laddr_le_t& operator=(laddr_t addr) { | |
718 | ceph_le64 val; | |
719 | val = addr; | |
720 | laddr = val; | |
721 | return *this; | |
722 | } | |
723 | }; | |
724 | ||
725 | // logical offset, see LBAManager, TransactionManager | |
726 | using extent_len_t = uint32_t; | |
727 | constexpr extent_len_t EXTENT_LEN_MAX = | |
728 | std::numeric_limits<extent_len_t>::max(); | |
729 | ||
730 | using extent_len_le_t = ceph_le32; | |
20effc67 TL |
731 | inline extent_len_le_t init_extent_len_le(extent_len_t len) { |
732 | return ceph_le32(len); | |
f67539c2 TL |
733 | } |
734 | ||
// List of <logical address, length> pairs. A distinct type (rather than
// an alias of std::list) so that it can have its own operator<<.
struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> {
  // forwards any constructor arguments to the underlying std::list
  template <typename... T>
  laddr_list_t(T&&... args)
    : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {}
};
// List of <physical address, length> pairs. A distinct type (rather than
// an alias of std::list) so that it can have its own operator<<.
struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> {
  // forwards any constructor arguments to the underlying std::list
  template <typename... T>
  paddr_list_t(T&&... args)
    : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {}
};
745 | ||
746 | std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs); | |
747 | std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs); | |
748 | ||
749 | /* identifies type of extent, used for interpreting deltas, managing | |
750 | * writeback. | |
751 | * | |
752 | * Note that any new extent type needs to be added to | |
753 | * Cache::get_extent_by_type in cache.cc | |
754 | */ | |
// Values are spelled out explicitly; the type is encoded as part of
// delta_info_t (denc(v.type, p)).
// NOTE(review): renumbering presumably breaks previously written
// deltas -- confirm before changing any value.
enum class extent_types_t : uint8_t {
  ROOT = 0,
  LADDR_INTERNAL = 1,
  LADDR_LEAF = 2,
  OMAP_INNER = 3,
  OMAP_LEAF = 4,
  ONODE_BLOCK_STAGED = 5,
  COLL_BLOCK = 6,
  OBJECT_DATA_BLOCK = 7,
  RETIRED_PLACEHOLDER = 8,
  RBM_ALLOC_INFO = 9,
  // Test Block Types
  TEST_BLOCK = 10,
  TEST_BLOCK_PHYSICAL = 11,
  // None and the number of valid extent_types_t
  NONE = 12,
};
// numeric value of NONE, i.e. the count of enumerators preceding it
constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE);
f67539c2 | 773 | |
20effc67 | 774 | constexpr bool is_logical_type(extent_types_t type) { |
f67539c2 TL |
775 | switch (type) { |
776 | case extent_types_t::ROOT: | |
777 | case extent_types_t::LADDR_INTERNAL: | |
778 | case extent_types_t::LADDR_LEAF: | |
779 | return false; | |
780 | default: | |
781 | return true; | |
782 | } | |
783 | } | |
784 | ||
20effc67 TL |
785 | constexpr bool is_lba_node(extent_types_t type) |
786 | { | |
787 | return type == extent_types_t::LADDR_INTERNAL || | |
788 | type == extent_types_t::LADDR_LEAF; | |
789 | } | |
790 | ||
f67539c2 TL |
791 | std::ostream &operator<<(std::ostream &out, extent_types_t t); |
792 | ||
/* description of a new physical extent */
struct extent_t {
  extent_types_t type;  ///< type of extent
  laddr_t addr;         ///< laddr of extent (L_ADDR_NULL for non-logical)
  // NOTE(review): "length" below is not a member of this struct; it
  // presumably refers to the extent's length -- confirm.
  ceph::bufferlist bl;  ///< payload, bl.length() == length, aligned
};
799 | ||
800 | using extent_version_t = uint32_t; | |
801 | constexpr extent_version_t EXTENT_VERSION_NULL = 0; | |
802 | ||
803 | /* description of a mutation to a physical extent */ | |
804 | struct delta_info_t { | |
805 | extent_types_t type = extent_types_t::NONE; ///< delta type | |
806 | paddr_t paddr; ///< physical address | |
807 | laddr_t laddr = L_ADDR_NULL; ///< logical address | |
808 | uint32_t prev_crc = 0; | |
809 | uint32_t final_crc = 0; | |
810 | segment_off_t length = NULL_SEG_OFF; ///< extent length | |
811 | extent_version_t pversion; ///< prior version | |
812 | ceph::bufferlist bl; ///< payload | |
813 | ||
814 | DENC(delta_info_t, v, p) { | |
815 | DENC_START(1, 1, p); | |
816 | denc(v.type, p); | |
817 | denc(v.paddr, p); | |
818 | denc(v.laddr, p); | |
819 | denc(v.prev_crc, p); | |
820 | denc(v.final_crc, p); | |
821 | denc(v.length, p); | |
822 | denc(v.pversion, p); | |
823 | denc(v.bl, p); | |
824 | DENC_FINISH(p); | |
825 | } | |
826 | ||
827 | bool operator==(const delta_info_t &rhs) const { | |
828 | return ( | |
829 | type == rhs.type && | |
830 | paddr == rhs.paddr && | |
831 | laddr == rhs.laddr && | |
832 | prev_crc == rhs.prev_crc && | |
833 | final_crc == rhs.final_crc && | |
834 | length == rhs.length && | |
835 | pversion == rhs.pversion && | |
836 | bl == rhs.bl | |
837 | ); | |
838 | } | |
839 | ||
840 | friend std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs); | |
841 | }; | |
842 | ||
843 | std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs); | |
844 | ||
20effc67 TL |
845 | class object_data_t { |
846 | laddr_t reserved_data_base = L_ADDR_NULL; | |
847 | extent_len_t reserved_data_len = 0; | |
848 | ||
849 | bool dirty = false; | |
850 | public: | |
851 | object_data_t( | |
852 | laddr_t reserved_data_base, | |
853 | extent_len_t reserved_data_len) | |
854 | : reserved_data_base(reserved_data_base), | |
855 | reserved_data_len(reserved_data_len) {} | |
856 | ||
857 | laddr_t get_reserved_data_base() const { | |
858 | return reserved_data_base; | |
859 | } | |
860 | ||
861 | extent_len_t get_reserved_data_len() const { | |
862 | return reserved_data_len; | |
863 | } | |
864 | ||
865 | bool is_null() const { | |
866 | return reserved_data_base == L_ADDR_NULL; | |
867 | } | |
868 | ||
869 | bool must_update() const { | |
870 | return dirty; | |
871 | } | |
872 | ||
873 | void update_reserved( | |
874 | laddr_t base, | |
875 | extent_len_t len) { | |
876 | dirty = true; | |
877 | reserved_data_base = base; | |
878 | reserved_data_len = len; | |
879 | } | |
880 | ||
881 | void update_len( | |
882 | extent_len_t len) { | |
883 | dirty = true; | |
884 | reserved_data_len = len; | |
885 | } | |
886 | ||
887 | void clear() { | |
888 | dirty = true; | |
889 | reserved_data_base = L_ADDR_NULL; | |
890 | reserved_data_len = 0; | |
891 | } | |
892 | }; | |
893 | ||
894 | struct __attribute__((packed)) object_data_le_t { | |
895 | laddr_le_t reserved_data_base = laddr_le_t(L_ADDR_NULL); | |
896 | extent_len_le_t reserved_data_len = init_extent_len_le(0); | |
897 | ||
898 | void update(const object_data_t &nroot) { | |
899 | reserved_data_base = nroot.get_reserved_data_base(); | |
900 | reserved_data_len = init_extent_len_le(nroot.get_reserved_data_len()); | |
901 | } | |
902 | ||
903 | object_data_t get() const { | |
904 | return object_data_t( | |
905 | reserved_data_base, | |
906 | reserved_data_len); | |
907 | } | |
908 | }; | |
909 | ||
910 | struct omap_root_t { | |
911 | laddr_t addr = L_ADDR_NULL; | |
912 | depth_t depth = 0; | |
913 | laddr_t hint = L_ADDR_MIN; | |
914 | bool mutated = false; | |
915 | ||
916 | omap_root_t() = default; | |
917 | omap_root_t(laddr_t addr, depth_t depth, laddr_t addr_min) | |
918 | : addr(addr), | |
919 | depth(depth), | |
920 | hint(addr_min) {} | |
921 | ||
922 | omap_root_t(const omap_root_t &o) = default; | |
923 | omap_root_t(omap_root_t &&o) = default; | |
924 | omap_root_t &operator=(const omap_root_t &o) = default; | |
925 | omap_root_t &operator=(omap_root_t &&o) = default; | |
926 | ||
927 | bool is_null() const { | |
928 | return addr == L_ADDR_NULL; | |
929 | } | |
930 | ||
931 | bool must_update() const { | |
932 | return mutated; | |
933 | } | |
934 | ||
935 | void update(laddr_t _addr, depth_t _depth, laddr_t _hint) { | |
936 | mutated = true; | |
937 | addr = _addr; | |
938 | depth = _depth; | |
939 | hint = _hint; | |
940 | } | |
941 | ||
942 | laddr_t get_location() const { | |
943 | return addr; | |
944 | } | |
945 | ||
946 | depth_t get_depth() const { | |
947 | return depth; | |
948 | } | |
949 | ||
950 | laddr_t get_hint() const { | |
951 | return hint; | |
952 | } | |
953 | }; | |
954 | ||
955 | class __attribute__((packed)) omap_root_le_t { | |
956 | laddr_le_t addr = laddr_le_t(L_ADDR_NULL); | |
957 | depth_le_t depth = init_depth_le(0); | |
958 | ||
959 | public: | |
960 | omap_root_le_t() = default; | |
961 | ||
962 | omap_root_le_t(laddr_t addr, depth_t depth) | |
963 | : addr(addr), depth(init_depth_le(depth)) {} | |
964 | ||
965 | omap_root_le_t(const omap_root_le_t &o) = default; | |
966 | omap_root_le_t(omap_root_le_t &&o) = default; | |
967 | omap_root_le_t &operator=(const omap_root_le_t &o) = default; | |
968 | omap_root_le_t &operator=(omap_root_le_t &&o) = default; | |
969 | ||
970 | void update(const omap_root_t &nroot) { | |
971 | addr = nroot.get_location(); | |
972 | depth = init_depth_le(nroot.get_depth()); | |
973 | } | |
974 | ||
975 | omap_root_t get(laddr_t hint) const { | |
976 | return omap_root_t(addr, depth, hint); | |
977 | } | |
978 | }; | |
979 | ||
980 | /** | |
981 | * lba_root_t | |
982 | */ | |
983 | class __attribute__((packed)) lba_root_t { | |
984 | paddr_le_t root_addr; | |
985 | depth_le_t depth = init_extent_len_le(0); | |
986 | ||
987 | public: | |
988 | lba_root_t() = default; | |
989 | ||
990 | lba_root_t(paddr_t addr, depth_t depth) | |
991 | : root_addr(addr), depth(init_depth_le(depth)) {} | |
992 | ||
993 | lba_root_t(const lba_root_t &o) = default; | |
994 | lba_root_t(lba_root_t &&o) = default; | |
995 | lba_root_t &operator=(const lba_root_t &o) = default; | |
996 | lba_root_t &operator=(lba_root_t &&o) = default; | |
997 | ||
998 | paddr_t get_location() const { | |
999 | return root_addr; | |
1000 | } | |
1001 | ||
1002 | void set_location(paddr_t location) { | |
1003 | root_addr = location; | |
1004 | } | |
1005 | ||
1006 | depth_t get_depth() const { | |
1007 | return depth; | |
1008 | } | |
1009 | ||
1010 | void set_depth(depth_t ndepth) { | |
1011 | depth = ndepth; | |
1012 | } | |
1013 | ||
1014 | void adjust_addrs_from_base(paddr_t base) { | |
1015 | paddr_t _root_addr = root_addr; | |
1016 | if (_root_addr.is_relative()) { | |
1017 | root_addr = base.add_record_relative(_root_addr); | |
1018 | } | |
1019 | } | |
1020 | }; | |
1021 | ||
1022 | class coll_root_t { | |
1023 | laddr_t addr = L_ADDR_NULL; | |
1024 | extent_len_t size = 0; | |
1025 | ||
1026 | bool mutated = false; | |
1027 | ||
1028 | public: | |
1029 | coll_root_t() = default; | |
1030 | coll_root_t(laddr_t addr, extent_len_t size) : addr(addr), size(size) {} | |
1031 | ||
1032 | coll_root_t(const coll_root_t &o) = default; | |
1033 | coll_root_t(coll_root_t &&o) = default; | |
1034 | coll_root_t &operator=(const coll_root_t &o) = default; | |
1035 | coll_root_t &operator=(coll_root_t &&o) = default; | |
1036 | ||
1037 | bool must_update() const { | |
1038 | return mutated; | |
1039 | } | |
1040 | ||
1041 | void update(laddr_t _addr, extent_len_t _s) { | |
1042 | mutated = true; | |
1043 | addr = _addr; | |
1044 | size = _s; | |
1045 | } | |
1046 | ||
1047 | laddr_t get_location() const { | |
1048 | return addr; | |
1049 | } | |
1050 | ||
1051 | extent_len_t get_size() const { | |
1052 | return size; | |
1053 | } | |
1054 | }; | |
1055 | ||
1056 | /** | |
1057 | * coll_root_le_t | |
1058 | * | |
1059 | * Information for locating CollectionManager information, to be embedded | |
1060 | * in root block. | |
1061 | */ | |
1062 | class __attribute__((packed)) coll_root_le_t { | |
1063 | laddr_le_t addr; | |
1064 | extent_len_le_t size = init_extent_len_le(0); | |
1065 | ||
1066 | public: | |
1067 | coll_root_le_t() = default; | |
1068 | ||
1069 | coll_root_le_t(laddr_t laddr, segment_off_t size) | |
1070 | : addr(laddr), size(init_extent_len_le(size)) {} | |
1071 | ||
1072 | ||
1073 | coll_root_le_t(const coll_root_le_t &o) = default; | |
1074 | coll_root_le_t(coll_root_le_t &&o) = default; | |
1075 | coll_root_le_t &operator=(const coll_root_le_t &o) = default; | |
1076 | coll_root_le_t &operator=(coll_root_le_t &&o) = default; | |
1077 | ||
1078 | void update(const coll_root_t &nroot) { | |
1079 | addr = nroot.get_location(); | |
1080 | size = init_extent_len_le(nroot.get_size()); | |
1081 | } | |
1082 | ||
1083 | coll_root_t get() const { | |
1084 | return coll_root_t(addr, size); | |
1085 | } | |
1086 | }; | |
1087 | ||
1088 | ||
1089 | /** | |
1090 | * root_t | |
1091 | * | |
1092 | * Contains information required to find metadata roots. | |
1093 | * TODO: generalize this to permit more than one lba_manager implementation | |
1094 | */ | |
1095 | struct __attribute__((packed)) root_t { | |
1096 | using meta_t = std::map<std::string, std::string>; | |
1097 | ||
1098 | static constexpr int MAX_META_LENGTH = 1024; | |
1099 | ||
1100 | lba_root_t lba_root; | |
1101 | laddr_le_t onode_root; | |
1102 | coll_root_le_t collection_root; | |
1103 | ||
1104 | char meta[MAX_META_LENGTH]; | |
1105 | ||
1106 | root_t() { | |
1107 | set_meta(meta_t{}); | |
1108 | } | |
1109 | ||
1110 | void adjust_addrs_from_base(paddr_t base) { | |
1111 | lba_root.adjust_addrs_from_base(base); | |
1112 | } | |
1113 | ||
1114 | meta_t get_meta() { | |
1115 | bufferlist bl; | |
1116 | bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta)); | |
1117 | meta_t ret; | |
1118 | auto iter = bl.cbegin(); | |
1119 | decode(ret, iter); | |
1120 | return ret; | |
1121 | } | |
1122 | ||
1123 | void set_meta(const meta_t &m) { | |
1124 | ceph::bufferlist bl; | |
1125 | encode(m, bl); | |
1126 | ceph_assert(bl.length() < MAX_META_LENGTH); | |
1127 | bl.rebuild(); | |
1128 | auto &bptr = bl.front(); | |
1129 | ::memset(meta, 0, MAX_META_LENGTH); | |
1130 | ::memcpy(meta, bptr.c_str(), bl.length()); | |
1131 | } | |
1132 | }; | |
1133 | ||
// 64-bit block id type; NULL_BLK_ID is the maximum uint64_t value.
1134 | using blk_id_t = uint64_t; | |
1135 | constexpr blk_id_t NULL_BLK_ID = | |
1136 | std::numeric_limits<blk_id_t>::max(); | |
1137 | ||
1138 | // use absolute address | |
// 64-bit block address type, taken and returned by the
// convert_blk_paddr_to_paddr / convert_paddr_to_blk_paddr declarations below.
1139 | using blk_paddr_t = uint64_t; | |
// Allocation delta: a list of (paddr_t, size_t) ranges together with the
// operation (NONE / SET / CLEAR) associated with them.  DENC encodes the
// range list first, then the op.
1140 | struct rbm_alloc_delta_t { | |
1141 | enum class op_types_t : uint8_t { | |
1142 | NONE = 0, | |
1143 | SET = 1, | |
1144 | CLEAR = 2 | |
1145 | }; | |
1146 | std::vector<std::pair<paddr_t, size_t>> alloc_blk_ranges; | |
1147 | op_types_t op = op_types_t::NONE; | |
1148 | ||
1149 | rbm_alloc_delta_t() = default; | |
1150 | ||
1151 | DENC(rbm_alloc_delta_t, v, p) { | |
1152 | DENC_START(1, 1, p); | |
1153 | denc(v.alloc_blk_ranges, p); | |
1154 | denc(v.op, p); | |
1155 | DENC_FINISH(p); | |
1156 | } | |
1157 | }; | |
1158 | ||
// Conversions between blk_paddr_t and paddr_t (declarations only).  Both
// take the block size and the number of blocks per segment; the
// blk_paddr_t -> paddr_t direction additionally takes a device id.
1159 | paddr_t convert_blk_paddr_to_paddr(blk_paddr_t addr, size_t block_size, | |
1160 | uint32_t blocks_per_segment, device_id_t d_id); | |
1161 | blk_paddr_t convert_paddr_to_blk_paddr(paddr_t addr, size_t block_size, | |
1162 | uint32_t blocks_per_segment); | |
1163 | ||
// Type, logical address and length of an extent, without its payload.
// The converting constructor copies type/addr from an extent_t and takes
// the length from et.bl.length().  DENC encodes type, addr, len in order.
1164 | struct extent_info_t { | |
1165 | extent_types_t type = extent_types_t::NONE; | |
1166 | laddr_t addr = L_ADDR_NULL; | |
1167 | extent_len_t len = 0; | |
1168 | ||
1169 | extent_info_t() = default; | |
1170 | extent_info_t(const extent_t &et) | |
1171 | : type(et.type), addr(et.addr), len(et.bl.length()) {} | |
1172 | ||
1173 | DENC(extent_info_t, v, p) { | |
1174 | DENC_START(1, 1, p); | |
1175 | denc(v.type, p); | |
1176 | denc(v.addr, p); | |
1177 | denc(v.len, p); | |
1178 | DENC_FINISH(p); | |
1179 | } | |
1180 | }; | |
// Stream-insertion overload for extent_info_t (declaration only).
1181 | std::ostream &operator<<(std::ostream &out, const extent_info_t &header); | |
1182 | ||
// 32-bit nonce type; stored in segment_header_t::segment_nonce and
// record_group_header_t::segment_nonce.
1183 | using segment_nonce_t = uint32_t; | |
1184 | ||
1185 | /** | |
1186 | * Segment header | |
1187 | * | |
1188 | * Every segment contains an encoded segment_header_t in its first block. | |
1189 | * Our strategy for finding the journal replay point is: | |
1190 | * 1) Find the segment with the highest journal_segment_seq | |
1191 | * 2) Replay starting at record located at that segment's journal_tail | |
1192 | */ | |
1193 | struct segment_header_t { | |
1194 | segment_seq_t journal_segment_seq; | |
1195 | segment_id_t physical_segment_id; // debugging | |
1196 | ||
1197 | journal_seq_t journal_tail; | |
1198 | segment_nonce_t segment_nonce; | |
1199 | bool out_of_line; | |
1200 | ||
// Fields are encoded in declaration order.  None of the members has a
// default initializer in this struct.
1201 | DENC(segment_header_t, v, p) { | |
1202 | DENC_START(1, 1, p); | |
1203 | denc(v.journal_segment_seq, p); | |
1204 | denc(v.physical_segment_id, p); | |
1205 | denc(v.journal_tail, p); | |
1206 | denc(v.segment_nonce, p); | |
1207 | denc(v.out_of_line, p); | |
1208 | DENC_FINISH(p); | |
1209 | } | |
1210 | }; | |
// Stream-insertion overload for segment_header_t (declaration only).
1211 | std::ostream &operator<<(std::ostream &out, const segment_header_t &header); | |
1212 | ||
1213 | struct record_size_t { | |
1214 | extent_len_t plain_mdlength = 0; // mdlength without the record header | |
1215 | extent_len_t dlength = 0; | |
1216 | ||
1217 | extent_len_t get_raw_mdlength() const; | |
1218 | ||
1219 | bool is_empty() const { | |
1220 | return plain_mdlength == 0 && | |
1221 | dlength == 0; | |
1222 | } | |
1223 | ||
1224 | void account_extent(extent_len_t extent_len); | |
1225 | ||
1226 | void account(const extent_t& extent) { | |
1227 | account_extent(extent.bl.length()); | |
1228 | } | |
1229 | ||
1230 | void account(const delta_info_t& delta); | |
1231 | }; | |
1232 | WRITE_EQ_OPERATORS_2(record_size_t, plain_mdlength, dlength); | |
1233 | ||
f67539c2 TL |
1234 | struct record_t { |
1235 | std::vector<extent_t> extents; | |
1236 | std::vector<delta_info_t> deltas; | |
20effc67 TL |
1237 | record_size_t size; |
1238 | ||
1239 | record_t() = default; | |
1240 | record_t(std::vector<extent_t>&& _extents, | |
1241 | std::vector<delta_info_t>&& _deltas) { | |
1242 | for (auto& e: _extents) { | |
1243 | push_back(std::move(e)); | |
1244 | } | |
1245 | for (auto& d: _deltas) { | |
1246 | push_back(std::move(d)); | |
1247 | } | |
1248 | } | |
1249 | ||
1250 | bool is_empty() const { | |
1251 | return extents.size() == 0 && | |
1252 | deltas.size() == 0; | |
1253 | } | |
1254 | ||
1255 | std::size_t get_delta_size() const { | |
1256 | auto delta_size = std::accumulate( | |
1257 | deltas.begin(), deltas.end(), 0, | |
1258 | [](uint64_t sum, auto& delta) { | |
1259 | return sum + delta.bl.length(); | |
1260 | } | |
1261 | ); | |
1262 | return delta_size; | |
1263 | } | |
1264 | ||
1265 | void push_back(extent_t&& extent) { | |
1266 | size.account(extent); | |
1267 | extents.push_back(std::move(extent)); | |
1268 | } | |
1269 | ||
1270 | void push_back(delta_info_t&& delta) { | |
1271 | size.account(delta); | |
1272 | deltas.push_back(std::move(delta)); | |
1273 | } | |
f67539c2 TL |
1274 | }; |
1275 | ||
20effc67 TL |
// Per-record header: the number of deltas and the number of extents in a
// record.  DENC encodes deltas first, then extents.
1276 | struct record_header_t { | |
1277 | uint32_t deltas; // number of deltas | |
1278 | uint32_t extents; // number of extents | |
1279 | ||
1280 | ||
1281 | DENC(record_header_t, v, p) { | |
1282 | DENC_START(1, 1, p); | |
1283 | denc(v.deltas, p); | |
1284 | denc(v.extents, p); | |
1285 | DENC_FINISH(p); | |
1286 | } | |
1287 | }; | |
1288 | ||
// Header for a group of records: record count, metadata and data lengths,
// the containing segment's nonce, the committed_to journal position and a
// crc of the data payload.  DENC encodes the fields in declaration order.
1289 | struct record_group_header_t { | |
1290 | uint32_t records; | |
1291 | extent_len_t mdlength; // block aligned, length of metadata | |
1292 | extent_len_t dlength; // block aligned, length of data | |
1293 | segment_nonce_t segment_nonce;// nonce of containing segment | |
1294 | journal_seq_t committed_to; // records prior to committed_to have been | |
1295 | // fully written, maybe in another segment. | |
1296 | checksum_t data_crc; // crc of data payload | |
1297 | ||
1298 | ||
1299 | DENC(record_group_header_t, v, p) { | |
1300 | DENC_START(1, 1, p); | |
1301 | denc(v.records, p); | |
1302 | denc(v.mdlength, p); | |
1303 | denc(v.dlength, p); | |
1304 | denc(v.segment_nonce, p); | |
1305 | denc(v.committed_to, p); | |
1306 | denc(v.data_crc, p); | |
1307 | DENC_FINISH(p); | |
1308 | } | |
1309 | }; | |
1310 | ||
1311 | struct record_group_size_t { | |
1312 | extent_len_t plain_mdlength = 0; // mdlength without the group header | |
1313 | extent_len_t dlength = 0; | |
1314 | extent_len_t block_size = 0; | |
1315 | ||
1316 | record_group_size_t() = default; | |
1317 | record_group_size_t( | |
1318 | const record_size_t& rsize, | |
1319 | extent_len_t block_size) { | |
1320 | account(rsize, block_size); | |
1321 | } | |
1322 | ||
1323 | extent_len_t get_raw_mdlength() const; | |
1324 | ||
1325 | extent_len_t get_mdlength() const { | |
1326 | assert(block_size > 0); | |
1327 | return p2roundup(get_raw_mdlength(), block_size); | |
1328 | } | |
1329 | ||
1330 | extent_len_t get_encoded_length() const { | |
1331 | assert(block_size > 0); | |
1332 | assert(dlength % block_size == 0); | |
1333 | return get_mdlength() + dlength; | |
1334 | } | |
1335 | ||
1336 | record_group_size_t get_encoded_length_after( | |
1337 | const record_size_t& rsize, | |
1338 | extent_len_t block_size) const { | |
1339 | record_group_size_t tmp = *this; | |
1340 | tmp.account(rsize, block_size); | |
1341 | return tmp; | |
1342 | } | |
1343 | ||
1344 | double get_fullness() const { | |
1345 | assert(block_size > 0); | |
1346 | return ((double)(get_raw_mdlength() + dlength) / | |
1347 | get_encoded_length()); | |
1348 | } | |
1349 | ||
1350 | void account(const record_size_t& rsize, | |
1351 | extent_len_t block_size); | |
1352 | }; | |
1353 | WRITE_EQ_OPERATORS_3(record_group_size_t, plain_mdlength, dlength, block_size); | |
1354 | ||
1355 | struct record_group_t { | |
1356 | std::vector<record_t> records; | |
1357 | record_group_size_t size; | |
1358 | extent_len_t current_dlength = 0; | |
1359 | ||
1360 | record_group_t() = default; | |
1361 | record_group_t( | |
1362 | record_t&& record, | |
1363 | extent_len_t block_size) { | |
1364 | push_back(std::move(record), block_size); | |
1365 | } | |
1366 | ||
1367 | std::size_t get_size() const { | |
1368 | return records.size(); | |
1369 | } | |
1370 | ||
1371 | void push_back( | |
1372 | record_t&& record, | |
1373 | extent_len_t block_size) { | |
1374 | size.account(record.size, block_size); | |
1375 | current_dlength += record.size.dlength; | |
1376 | records.push_back(std::move(record)); | |
1377 | assert(size.get_encoded_length() < MAX_SEG_OFF); | |
1378 | } | |
1379 | ||
1380 | void reserve(std::size_t limit) { | |
1381 | records.reserve(limit); | |
1382 | } | |
1383 | ||
1384 | void clear() { | |
1385 | records.clear(); | |
1386 | size = {}; | |
1387 | current_dlength = 0; | |
1388 | } | |
1389 | }; | |
1390 | ||
// Encode a single record into a bufferlist (declaration only).
1391 | ceph::bufferlist encode_record( | |
1392 | record_t&& record, | |
1393 | extent_len_t block_size, | |
1394 | const journal_seq_t& committed_to, | |
1395 | segment_nonce_t current_segment_nonce); | |
1396 | ||
// Encode a whole record group into a bufferlist (declaration only).  The
// group is taken by non-const reference.
1397 | ceph::bufferlist encode_records( | |
1398 | record_group_t& record_group, | |
1399 | const journal_seq_t& committed_to, | |
1400 | segment_nonce_t current_segment_nonce); | |
1401 | ||
// Try to decode a record_group_header_t from header_bl; the result is
// optional.  NOTE(review): presumably empty on a decode failure or nonce
// mismatch — confirm against the definition.
1402 | std::optional<record_group_header_t> | |
1403 | try_decode_records_header( | |
1404 | const ceph::bufferlist& header_bl, | |
1405 | segment_nonce_t expected_nonce); | |
1406 | ||
// Validate the metadata buffer of a record group (declaration only).
1407 | bool validate_records_metadata( | |
1408 | const ceph::bufferlist& md_bl); | |
1409 | ||
// Validate the data buffer of a record group against its header
// (declaration only).
1410 | bool validate_records_data( | |
1411 | const record_group_header_t& header, | |
1412 | const ceph::bufferlist& data_bl); | |
1413 | ||
// A record header paired with the extent_info_t entries of that record.
1414 | struct record_extent_infos_t { | |
1415 | record_header_t header; | |
1416 | std::vector<extent_info_t> extent_infos; | |
1417 | }; | |
// Try to decode one record_extent_infos_t per record from the group's
// metadata buffer; the result is optional (declaration only).
1418 | std::optional<std::vector<record_extent_infos_t> > | |
1419 | try_decode_extent_infos( | |
1420 | const record_group_header_t& header, | |
1421 | const ceph::bufferlist& md_bl); | |
1422 | ||
// The deltas of one record together with that record's block base address.
1423 | struct record_deltas_t { | |
1424 | paddr_t record_block_base; | |
1425 | std::vector<delta_info_t> deltas; | |
1426 | }; | |
// Try to decode the deltas of every record from the group's metadata
// buffer; the result is optional (declaration only).
1427 | std::optional<std::vector<record_deltas_t> > | |
1428 | try_decode_deltas( | |
1429 | const record_group_header_t& header, | |
1430 | const ceph::bufferlist& md_bl, | |
1431 | paddr_t record_block_base); | |
1432 | ||
1433 | struct write_result_t { | |
1434 | journal_seq_t start_seq; | |
1435 | segment_off_t length; | |
1436 | ||
1437 | journal_seq_t get_end_seq() const { | |
1438 | return start_seq.add_offset(length); | |
1439 | } | |
1440 | }; | |
1441 | ||
// Pairs a record's block base address with the result of writing it.
1442 | struct record_locator_t { | |
1443 | paddr_t record_block_base; | |
1444 | write_result_t write_result; | |
1445 | }; | |
1446 | ||
1447 | /// scan segment for end incrementally | |
1448 | struct scan_valid_records_cursor { | |
1449 | bool last_valid_header_found = false; | |
1450 | journal_seq_t seq; | |
1451 | journal_seq_t last_committed; | |
1452 | ||
1453 | struct found_record_group_t { | |
1454 | paddr_t offset; | |
1455 | record_group_header_t header; | |
1456 | bufferlist mdbuffer; | |
1457 | ||
1458 | found_record_group_t( | |
1459 | paddr_t offset, | |
1460 | const record_group_header_t &header, | |
1461 | const bufferlist &mdbuffer) | |
1462 | : offset(offset), header(header), mdbuffer(mdbuffer) {} | |
1463 | }; | |
1464 | std::deque<found_record_group_t> pending_record_groups; | |
1465 | ||
1466 | bool is_complete() const { | |
1467 | return last_valid_header_found && pending_record_groups.empty(); | |
1468 | } | |
1469 | ||
1470 | segment_id_t get_segment_id() const { | |
1471 | return seq.offset.as_seg_paddr().get_segment_id(); | |
1472 | } | |
1473 | ||
1474 | segment_off_t get_segment_offset() const { | |
1475 | return seq.offset.as_seg_paddr().get_segment_off(); | |
1476 | } | |
1477 | ||
1478 | void increment(segment_off_t off) { | |
1479 | auto& seg_addr = seq.offset.as_seg_paddr(); | |
1480 | seg_addr.set_segment_off( | |
1481 | seg_addr.get_segment_off() + off); | |
1482 | } | |
1483 | ||
1484 | scan_valid_records_cursor( | |
1485 | journal_seq_t seq) | |
1486 | : seq(seq) {} | |
1487 | }; | |
1488 | ||
1489 | inline const seg_paddr_t& paddr_t::as_seg_paddr() const { | |
1490 | assert(get_addr_type() == addr_types_t::SEGMENT); | |
1491 | return *static_cast<const seg_paddr_t*>(this); | |
1492 | } | |
1493 | ||
1494 | inline seg_paddr_t& paddr_t::as_seg_paddr() { | |
1495 | assert(get_addr_type() == addr_types_t::SEGMENT); | |
1496 | return *static_cast<seg_paddr_t*>(this); | |
1497 | } | |
1498 | ||
1499 | inline paddr_t paddr_t::operator-(paddr_t rhs) const { | |
1500 | if (get_addr_type() == addr_types_t::SEGMENT) { | |
1501 | auto& seg_addr = as_seg_paddr(); | |
1502 | return seg_addr - rhs; | |
1503 | } | |
1504 | ceph_assert(0 == "not supported type"); | |
1505 | return paddr_t{}; | |
1506 | } | |
1507 | ||
// Helper for the paddr_t forwarding functions below: when this address has
// type `a_type`, cast `this` to `const base*` and return the result of
// calling `func` on it; otherwise fall through to the code that follows.
1508 | #define PADDR_OPERATION(a_type, base, func) \ | |
1509 | if (get_addr_type() == a_type) { \ | |
1510 | return static_cast<const base*>(this)->func; \ | |
1511 | } | |
1512 | ||
1513 | inline paddr_t paddr_t::add_offset(int32_t o) const { | |
1514 | PADDR_OPERATION(addr_types_t::SEGMENT, seg_paddr_t, add_offset(o)) | |
1515 | ceph_assert(0 == "not supported type"); | |
1516 | return paddr_t{}; | |
1517 | } | |
1518 | ||
1519 | inline paddr_t paddr_t::add_relative(paddr_t o) const { | |
1520 | PADDR_OPERATION(addr_types_t::SEGMENT, seg_paddr_t, add_relative(o)) | |
1521 | ceph_assert(0 == "not supported type"); | |
1522 | return paddr_t{}; | |
1523 | } | |
1524 | ||
1525 | inline paddr_t paddr_t::add_block_relative(paddr_t o) const { | |
1526 | PADDR_OPERATION(addr_types_t::SEGMENT, seg_paddr_t, add_block_relative(o)) | |
1527 | ceph_assert(0 == "not supported type"); | |
1528 | return paddr_t{}; | |
1529 | } | |
1530 | ||
1531 | inline paddr_t paddr_t::add_record_relative(paddr_t o) const { | |
1532 | PADDR_OPERATION(addr_types_t::SEGMENT, seg_paddr_t, add_record_relative(o)) | |
1533 | ceph_assert(0 == "not supported type"); | |
1534 | return paddr_t{}; | |
1535 | } | |
1536 | ||
1537 | inline paddr_t paddr_t::maybe_relative_to(paddr_t o) const { | |
1538 | PADDR_OPERATION(addr_types_t::SEGMENT, seg_paddr_t, maybe_relative_to(o)) | |
1539 | ceph_assert(0 == "not supported type"); | |
1540 | return paddr_t{}; | |
1541 | } | |
1542 | ||
f67539c2 TL |
1543 | } |
1544 | ||
// WRITE_CLASS_DENC_BOUNDED is applied to each seastore type listed here.
// NOTE(review): presumably this registers the type's in-class DENC body as
// a bounded denc encoder — confirm against the macro's definition.
1545 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t) | |
20effc67 | 1546 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_id_t) |
f67539c2 TL |
1547 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t) | |
1548 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t) | |
1549 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t) | |
20effc67 TL |
1550 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t) | |
1551 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_group_header_t) | |
1552 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t) | |
1553 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t) | |
1554 | WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::rbm_alloc_delta_t) | |
1555 | ||
1556 | template<> | |
1557 | struct denc_traits<crimson::os::seastore::device_type_t> { | |
1558 | static constexpr bool supported = true; | |
1559 | static constexpr bool featured = false; | |
1560 | static constexpr bool bounded = true; | |
1561 | static constexpr bool need_contiguous = false; | |
1562 | ||
1563 | static void bound_encode( | |
1564 | const crimson::os::seastore::device_type_t &o, | |
1565 | size_t& p, | |
1566 | uint64_t f=0) { | |
1567 | p += sizeof(crimson::os::seastore::device_type_t); | |
1568 | } | |
1569 | template<class It> | |
1570 | static std::enable_if_t<!is_const_iterator_v<It>> | |
1571 | encode( | |
1572 | const crimson::os::seastore::device_type_t &o, | |
1573 | It& p, | |
1574 | uint64_t f=0) { | |
1575 | get_pos_add<crimson::os::seastore::device_type_t>(p) = o; | |
1576 | } | |
1577 | template<class It> | |
1578 | static std::enable_if_t<is_const_iterator_v<It>> | |
1579 | decode( | |
1580 | crimson::os::seastore::device_type_t& o, | |
1581 | It& p, | |
1582 | uint64_t f=0) { | |
1583 | o = get_pos_add<crimson::os::seastore::device_type_t>(p); | |
1584 | } | |
1585 | static void decode( | |
1586 | crimson::os::seastore::device_type_t& o, | |
1587 | ceph::buffer::list::const_iterator &p) { | |
1588 | p.copy(sizeof(crimson::os::seastore::device_type_t), | |
1589 | reinterpret_cast<char*>(&o)); | |
1590 | } | |
1591 | }; |