1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "crimson/os/seastore/seastore_types.h"
5 #include "crimson/common/log.h"
// Accessor for the logger used by this file: returns whatever
// crimson::get_logger() yields for the ceph_subsys_seastore_tm subsystem.
9 seastar::logger
& logger() {
10 return crimson::get_logger(ceph_subsys_seastore_tm
);
15 namespace crimson::os::seastore
{
// Stream inserter for seastore_meta_t: writes meta.seastore_id to `out`
// and returns the stream.
17 std::ostream
& operator<<(std::ostream
& out
, const seastore_meta_t
& meta
)
19 return out
<< meta
.seastore_id
;
// Writes a printable form of segment id `t` to `out`.
// Visible cases: one branch emits "NULL_SEG" and t == FAKE_SEG_ID emits
// "FAKE_SEG".
// NOTE(review): the condition guarding the "NULL_SEG" branch and the path
// taken for ordinary ids are not visible in this listing -- confirm against
// the full file.
22 std::ostream
&segment_to_stream(std::ostream
&out
, const segment_id_t
&t
)
25 return out
<< "NULL_SEG";
26 else if (t
== FAKE_SEG_ID
)
27 return out
<< "FAKE_SEG";
// Writes a printable form of segment offset `t` to `out`; the sentinel
// NULL_SEG_OFF is rendered as "NULL_OFF".
// NOTE(review): the path taken for non-null offsets is not visible in this
// listing.
32 std::ostream
&offset_to_stream(std::ostream
&out
, const segment_off_t
&t
)
34 if (t
== NULL_SEG_OFF
)
35 return out
<< "NULL_OFF";
// Stream inserter for segment_id_t: prints "[<device_id>,<device_segment_id>]"
// and returns the stream.
40 std::ostream
&operator<<(std::ostream
&out
, const segment_id_t
& segment
)
42 return out
// device_id() is widened to uint64_t before insertion -- presumably so that a
// narrow integer type prints as a number rather than a character; TODO confirm.
<< "[" << (uint64_t)segment
.device_id() << ","
43 << segment
.device_segment_id() << "]";
// Stream inserter for paddr_t. Tests, in order:
//   rhs == P_ADDR_NULL, rhs == P_ADDR_MIN, rhs.is_block_relative(),
//   rhs.is_record_relative(), device id == DEVICE_ID_DELAYED (prints
//   "DELAYED_TEMP"), and address type SEGMENT (prints the segment id and the
//   segment offset through segment_to_stream() / offset_to_stream()).
// NOTE(review): what the first four branches print, the separators around the
// segment id/offset, and any branch after SEGMENT are not visible in this
// listing.
46 std::ostream
&operator<<(std::ostream
&out
, const paddr_t
&rhs
)
49 if (rhs
== P_ADDR_NULL
) {
51 } else if (rhs
== P_ADDR_MIN
) {
53 } else if (rhs
.is_block_relative()) {
55 } else if (rhs
.is_record_relative()) {
57 } else if (rhs
.get_device_id() == DEVICE_ID_DELAYED
) {
58 out
<< "DELAYED_TEMP";
59 } else if (rhs
.get_addr_type() == addr_types_t::SEGMENT
) {
// View the address as a segment address to reach its id and offset.
60 const seg_paddr_t
& s
= rhs
.as_seg_paddr();
61 segment_to_stream(out
, s
.get_segment_id());
63 offset_to_stream(out
, s
.get_segment_off());
// Stream inserter for journal_seq_t: prints
// "journal_seq_t(segment_seq=<seq.segment_seq>, offset=" ...
// NOTE(review): the value streamed after ", offset=" and the closing text are
// not visible in this listing.
70 std::ostream
&operator<<(std::ostream
&out
, const journal_seq_t
&seq
)
72 return out
<< "journal_seq_t(segment_seq="
73 << seq
.segment_seq
<< ", offset="
// Stream inserter for extent_types_t: each visible case label streams the
// enumerator's own name as a string literal and returns the stream.
// A final return emits "UNKNOWN" (presumably for values without a case label
// -- confirm).
// NOTE(review): the switch header and the statement under `case ROOT` are not
// visible in this listing.
78 std::ostream
&operator<<(std::ostream
&out
, extent_types_t t
)
81 case extent_types_t::ROOT
:
83 case extent_types_t::LADDR_INTERNAL
:
84 return out
<< "LADDR_INTERNAL";
85 case extent_types_t::LADDR_LEAF
:
86 return out
<< "LADDR_LEAF";
87 case extent_types_t::ONODE_BLOCK_STAGED
:
88 return out
<< "ONODE_BLOCK_STAGED";
89 case extent_types_t::OMAP_INNER
:
90 return out
<< "OMAP_INNER";
91 case extent_types_t::OMAP_LEAF
:
92 return out
<< "OMAP_LEAF";
93 case extent_types_t::COLL_BLOCK
:
94 return out
<< "COLL_BLOCK";
95 case extent_types_t::OBJECT_DATA_BLOCK
:
96 return out
<< "OBJECT_DATA_BLOCK";
97 case extent_types_t::RETIRED_PLACEHOLDER
:
98 return out
<< "RETIRED_PLACEHOLDER";
99 case extent_types_t::TEST_BLOCK
:
100 return out
<< "TEST_BLOCK";
101 case extent_types_t::TEST_BLOCK_PHYSICAL
:
102 return out
<< "TEST_BLOCK_PHYSICAL";
103 case extent_types_t::NONE
:
104 return out
<< "NONE";
106 return out
<< "UNKNOWN";
// Stream inserter for laddr_list_t: each element `i` is printed as
// "(<i.first>,<i.second>)", preceded by '[' when `first` is true and by ','
// otherwise.
// NOTE(review): the loop header, the declaration/update of `first`, and the
// text that closes the list are not visible in this listing.
110 std::ostream
&operator<<(std::ostream
&out
, const laddr_list_t
&rhs
)
114 out
<< (first
? '[' : ',') << '(' << i
.first
<< ',' << i
.second
<< ')';
// Stream inserter for paddr_list_t: each element `i` is printed as
// "(<i.first>,<i.second>)", preceded by '[' when `first` is true and by ','
// otherwise.
// NOTE(review): the loop header, the declaration/update of `first`, and the
// text that closes the list are not visible in this listing.
119 std::ostream
&operator<<(std::ostream
&out
, const paddr_list_t
&rhs
)
123 out
<< (first
? '[' : ',') << '(' << i
.first
<< ',' << i
.second
<< ')';
// Stream inserter for delta_info_t: prints "delta_info_t(" followed by the
// labelled fields type, paddr, laddr, prev_crc, final_crc, length and
// pversion, in that order.
// NOTE(review): anything streamed after pversion (including the closing
// text) is not visible in this listing.
129 std::ostream
&operator<<(std::ostream
&lhs
, const delta_info_t
&rhs
)
131 return lhs
<< "delta_info_t("
132 << "type: " << rhs
.type
133 << ", paddr: " << rhs
.paddr
134 << ", laddr: " << rhs
.laddr
135 << ", prev_crc: " << rhs
.prev_crc
136 << ", final_crc: " << rhs
.final_crc
137 << ", length: " << rhs
.length
138 << ", pversion: " << rhs
.pversion
// Stream inserter for extent_info_t: prints "extent_info_t(" followed by the
// labelled fields type, addr and len, in that order.
// NOTE(review): anything streamed after len (including the closing text) is
// not visible in this listing.
142 std::ostream
&operator<<(std::ostream
&out
, const extent_info_t
&info
)
144 return out
<< "extent_info_t("
145 << "type: " << info
.type
146 << ", addr: " << info
.addr
147 << ", len: " << info
.len
// Stream inserter for segment_header_t: prints "segment_header_t(" followed
// by journal_segment_seq (labelled "segment_seq"), physical_segment_id,
// journal_tail, segment_nonce and out_of_line (labelled "out-of-line").
// NOTE(review): anything streamed after out_of_line (including the closing
// text) is not visible in this listing.
151 std::ostream
&operator<<(std::ostream
&out
, const segment_header_t
&header
)
153 return out
<< "segment_header_t("
154 << "segment_seq=" << header
.journal_segment_seq
155 << ", physical_segment_id=" << header
.physical_segment_id
156 << ", journal_tail=" << header
.journal_tail
157 << ", segment_nonce=" << header
.segment_nonce
158 << ", out-of-line=" << header
.out_of_line
// Returns plain_mdlength plus the bounded encoded size of one
// record_header_t.
// The header term is added unconditionally, so the result is non-zero even
// when nothing has been accounted -- presumably what the "empty record"
// remark below refers to; confirm.
162 extent_len_t
record_size_t::get_raw_mdlength() const
164 // empty record is allowed to submit
165 return plain_mdlength
+
166 ceph::encoded_sizeof_bounded
<record_header_t
>();
// Accounts for one extent of `extent_len` bytes: grows plain_mdlength by the
// bounded encoded size of an extent_info_t and grows dlength by extent_len.
// NOTE(review): a statement preceding the two updates is not visible in this
// listing.
169 void record_size_t::account_extent(extent_len_t extent_len
)
172 plain_mdlength
+= ceph::encoded_sizeof_bounded
<extent_info_t
>();
173 dlength
+= extent_len
;
// Accounts for one delta: grows plain_mdlength by the encoded size of
// `delta` (computed from the value itself, not a fixed bound).
// Precondition (assert): delta.bl is non-empty.
176 void record_size_t::account(const delta_info_t
& delta
)
178 assert(delta
.bl
.length());
179 plain_mdlength
+= ceph::encoded_sizeof(delta
);
// Returns plain_mdlength plus (at least) the bounded encoded size of one
// record_group_header_t.
// NOTE(review): the embedded listing numbers skip a line between the two
// visible operands, so a further term of the sum may be missing from this
// view -- confirm against the full file.
182 extent_len_t
record_group_size_t::get_raw_mdlength() const
184 return plain_mdlength
+
186 ceph::encoded_sizeof_bounded
<record_group_header_t
>();
// Folds the size of one record into the group totals.
//   rsize       - size summary of the record being added
//   _block_size - block size the record's data length is aligned to
// Preconditions (asserts): _block_size is positive, rsize.dlength is a
// multiple of _block_size, and the group's block_size is either still 0 or
// already equal to _block_size.
189 void record_group_size_t::account(
190 const record_size_t
& rsize
,
191 extent_len_t _block_size
)
193 // empty record is allowed to submit
194 assert(_block_size
> 0);
195 assert(rsize
.dlength
% _block_size
== 0);
196 assert(block_size
== 0 || block_size
== _block_size
);
// Accumulate the record's raw metadata length and data length.
197 plain_mdlength
+= rsize
.get_raw_mdlength();
198 dlength
+= rsize
.dlength
;
// Remember the block size so later calls can be checked against it.
199 block_size
= _block_size
;
// Convenience wrapper: moves `record` into a record_group_t built with
// `block_size` and returns the result of encode_records() on that group.
// NOTE(review): the declaration of the `record` parameter and the first
// arguments of the encode_records() call are not visible in this listing.
202 ceph::bufferlist
encode_record(
204 extent_len_t block_size
,
205 const journal_seq_t
& committed_to
,
206 segment_nonce_t current_segment_nonce
)
208 record_group_t
record_group(std::move(record
), block_size
);
209 return encode_records(
212 current_segment_nonce
);
// Serialises a record group into a bufferlist.
// Steps visible in this listing, in order:
//   1. assert the group has a positive block size and at least one record;
//   2. append every extent's buffer to data_bl (each must be non-empty);
//   3. build a record_group_header_t from the record count, the group's
//      mdlength, its dlength and current_segment_nonce (further fields not
//      visible);
//   4. reserve a sizeof(checksum_t) hole in `bl` for the metadata crc;
//   5. build one record_header_t per record from its delta/extent counts;
//   6. encode an extent_info_t for every extent of every record;
//   7. walk every delta of every record (loop body not visible);
//   8. check the metadata length, zero-pad it up to get_mdlength();
//   9. crc32c the metadata around the hole and write the crc into the hole;
//  10. append data_bl, check the total length, and clear the group.
// NOTE(review): declarations of `bl`/`data_bl`, the use of `committed_to`,
// the crc seed arguments and the return statement are not visible here.
215 ceph::bufferlist
encode_records(
216 record_group_t
& record_group
,
217 const journal_seq_t
& committed_to
,
218 segment_nonce_t current_segment_nonce
)
220 assert(record_group
.size
.block_size
> 0);
221 assert(record_group
.records
.size() > 0);
// Gather all extent payloads into data_bl, in record order.
224 for (auto& r
: record_group
.records
) {
225 for (auto& i
: r
.extents
) {
226 assert(i
.bl
.length());
227 data_bl
.append(i
.bl
);
232 record_group_header_t header
{
233 static_cast<extent_len_t
>(record_group
.records
.size()),
234 record_group
.size
.get_mdlength(),
235 record_group
.size
.dlength
,
236 current_segment_nonce
,
// Placeholder for the metadata crc; it is filled in once the rest of the
// metadata has been written.
242 auto metadata_crc_filler
= bl
.append_hole(sizeof(checksum_t
));
244 for (auto& r
: record_group
.records
) {
245 record_header_t rheader
{
246 (extent_len_t
)r
.deltas
.size(),
247 (extent_len_t
)r
.extents
.size(),
251 for (auto& r
: record_group
.records
) {
252 for (const auto& i
: r
.extents
) {
253 encode(extent_info_t(i
), bl
);
256 for (auto& r
: record_group
.records
) {
257 for (const auto& i
: r
.deltas
) {
// The bytes written so far must match the size accounted up front.
261 ceph_assert(bl
.length() == record_group
.size
.get_raw_mdlength());
// Zero-pad the metadata from its raw length up to get_mdlength().
263 auto aligned_mdlength
= record_group
.size
.get_mdlength();
264 if (bl
.length() != aligned_mdlength
) {
265 assert(bl
.length() < aligned_mdlength
);
266 bl
.append_zero(aligned_mdlength
- bl
.length());
// crc the group header bytes, skip the hole, then continue the crc over
// everything that remains in the metadata.
269 auto bliter
= bl
.cbegin();
270 auto metadata_crc
= bliter
.crc32c(
271 ceph::encoded_sizeof_bounded
<record_group_header_t
>(),
273 bliter
+= sizeof(checksum_t
); /* metadata crc hole */
274 metadata_crc
= bliter
.crc32c(
275 bliter
.get_remaining(),
// Store the crc through a ceph_le32 and copy its bytes into the hole.
277 ceph_le32 metadata_crc_le
;
278 metadata_crc_le
= metadata_crc
;
279 metadata_crc_filler
.copy_in(
281 reinterpret_cast<const char *>(&metadata_crc_le
));
// Data follows the metadata.
283 bl
.claim_append(data_bl
);
284 ceph_assert(bl
.length() == record_group
.size
.get_encoded_length());
286 record_group
.clear();
// Attempts to read a record_group_header_t from the start of `header_bl`.
//   header_bl      - buffer holding the encoded group header
//   expected_nonce - nonce the decoded header's segment_nonce is compared to
// Two failure paths are visible, each with its own log message: a
// ceph::buffer::error raised while decoding, and a segment_nonce that
// differs from expected_nonce.
// NOTE(review): the decode call, the logging calls and the return statements
// are not visible in this listing; failures presumably yield an empty
// optional -- confirm.
290 std::optional
<record_group_header_t
>
291 try_decode_records_header(
292 const ceph::bufferlist
& header_bl
,
293 segment_nonce_t expected_nonce
)
295 auto bp
= header_bl
.cbegin();
296 record_group_header_t header
;
299 } catch (ceph::buffer::error
&e
) {
301 "try_decode_records_header: failed, "
302 "cannot decode record_group_header_t, got {}.",
306 if (header
.segment_nonce
!= expected_nonce
) {
308 "try_decode_records_header: failed, record_group_header nonce mismatch, "
309 "read {}, expected {}!",
310 header
.segment_nonce
,
// Checks the crc stored inside the record-group metadata `md_bl`.
// Recomputes a crc32c over the group-header bytes, decodes the recorded crc
// (a ceph_le32) from the position that follows, continues the crc over the
// remaining bytes, and compares the two values. A debug message is logged on
// mismatch.
// NOTE(review): the crc seed arguments, the condition guarding the log call
// and the return statement are not visible in this listing.
317 bool validate_records_metadata(
318 const ceph::bufferlist
& md_bl
)
320 auto bliter
= md_bl
.cbegin();
321 auto test_crc
= bliter
.crc32c(
322 ceph::encoded_sizeof_bounded
<record_group_header_t
>(),
// Decoding the stored crc also advances the iterator past it.
324 ceph_le32 recorded_crc_le
;
325 decode(recorded_crc_le
, bliter
);
326 uint32_t recorded_crc
= recorded_crc_le
;
327 test_crc
= bliter
.crc32c(
328 bliter
.get_remaining(),
330 bool success
= (test_crc
== recorded_crc
);
332 logger().debug("validate_records_metadata: failed, metadata crc mismatch.");
// Checks the data payload of a record group: computes crc32c over `data_bl`
// with seed -1 and compares it with header.data_crc. A debug message is
// logged on mismatch.
// NOTE(review): the condition guarding the log call and the return statement
// are not visible in this listing.
337 bool validate_records_data(
338 const record_group_header_t
& header
,
339 const ceph::bufferlist
& data_bl
)
341 bool success
= (data_bl
.crc32c(-1) == header
.data_crc
);
343 logger().debug("validate_records_data: failed, data crc mismatch!");
// Decodes the per-record headers stored in the group metadata `md_bl`.
//   header - group header; header.records gives the number of record headers
//   md_bl  - buffer holding the group metadata
// Skips the encoded group header and the metadata-crc hole, then fills a
// vector of header.records record_header_t entries and returns it. A
// ceph::buffer::error during decoding is caught and logged.
// NOTE(review): the decode call and the return taken in the catch path are
// not visible in this listing; presumably an empty optional -- confirm.
350 std::optional
<std::vector
<record_header_t
>>
351 try_decode_record_headers(
352 const record_group_header_t
& header
,
353 const ceph::bufferlist
& md_bl
)
355 auto bliter
= md_bl
.cbegin();
356 bliter
+= ceph::encoded_sizeof_bounded
<record_group_header_t
>();
357 bliter
+= sizeof(checksum_t
); /* metadata crc hole */
358 std::vector
<record_header_t
> record_headers(header
.records
);
359 for (auto &&i
: record_headers
) {
362 } catch (ceph::buffer::error
&e
) {
364 "try_decode_record_headers: failed, "
365 "cannot decode record_header_t, got {}.",
370 return record_headers
;
// Decodes the extent_info_t entries of every record in the group metadata.
//   header - group header, forwarded to try_decode_record_headers()
//   md_bl  - buffer holding the group metadata
// First obtains the record headers, then positions an iterator past the
// group header, the metadata-crc hole and all record headers. For each
// record it stores the record header and sizes extent_infos to the header's
// extent count before decoding into it. Returns one record_extent_infos_t
// per record.
// NOTE(review): the decode call, the advance of result_iter and the returns
// taken on the two failure paths are not visible in this listing.
375 std::optional
<std::vector
<record_extent_infos_t
> >
376 try_decode_extent_infos(
377 const record_group_header_t
& header
,
378 const ceph::bufferlist
& md_bl
)
380 auto maybe_headers
= try_decode_record_headers(header
, md_bl
);
381 if (!maybe_headers
) {
383 "try_decode_extent_infos: failed, cannot decode record headers.");
// Seek to the first extent_info_t: group header, crc hole, record headers.
387 auto bliter
= md_bl
.cbegin();
388 bliter
+= ceph::encoded_sizeof_bounded
<record_group_header_t
>();
389 bliter
+= sizeof(checksum_t
); /* metadata crc hole */
390 bliter
+= (ceph::encoded_sizeof_bounded
<record_header_t
>() *
391 maybe_headers
->size());
393 std::vector
<record_extent_infos_t
> record_extent_infos(
394 maybe_headers
->size());
395 auto result_iter
= record_extent_infos
.begin();
396 for (auto& h
: *maybe_headers
) {
397 result_iter
->header
= h
;
398 result_iter
->extent_infos
.resize(h
.extents
);
399 for (auto& i
: result_iter
->extent_infos
) {
402 } catch (ceph::buffer::error
&e
) {
404 "try_decode_extent_infos: failed, "
405 "cannot decode extent_info_t, got {}.",
412 return record_extent_infos
;
// Decodes the delta_info_t entries of every record in the group metadata.
// (The function-name line is not visible in this listing; the log messages
// below identify it as try_decode_deltas.)
//   header            - group header, forwarded to try_decode_extent_infos()
//   md_bl             - buffer holding the group metadata
//   record_block_base - address assigned to the first record as its block base
// First obtains the extent infos, then positions an iterator past the group
// header, the metadata-crc hole, all record headers and all extent infos.
// For each record it stores the current record_block_base, sizes `deltas`
// to the record header's delta count before decoding into it, and then
// advances record_block_base by the length of each of the record's extents.
// Returns one record_deltas_t per record.
// NOTE(review): the decode call, the advance of result_iter and the returns
// taken on the two failure paths are not visible in this listing.
415 std::optional
<std::vector
<record_deltas_t
> >
417 const record_group_header_t
& header
,
418 const ceph::bufferlist
& md_bl
,
419 paddr_t record_block_base
)
421 auto maybe_record_extent_infos
= try_decode_extent_infos(header
, md_bl
);
422 if (!maybe_record_extent_infos
) {
424 "try_decode_deltas: failed, cannot decode extent_infos.");
// Seek to the first delta: group header, crc hole, record headers, then the
// extent infos of every record.
428 auto bliter
= md_bl
.cbegin();
429 bliter
+= ceph::encoded_sizeof_bounded
<record_group_header_t
>();
430 bliter
+= sizeof(checksum_t
); /* metadata crc hole */
431 bliter
+= (ceph::encoded_sizeof_bounded
<record_header_t
>() *
432 maybe_record_extent_infos
->size());
433 for (auto& r
: *maybe_record_extent_infos
) {
434 bliter
+= (ceph::encoded_sizeof_bounded
<extent_info_t
>() *
435 r
.extent_infos
.size());
438 std::vector
<record_deltas_t
> record_deltas(
439 maybe_record_extent_infos
->size());
440 auto result_iter
= record_deltas
.begin();
441 for (auto& r
: *maybe_record_extent_infos
) {
442 result_iter
->record_block_base
= record_block_base
;
443 result_iter
->deltas
.resize(r
.header
.deltas
);
444 for (auto& i
: result_iter
->deltas
) {
447 } catch (ceph::buffer::error
&e
) {
449 "try_decode_deltas: failed, "
450 "cannot decode delta_info_t, got {}.",
// record_block_base is a by-value parameter, so advancing it here only
// changes the base handed to the records that follow.
455 for (auto& i
: r
.extent_infos
) {
456 auto& seg_addr
= record_block_base
.as_seg_paddr();
457 seg_addr
.set_segment_off(seg_addr
.get_segment_off() + i
.len
);
461 return record_deltas
;
// Reports whether delayed allocation is permitted for device type `type`.
// The test is an ordering comparison against device_type_t::RANDOM_BLOCK, so
// the answer depends on the declaration order of the device_type_t
// enumerators (declared outside this listing).
464 bool can_delay_allocation(device_type_t type
) {
465 // Some types of device may not support delayed allocation, for example PMEM.
466 return type
<= device_type_t::RANDOM_BLOCK
;
// Maps a device-type name to its device_type_t value:
//   "segmented"    -> device_type_t::SEGMENTED
//   "random_block" -> device_type_t::RANDOM_BLOCK
//   "pmem"         -> device_type_t::PMEM
// Comparison is an exact string match; any other input returns
// device_type_t::NONE.
469 device_type_t
string_to_device_type(std::string type
) {
470 if (type
== "segmented") {
471 return device_type_t::SEGMENTED
;
473 if (type
== "random_block") {
474 return device_type_t::RANDOM_BLOCK
;
476 if (type
== "pmem") {
477 return device_type_t::PMEM
;
479 return device_type_t::NONE
;
// Stream inserter for device_type_t: NONE, SEGMENTED, RANDOM_BLOCK and PMEM
// are printed as their enumerator names. A final return emits
// "INVALID_DEVICE_TYPE!" (presumably for any other value -- confirm).
// NOTE(review): the switch header is not visible in this listing.
482 std::ostream
& operator<<(std::ostream
& out
, device_type_t t
)
485 case device_type_t::NONE
:
486 return out
<< "NONE";
487 case device_type_t::SEGMENTED
:
488 return out
<< "SEGMENTED";
489 case device_type_t::RANDOM_BLOCK
:
490 return out
<< "RANDOM_BLOCK";
491 case device_type_t::PMEM
:
492 return out
<< "PMEM";
494 return out
<< "INVALID_DEVICE_TYPE!";
// Converts a flat block address into a segment paddr_t.
//   addr               - flat block address
//   block_size         - size of one block
//   blocks_per_segment - number of blocks in a segment
//   d_id               - device id (its use is not visible in this listing)
// With segment_size = block_size * blocks_per_segment, the device segment id
// is addr / segment_size and the offset inside the segment is
// addr % segment_size.
// NOTE(review): neither factor is checked for zero in the visible code.
498 paddr_t
convert_blk_paddr_to_paddr(blk_paddr_t addr
, size_t block_size
,
499 uint32_t blocks_per_segment
, device_id_t d_id
)
501 segment_id_t id
= segment_id_t
{
503 (device_segment_id_t
)(addr
/ (block_size
* blocks_per_segment
))
505 segment_off_t off
= addr
% (block_size
* blocks_per_segment
);
506 return paddr_t::make_seg_paddr(id
, off
);
// Converts a segment paddr_t into a flat block address:
//   device_segment_id * (block_size * blocks_per_segment) + segment_off.
// `addr` is taken by value and viewed as a seg_paddr_t through
// as_seg_paddr().
509 blk_paddr_t
convert_paddr_to_blk_paddr(paddr_t addr
, size_t block_size
,
510 uint32_t blocks_per_segment
)
512 seg_paddr_t
& s
= addr
.as_seg_paddr();
513 return (blk_paddr_t
)(s
.get_segment_id().device_segment_id() *
514 (block_size
* blocks_per_segment
) + s
.get_segment_off());