// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H

#include <type_traits>

#include "include/mempool.h"
#include "include/types.h"
#include "include/interval_set.h"
#include "include/utime.h"
#include "common/hobject.h"
#include "compressor/Compressor.h"
#include "common/Checksummer.h"
#include "include/ceph_hash.h"
/// label for block device
struct bluestore_bdev_label_t {
  uuid_d osd_uuid;          ///< osd uuid
  uint64_t size = 0;        ///< device size
  utime_t btime;            ///< birth time
  std::string description;  ///< device description

  std::map<std::string,std::string> meta; ///< {read,write}_meta() content from ObjectStore

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_bdev_label_t*>& o);
};
WRITE_CLASS_ENCODER(bluestore_bdev_label_t)

std::ostream& operator<<(std::ostream& out, const bluestore_bdev_label_t& l);
/// collection metadata
struct bluestore_cnode_t {
  uint32_t bits;  ///< how many bits of coll pgid are significant

  explicit bluestore_cnode_t(int b=0) : bits(b) {}

  DENC(bluestore_cnode_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.bits, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_cnode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_cnode_t)

std::ostream& operator<<(std::ostream& out, const bluestore_cnode_t& l);
template <typename OFFS_TYPE, typename LEN_TYPE>
struct bluestore_interval_t
{
  static const uint64_t INVALID_OFFSET = ~0ull;

  OFFS_TYPE offset = INVALID_OFFSET;
  LEN_TYPE length = 0;

  bluestore_interval_t(){}
  bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}

  bool is_valid() const {
    return offset != INVALID_OFFSET;
  }
  uint64_t end() const {
    return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET;
  }

  bool operator==(const bluestore_interval_t& other) const {
    return offset == other.offset && length == other.length;
  }
};
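// Usage sketch (illustrative only, not part of the header): a
// default-constructed interval is a "hole" until an offset is assigned.
//   bluestore_interval_t<uint64_t, uint32_t> iv(0x1000, 0x800);
//   // iv.is_valid() == true, iv.end() == 0x1800
//   bluestore_interval_t<uint64_t, uint32_t> hole;
//   // hole.is_valid() == false, hole.end() == INVALID_OFFSET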
/// pextent: physical extent
struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t>
{
  bluestore_pextent_t() {}
  bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {}
  bluestore_pextent_t(const bluestore_interval_t &ext) :
    bluestore_interval_t(ext.offset, ext.length) {}

  DENC(bluestore_pextent_t, v, p) {
    denc_lba(v.offset, p);
    denc_varint_lowz(v.length, p);
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_pextent_t*>& ls);
};
WRITE_CLASS_DENC(bluestore_pextent_t)

std::ostream& operator<<(std::ostream& out, const bluestore_pextent_t& o);

typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;
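// Usage sketch (illustrative only): a blob backed by one 64 KiB physical
// extent followed by a 64 KiB hole, e.g. an unwritten tail.
//   PExtentVector ev;
//   ev.emplace_back(0x100000, 0x10000);                             // allocated
//   ev.emplace_back(bluestore_pextent_t::INVALID_OFFSET, 0x10000);  // hole
//   // ev[1].is_valid() == false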
template<>
struct denc_traits<PExtentVector> {
  static constexpr bool supported = true;
  static constexpr bool bounded = false;
  static constexpr bool featured = false;
  static constexpr bool need_contiguous = true;
  static void bound_encode(const PExtentVector& v, size_t& p) {
    p += sizeof(uint32_t);
    const auto size = v.size();
    if (size) {
      size_t per = 0;
      denc(v.front(), per);
      p += per * size;
    }
  }
  static void encode(const PExtentVector& v,
                     ceph::buffer::list::contiguous_appender& p) {
    denc_varint(v.size(), p);
    for (auto& i : v) {
      denc(i, p);
    }
  }
  static void decode(PExtentVector& v, ceph::buffer::ptr::const_iterator& p) {
    unsigned num;
    denc_varint(num, p);
    v.clear();
    v.resize(num);
    for (unsigned i = 0; i < num; ++i) {
      denc(v[i], p);
    }
  }
};
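// Illustrative note: the traits above encode the vector as a varint element
// count followed by the elements, each a denc_lba offset plus a
// denc_varint_lowz length. So two extents {0x1000,0x8000} and {0x3000,0x8000}
// encode as a count of 2 followed by the two (lba, length) pairs;
// bound_encode sizes every element as if it were v.front(), which is safe
// because an element's bound is a worst-case estimate independent of value.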
/// extent_map: a map of reference-counted extents
struct bluestore_extent_ref_map_t {
  struct record_t {
    uint32_t length;
    uint32_t refs;
    record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
    DENC(bluestore_extent_ref_map_t::record_t, v, p) {
      denc_varint_lowz(v.length, p);
      denc_varint(v.refs, p);
    }
  };

  typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
  map_t ref_map;

  void _check() const;
  void _maybe_merge_left(map_t::iterator& p);

  void clear() {
    ref_map.clear();
  }
  bool empty() const {
    return ref_map.empty();
  }

  void get(uint64_t offset, uint32_t len);
  void put(uint64_t offset, uint32_t len, PExtentVector *release,
           bool *maybe_unshared);

  bool contains(uint64_t offset, uint32_t len) const;
  bool intersects(uint64_t offset, uint32_t len) const;

  void bound_encode(size_t& p) const {
    denc_varint((uint32_t)0, p);
    if (!ref_map.empty()) {
      size_t elem_size = 0;
      denc_varint_lowz((uint64_t)0, elem_size);
      ref_map.begin()->second.bound_encode(elem_size);
      p += elem_size * ref_map.size();
    }
  }
  void encode(ceph::buffer::list::contiguous_appender& p) const {
    const uint32_t n = ref_map.size();
    denc_varint(n, p);
    if (n) {
      auto i = ref_map.begin();
      denc_varint_lowz(i->first, p);
      i->second.encode(p);
      int64_t pos = i->first;
      while (++i != ref_map.end()) {
        denc_varint_lowz((int64_t)i->first - pos, p);
        i->second.encode(p);
        pos = i->first;
      }
    }
  }
  void decode(ceph::buffer::ptr::const_iterator& p) {
    uint32_t n;
    denc_varint(n, p);
    if (n) {
      int64_t pos;
      denc_varint_lowz(pos, p);
      ref_map[pos].decode(p);
      while (--n) {
        int64_t delta;
        denc_varint_lowz(delta, p);
        pos += delta;
        ref_map[pos].decode(p);
      }
    }
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_extent_ref_map_t*>& o);
};
WRITE_CLASS_DENC(bluestore_extent_ref_map_t)
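// Usage sketch (illustrative only):
//   bluestore_extent_ref_map_t m;
//   m.get(0x0000, 0x4000);  // ref [0x0000, 0x4000) once
//   m.get(0x2000, 0x4000);  // [0x2000, 0x4000) now has 2 refs
//   PExtentVector rel;
//   bool maybe_unshared;
//   m.put(0x0000, 0x4000, &rel, &maybe_unshared);
//   // rel receives [0x0000, 0x2000): its refcount dropped to zero, while
//   // [0x2000, 0x6000) remains referenced once.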
std::ostream& operator<<(std::ostream& out, const bluestore_extent_ref_map_t& rm);
static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
                              const bluestore_extent_ref_map_t::record_t& r) {
  return l.length == r.length && l.refs == r.refs;
}
static inline bool operator==(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return l.ref_map == r.ref_map;
}
static inline bool operator!=(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return !(l == r);
}
/// blob_use_tracker: a set of per-alloc unit ref buckets to track blob usage
struct bluestore_blob_use_tracker_t {
  // N.B.: There is no need to minimize au_size/num_au
  // as much as possible (e.g. have just a single byte for au_size) since:
  // 1) Struct isn't packed, hence it's padded. And even if it's packed, see 2)
  // 2) Mem manager has its own granularity, most probably >= 8 bytes
  //
  uint32_t au_size;  // Allocation (=tracking) unit size,
                     // == 0 if uninitialized
  uint32_t num_au;   // Amount of allocation units tracked
                     // == 0 if single unit or the whole blob is tracked
  uint32_t alloc_au; // Amount of allocation units allocated

  union {
    uint32_t* bytes_per_au;
    uint32_t total_bytes;
  };

  bluestore_blob_use_tracker_t()
    : au_size(0), num_au(0), alloc_au(0), bytes_per_au(nullptr) {
  }
  bluestore_blob_use_tracker_t(const bluestore_blob_use_tracker_t& tracker);
  bluestore_blob_use_tracker_t& operator=(const bluestore_blob_use_tracker_t& rhs);
  ~bluestore_blob_use_tracker_t() {
    clear();
  }

  void clear() {
    release(alloc_au, bytes_per_au);
    num_au = 0;
    alloc_au = 0;
    bytes_per_au = nullptr;
  }

  uint32_t get_referenced_bytes() const {
    uint32_t total = 0;
    if (!num_au) {
      total = total_bytes;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        total += bytes_per_au[i];
      }
    }
    return total;
  }
  bool is_not_empty() const {
    if (!num_au) {
      return total_bytes != 0;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        if (bytes_per_au[i]) {
          return true;
        }
      }
    }
    return false;
  }
  bool is_empty() const {
    return !is_not_empty();
  }
  void prune_tail(uint32_t new_len) {
    if (num_au) {
      new_len = round_up_to(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      ceph_assert(_num_au <= num_au);
      if (_num_au) {
        num_au = _num_au; // bytes_per_au array is left unmodified
      }
    }
  }
  void add_tail(uint32_t new_len, uint32_t _au_size) {
    auto full_size = au_size * (num_au ? num_au : 1);
    ceph_assert(new_len >= full_size);
    if (new_len == full_size) {
      return;
    }
    if (!num_au) {
      uint32_t old_total = total_bytes;
      total_bytes = 0;
      init(new_len, _au_size);
      ceph_assert(num_au);
      bytes_per_au[0] = old_total;
    } else {
      ceph_assert(_au_size == au_size);
      new_len = round_up_to(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      ceph_assert(_num_au >= num_au);
      if (_num_au > num_au) {
        auto old_bytes = bytes_per_au;
        auto old_num_au = num_au;
        auto old_alloc_au = alloc_au;
        alloc_au = num_au = 0; // to bypass an assertion in allocate()
        bytes_per_au = nullptr;
        allocate(_num_au);
        for (size_t i = 0; i < old_num_au; i++) {
          bytes_per_au[i] = old_bytes[i];
        }
        for (size_t i = old_num_au; i < num_au; i++) {
          bytes_per_au[i] = 0;
        }
        release(old_alloc_au, old_bytes);
      }
    }
  }

  void init(
    uint32_t full_length,
    uint32_t _au_size);

  void get(
    uint32_t offset,
    uint32_t len);

  /// put: returns true if the blob has no references left after the call;
  /// release_units is not filled in that case for the sake of performance.
  /// Returns false if some references to the blob remain; in this case
  /// release_units contains pextents (identified by their offsets relative
  /// to the blob start) that are no longer used and can be safely
  /// deallocated.
  bool put(
    uint32_t offset,
    uint32_t len,
    PExtentVector *release);

  bool can_split() const;
  bool can_split_at(uint32_t blob_offset) const;
  void split(
    uint32_t blob_offset,
    bluestore_blob_use_tracker_t* r);

  bool equal(
    const bluestore_blob_use_tracker_t& other) const;

  void bound_encode(size_t& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        p += elem_size * num_au;
      }
    }
  }
  void encode(ceph::buffer::list::contiguous_appender& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        for (size_t i = 0; i < num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }
  void decode(ceph::buffer::ptr::const_iterator& p) {
    clear();
    denc_varint(au_size, p);
    if (au_size) {
      uint32_t _num_au;
      denc_varint(_num_au, p);
      if (!_num_au) {
        denc_varint(total_bytes, p);
      } else {
        allocate(_num_au);
        for (size_t i = 0; i < _num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_blob_use_tracker_t*>& o);
private:
  void allocate(uint32_t _num_au);
  void release(uint32_t _num_au, uint32_t* ptr);
};
WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)

std::ostream& operator<<(std::ostream& out, const bluestore_blob_use_tracker_t& rm);
/// blob: a piece of data on disk
struct bluestore_blob_t {
private:
  PExtentVector extents;           ///< raw data position on device
  uint32_t logical_length = 0;     ///< original length of data stored in the blob
  uint32_t compressed_length = 0;  ///< compressed length if any

public:
  enum {
    LEGACY_FLAG_MUTABLE = 1,  ///< [legacy] blob can be overwritten or split
    FLAG_COMPRESSED = 2,      ///< blob is compressed
    FLAG_CSUM = 4,            ///< blob has checksums
    FLAG_HAS_UNUSED = 8,      ///< blob has unused bitmap
    FLAG_SHARED = 16,         ///< blob is shared; see external SharedBlob
  };
  static std::string get_flags_string(unsigned flags);

  uint32_t flags = 0;  ///< FLAG_*

  typedef uint16_t unused_t;
  unused_t unused = 0;  ///< portion that has never been written to (bitmap)

  uint8_t csum_type = Checksummer::CSUM_NONE;  ///< CSUM_*
  uint8_t csum_chunk_order = 0;  ///< csum block size is 1<<block_order bytes

  ceph::buffer::ptr csum_data;  ///< opaque array of csum data

  bluestore_blob_t(uint32_t f = 0) : flags(f) {}
  const PExtentVector& get_extents() const {
    return extents;
  }
  PExtentVector& dirty_extents() {
    return extents;
  }

  DENC_HELPERS;
  void bound_encode(size_t& p, uint64_t struct_v) const {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    denc_varint_lowz(logical_length, p);
    denc_varint_lowz(compressed_length, p);
    denc(csum_type, p);
    denc(csum_chunk_order, p);
    denc_varint(csum_data.length(), p);
    p += csum_data.length();
    p += sizeof(unused_t);
  }

  void encode(ceph::buffer::list::contiguous_appender& p, uint64_t struct_v) const {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      denc_varint(csum_data.length(), p);
      memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
             csum_data.length());
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  void decode(ceph::buffer::ptr::const_iterator& p, uint64_t struct_v) {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    } else {
      logical_length = get_ondisk_length();
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      uint32_t len;
      denc_varint(len, p);
      csum_data = p.get_ptr(len);
      csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }
  bool can_split() const {
    return
      !has_flag(FLAG_SHARED) &&
      !has_flag(FLAG_COMPRESSED) &&
      !has_flag(FLAG_HAS_UNUSED);  // splitting unused set is complex
  }
  bool can_split_at(uint32_t blob_offset) const {
    return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_blob_t*>& ls);

  bool has_flag(unsigned f) const {
    return flags & f;
  }
  void set_flag(unsigned f) {
    flags |= f;
  }
  void clear_flag(unsigned f) {
    flags &= ~f;
  }
  std::string get_flags_string() const {
    return get_flags_string(flags);
  }

  void set_compressed(uint64_t clen_orig, uint64_t clen) {
    set_flag(FLAG_COMPRESSED);
    logical_length = clen_orig;
    compressed_length = clen;
  }
  bool is_mutable() const {
    return !is_compressed() && !is_shared();
  }
  bool is_compressed() const {
    return has_flag(FLAG_COMPRESSED);
  }
  bool has_csum() const {
    return has_flag(FLAG_CSUM);
  }
  bool has_unused() const {
    return has_flag(FLAG_HAS_UNUSED);
  }
  bool is_shared() const {
    return has_flag(FLAG_SHARED);
  }

  /// return chunk (i.e. min readable block) size for the blob
  uint64_t get_chunk_size(uint64_t dev_block_size) const {
    return has_csum() ?
      std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size;
  }
  uint32_t get_csum_chunk_size() const {
    return 1 << csum_chunk_order;
  }
  uint32_t get_compressed_payload_length() const {
    return is_compressed() ? compressed_length : 0;
  }
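  // Worked example (illustrative only): with csum_chunk_order == 12 the
  // csum chunk is 1 << 12 == 4096 bytes, so on a 512-byte-block device
  // get_chunk_size(512) returns max(512, 4096) == 4096; without checksums
  // it would return the raw 512-byte device block size.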
  uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    if (plen)
      *plen = p->length - x_off;
    return p->offset + x_off;
  }

  // validate whether or not the status of pextents within the given range
  // meets the requirement (allocated or unallocated).
  bool _validate_range(uint64_t b_off, uint64_t b_len,
                       bool require_allocated) const {
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (b_off >= p->length) {
      b_off -= p->length;
      if (++p == extents.end())
        return false;
    }
    b_len += b_off;
    while (b_len) {
      if (require_allocated != p->is_valid()) {
        return false;
      }
      if (p->length >= b_len) {
        return true;
      }
      b_len -= p->length;
      if (++p == extents.end())
        return false;
    }
    ceph_abort_msg("we should not get here");
    return false;
  }
  /// return true if the entire range is allocated
  /// (mapped to extents on disk)
  bool is_allocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, true);
  }

  /// return true if the entire range is unallocated
  /// (not mapped to extents on disk)
  bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, false);
  }
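  // Usage sketch (illustrative only): with extents {0x100000, 0x8000}
  // followed by {INVALID_OFFSET, 0x8000},
  //   is_allocated(0x0000, 0x8000) == true
  //   is_unallocated(0x8000, 0x8000) == true
  //   is_allocated(0x4000, 0x8000) == false  // range spans both kinds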
  /// return true if the logical range has never been used
  bool is_unused(uint64_t offset, uint64_t length) const {
    if (!has_unused()) {
      return false;
    }
    ceph_assert(!is_compressed());
    uint64_t blob_len = get_logical_length();
    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
    ceph_assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = offset / chunk_size;
    uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
    auto i = start;
    while (i < end && (unused & (1u << i))) {
      i++;
    }
    return i >= end;
  }

  /// mark a range that has never been used
  void add_unused(uint64_t offset, uint64_t length) {
    ceph_assert(!is_compressed());
    uint64_t blob_len = get_logical_length();
    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
    ceph_assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = round_up_to(offset, chunk_size) / chunk_size;
    uint64_t end = (offset + length) / chunk_size;
    for (auto i = start; i < end; ++i) {
      unused |= (1u << i);
    }
    if (start != end) {
      set_flag(FLAG_HAS_UNUSED);
    }
  }

  /// indicate that a range has (now) been used.
  void mark_used(uint64_t offset, uint64_t length) {
    if (has_unused()) {
      ceph_assert(!is_compressed());
      uint64_t blob_len = get_logical_length();
      ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
      ceph_assert(offset + length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(unused)*8);
      uint64_t start = offset / chunk_size;
      uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
        unused &= ~(1u << i);
      }
      if (unused == 0) {
        clear_flag(FLAG_HAS_UNUSED);
      }
    }
  }
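  // Worked example (illustrative only): for a 0x10000-byte blob the 16-bit
  // bitmap covers 0x1000-byte chunks (0x10000 / 16). add_unused(0x2000,
  // 0x4000) sets bits 2..5; a later mark_used(0x3000, 0x1000) clears bit 3,
  // so is_unused(0x2000, 0x1000) stays true while is_unused(0x2000, 0x4000)
  // is now false.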
  // map_f_invoke templates intended to mask parameters which are not expected
  // by the provided callback
  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    uint64_t,
    uint64_t>>::type* = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const {
    return f(o, l);
  }

  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    uint64_t,
    uint64_t,
    uint64_t>>::type* = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const {
    return f(lo, o, l);
  }

  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    const bluestore_pextent_t&,
    uint64_t,
    uint64_t>>::type* = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const {
    return f(p, o, l);
  }

  template<class F>
  int map(uint64_t x_off, uint64_t x_len, F&& f) const {
    auto x_off0 = x_off;
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    while (x_len > 0 && p != extents.end()) {
      uint64_t l = std::min(p->length - x_off, x_len);
      int r = map_f_invoke(x_off0, *p, p->offset + x_off, l, f);
      if (r < 0)
        return r;
      x_off = 0;
      x_len -= l;
      x_off0 += l;
      ++p;
    }
    return 0;
  }

  template<class F>
  void map_bl(uint64_t x_off,
              ceph::buffer::list& bl,
              F&& f) const {
    static_assert(std::is_invocable_v<F, uint64_t, ceph::buffer::list&>);

    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    ceph::buffer::list::iterator it = bl.begin();
    uint64_t x_len = bl.length();
    while (x_len > 0) {
      ceph_assert(p != extents.end());
      uint64_t l = std::min(p->length - x_off, x_len);
      ceph::buffer::list t;
      it.copy(l, t);
      f(p->offset + x_off, t);
      x_off = 0;
      x_len -= l;
      ++p;
    }
  }
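  // Usage sketch (illustrative only; `blob` is a hypothetical instance):
  // callers walk the physical extents backing a logical range with a lambda,
  //   int r = blob.map(0x2000, 0x6000,
  //     [&](uint64_t p_off, uint64_t p_len) {
  //       // e.g. issue a device read of p_len bytes at p_off
  //       return 0;
  //     });
  // map_f_invoke dispatches on the lambda's arity, so callbacks may instead
  // take (logical_offset, physical_offset, length) or (pextent, offset,
  // length).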
  uint32_t get_ondisk_length() const {
    uint32_t len = 0;
    for (auto &p : extents) {
      len += p.length;
    }
    return len;
  }

  uint32_t get_logical_length() const {
    return logical_length;
  }
  size_t get_csum_value_size() const;

  size_t get_csum_count() const {
    size_t vs = get_csum_value_size();
    if (!vs)
      return 0;
    return csum_data.length() / vs;
  }
  uint64_t get_csum_item(unsigned i) const {
    size_t cs = get_csum_value_size();
    const char *p = csum_data.c_str();
    switch (cs) {
    case 0:
      ceph_abort_msg("no csum data, bad index");
    case 1:
      return reinterpret_cast<const uint8_t*>(p)[i];
    case 2:
      return reinterpret_cast<const ceph_le16*>(p)[i];
    case 4:
      return reinterpret_cast<const ceph_le32*>(p)[i];
    case 8:
      return reinterpret_cast<const ceph_le64*>(p)[i];
    default:
      ceph_abort_msg("unrecognized csum word size");
    }
  }
  const char *get_csum_item_ptr(unsigned i) const {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }
  char *get_csum_item_ptr(unsigned i) {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }

  void init_csum(unsigned type, unsigned order, unsigned len) {
    flags |= FLAG_CSUM;
    csum_type = type;
    csum_chunk_order = order;
    csum_data = ceph::buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
    csum_data.zero();
    csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  }
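  // Worked example (illustrative only): init_csum(Checksummer::CSUM_CRC32C,
  // 12, 0x10000) allocates sixteen 4-byte values (one per 4096-byte chunk of
  // the 0x10000-byte blob), and get_csum_item(3) then returns the
  // little-endian crc32c of chunk 3.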
  /// calculate csum for the buffer at the given b_off
  void calc_csum(uint64_t b_off, const ceph::buffer::list& bl);

  /// verify csum: return -EOPNOTSUPP for unsupported checksum type;
  /// return -1 and a valid (nonnegative) b_bad_off for a checksum error;
  /// return 0 if all is well.
  int verify_csum(uint64_t b_off, const ceph::buffer::list& bl, int* b_bad_off,
                  uint64_t *bad_csum) const;
  bool can_prune_tail() const {
    return
      extents.size() > 1 &&  // if it's all invalid it's not pruning.
      !extents.back().is_valid() &&
      !has_unused();
  }
  void prune_tail() {
    const auto &p = extents.back();
    logical_length -= p.length;
    extents.pop_back();
    if (has_csum()) {
      ceph::buffer::ptr t;
      t.swap(csum_data);
      csum_data = ceph::buffer::ptr(t.c_str(),
                                    get_logical_length() / get_csum_chunk_size() *
                                    get_csum_value_size());
    }
  }
  void add_tail(uint32_t new_len) {
    ceph_assert(is_mutable());
    ceph_assert(!has_unused());
    ceph_assert(new_len > logical_length);
    extents.emplace_back(
      bluestore_pextent_t(
        bluestore_pextent_t::INVALID_OFFSET,
        new_len - logical_length));
    logical_length = new_len;
    if (has_csum()) {
      ceph::buffer::ptr t;
      t.swap(csum_data);
      csum_data = ceph::buffer::create(
        get_csum_value_size() * logical_length / get_csum_chunk_size());
      csum_data.copy_in(0, t.length(), t.c_str());
      csum_data.zero(t.length(), csum_data.length() - t.length());
    }
  }
  uint32_t get_release_size(uint32_t min_alloc_size) const {
    if (is_compressed()) {
      return get_logical_length();
    }
    uint32_t res = get_csum_chunk_size();
    if (!has_csum() || res < min_alloc_size) {
      res = min_alloc_size;
    }
    return res;
  }
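  // Worked example (illustrative only): with min_alloc_size == 0x10000, an
  // uncompressed blob checksummed at 4 KiB chunks releases space in 0x10000
  // units (the csum chunk is below min_alloc_size), whereas a compressed
  // blob must always be released as a whole, i.e. get_logical_length() bytes.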
  void split(uint32_t blob_offset, bluestore_blob_t& rb);
  void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
  void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only

  /// updates the blob's pextents container and returns unused pextents
  /// eligible for release.
  /// all - indicates that the whole blob is to be released.
  /// logical - specifies the set of logical extents within the blob
  /// to be released.
  /// Returns true if the blob has no more valid pextents.
  bool release_extents(
    bool all,
    const PExtentVector& logical,
    PExtentVector* r);
};
WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)

std::ostream& operator<<(std::ostream& out, const bluestore_blob_t& o);
/// shared blob state
struct bluestore_shared_blob_t {
  MEMPOOL_CLASS_HELPERS();
  uint64_t sbid;                       ///< shared blob id
  bluestore_extent_ref_map_t ref_map;  ///< shared blob extents

  bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
  bluestore_shared_blob_t(uint64_t _sbid,
                          bluestore_extent_ref_map_t&& _ref_map)
    : sbid(_sbid), ref_map(std::move(_ref_map)) {}

  DENC(bluestore_shared_blob_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.ref_map, p);
    DENC_FINISH(p);
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_shared_blob_t*>& ls);

  bool empty() const {
    return ref_map.empty();
  }
};
WRITE_CLASS_DENC(bluestore_shared_blob_t)

std::ostream& operator<<(std::ostream& out, const bluestore_shared_blob_t& o);
/// onode: per-object metadata
struct bluestore_onode_t {
  uint64_t nid = 0;   ///< numeric id (locally unique)
  uint64_t size = 0;  ///< object size
  // mempool to be assigned to buffer::ptr manually
  std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs;

  struct shard_info {
    uint32_t offset = 0;  ///< logical offset for start of shard
    uint32_t bytes = 0;   ///< encoded bytes
    DENC(shard_info, v, p) {
      denc_varint(v.offset, p);
      denc_varint(v.bytes, p);
    }
    void dump(ceph::Formatter *f) const;
  };
  std::vector<shard_info> extent_map_shards;  ///< extent map shards (if any)
  uint32_t expected_object_size = 0;
  uint32_t expected_write_size = 0;
  uint32_t alloc_hint_flags = 0;

  uint8_t flags = 0;

  std::map<uint32_t, uint64_t> zone_offset_refs;  ///< (zone, offset) refs to this onode

  enum {
    FLAG_OMAP = 1,          ///< object may have omap data
    FLAG_PGMETA_OMAP = 2,   ///< omap data is in meta omap prefix
    FLAG_PERPOOL_OMAP = 4,  ///< omap data is in per-pool prefix; per-pool keys
    FLAG_PERPG_OMAP = 8,    ///< omap data is in per-pg prefix; per-pg keys
  };

  std::string get_flags_string() const {
    std::string s;
    if (flags & FLAG_OMAP) {
      s = "omap";
    }
    if (flags & FLAG_PGMETA_OMAP) {
      s += "+pgmeta_omap";
    }
    if (flags & FLAG_PERPOOL_OMAP) {
      s += "+per_pool_omap";
    }
    if (flags & FLAG_PERPG_OMAP) {
      s += "+per_pg_omap";
    }
    return s;
  }

  bool has_flag(unsigned f) const {
    return flags & f;
  }
  void set_flag(unsigned f) {
    flags |= f;
  }
  void clear_flag(unsigned f) {
    flags &= ~f;
  }
  bool has_omap() const {
    return has_flag(FLAG_OMAP);
  }
  static bool is_pgmeta_omap(uint8_t flags) {
    return flags & FLAG_PGMETA_OMAP;
  }
  static bool is_perpool_omap(uint8_t flags) {
    return flags & FLAG_PERPOOL_OMAP;
  }
  static bool is_perpg_omap(uint8_t flags) {
    return flags & FLAG_PERPG_OMAP;
  }
  bool is_pgmeta_omap() const {
    return has_flag(FLAG_PGMETA_OMAP);
  }
  bool is_perpool_omap() const {
    return has_flag(FLAG_PERPOOL_OMAP);
  }
  bool is_perpg_omap() const {
    return has_flag(FLAG_PERPG_OMAP);
  }

  void set_omap_flags(bool legacy) {
    set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP)));
  }
  void set_omap_flags_pgmeta() {
    set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP);
  }

  void clear_omap_flag() {
    clear_flag(FLAG_OMAP |
               FLAG_PGMETA_OMAP |
               FLAG_PERPOOL_OMAP |
               FLAG_PERPG_OMAP);
  }
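  // Usage sketch (illustrative only): new onodes use per-pool/per-pg omap
  // keys, so set_omap_flags(false) sets FLAG_OMAP | FLAG_PERPOOL_OMAP |
  // FLAG_PERPG_OMAP, whereas set_omap_flags(true) sets just FLAG_OMAP for
  // objects that must keep the legacy key format.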
  DENC(bluestore_onode_t, v, p) {
    DENC_START(2, 1, p);
    denc_varint(v.nid, p);
    denc_varint(v.size, p);
    denc(v.attrs, p);
    denc(v.flags, p);
    denc(v.extent_map_shards, p);
    denc_varint(v.expected_object_size, p);
    denc_varint(v.expected_write_size, p);
    denc_varint(v.alloc_hint_flags, p);
    if (struct_v >= 2) {
      denc(v.zone_offset_refs, p);
    }
    DENC_FINISH(p);
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_onode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
WRITE_CLASS_DENC(bluestore_onode_t)

std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);
/// writeahead-logged op
struct bluestore_deferred_op_t {
  typedef enum {
    OP_WRITE = 1,
  } type_t;
  __u8 op = 0;

  PExtentVector extents;
  ceph::buffer::list data;

  DENC(bluestore_deferred_op_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.op, p);
    denc(v.extents, p);
    denc(v.data, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_deferred_op_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_op_t)
/// writeahead-logged transaction
struct bluestore_deferred_transaction_t {
  uint64_t seq = 0;
  std::list<bluestore_deferred_op_t> ops;
  interval_set<uint64_t> released;  ///< allocations to release after tx

  bluestore_deferred_transaction_t() : seq(0) {}

  DENC(bluestore_deferred_transaction_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.seq, p);
    denc(v.ops, p);
    denc(v.released, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_deferred_transaction_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
struct bluestore_compression_header_t {
  uint8_t type = Compressor::COMP_ALG_NONE;
  uint32_t length = 0;
  std::optional<int32_t> compressor_message;

  bluestore_compression_header_t() {}
  bluestore_compression_header_t(uint8_t _type)
    : type(_type) {}

  DENC(bluestore_compression_header_t, v, p) {
    DENC_START(2, 1, p);
    denc(v.type, p);
    denc(v.length, p);
    if (struct_v >= 2) {
      denc(v.compressor_message, p);
    }
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_compression_header_t*>& o);
};
WRITE_CLASS_DENC(bluestore_compression_header_t)
template <template <typename> typename V, class COUNTER_TYPE = int32_t>
class ref_counter_2hash_tracker_t {
  size_t num_non_zero = 0;
  size_t num_buckets = 0;
  V<COUNTER_TYPE> buckets1;
  V<COUNTER_TYPE> buckets2;

public:
  ref_counter_2hash_tracker_t(uint64_t mem_cap) {
    num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2;
    ceph_assert(num_buckets);
    buckets1.resize(num_buckets);
    buckets2.resize(num_buckets);
    reset();
  }

  size_t get_num_buckets() const {
    return num_buckets;
  }

  void inc(const char* hash_val, size_t hash_val_len, int n) {
    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len) %
      num_buckets;
    if (buckets1[h] == 0 && n) {
      ++num_non_zero;
    } else if (buckets1[h] == -n) {
      --num_non_zero;
    }
    buckets1[h] += n;
    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len) % num_buckets;
    if (buckets2[h] == 0 && n) {
      ++num_non_zero;
    } else if (buckets2[h] == -n) {
      --num_non_zero;
    }
    buckets2[h] += n;
  }

  bool test_hash_conflict(
    const char* hash_val1,
    const char* hash_val2,
    size_t hash_val_len) const {
    auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len);
    auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len);
    auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len);
    auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len);
    return ((h1 % num_buckets) == (h2 % num_buckets)) &&
           ((h3 % num_buckets) == (h4 % num_buckets));
  }

  bool test_all_zero(const char* hash_val, size_t hash_val_len) const {
    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len);
    if (buckets1[h % num_buckets] != 0) {
      return false;
    }
    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len);
    return buckets2[h % num_buckets] == 0;
  }

  // returns number of mismatching buckets
  size_t count_non_zero() const {
    return num_non_zero;
  }
  void reset() {
    for (size_t i = 0; i < num_buckets; i++) {
      buckets1[i] = 0;
      buckets2[i] = 0;
    }
    num_non_zero = 0;
  }
};
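// Illustrative note: this is a probabilistic, counting-filter-style tracker.
// Each value hashes to one bucket in each of the two tables; calling
// inc(v, +n) for every expected reference and inc(v, -m) for every observed
// one leaves all touched buckets at zero when the counts agree. A residue
// can only be masked if independent collisions cancel it in both tables, so
// count_non_zero() == 0 is a strong (though not absolute) consistency
// signal at a small, fixed memory cost.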
class shared_blob_2hash_tracker_t
  : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {

  static const size_t hash_input_len = 3;

  typedef std::array<uint64_t, hash_input_len> hash_input_t;

  static size_t get_hash_input_size() {
    return hash_input_len * sizeof(hash_input_t::value_type);
  }

  inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;

  size_t au_void_bits = 0;

public:
  shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
    : ref_counter_2hash_tracker_t(mem_cap) {
    ceph_assert(alloc_unit);
    ceph_assert(std::has_single_bit(alloc_unit));
    au_void_bits = std::countr_zero(alloc_unit);
  }

  void inc(uint64_t sbid, uint64_t offset, int n);
  void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n);

  bool test_hash_conflict(
    uint64_t sbid,
    uint64_t offset,
    uint64_t sbid2,
    uint64_t offset2) const;
  bool test_all_zero(
    uint64_t sbid,
    uint64_t offset) const;
  bool test_all_zero_range(
    uint64_t sbid,
    uint64_t offset,
    uint32_t len) const;
};
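// Illustrative note: offsets handed to this tracker are expected to be
// alloc-unit aligned, so the low au_void_bits (e.g. 12 for a 4 KiB alloc
// unit, since countr_zero(0x1000) == 12) carry no information and can be
// shifted away before the (sbid, offset) pair is hashed.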
struct sb_info_t {
  // a negative value indicates a (potentially) stray blob,
  // i.e. a blob that has got no real references from onodes
  int64_t sbid = 0;
  enum {
    INVALID_POOL_ID = INT64_MIN
  };

  int64_t pool_id = INVALID_POOL_ID;
  // a negative value indicates compressed_allocated as well
  int32_t allocated_chunks = 0;

  sb_info_t(int64_t _sbid = 0) : sbid(_sbid)
  {}
  bool operator< (const sb_info_t& other) const {
    return std::abs(sbid) < std::abs(other.sbid);
  }
  bool operator< (const uint64_t& other_sbid) const {
    return uint64_t(std::abs(sbid)) < other_sbid;
  }
  bool is_stray() const {
    return sbid < 0;
  }
  uint64_t get_sbid() const {
    return uint64_t(std::abs(sbid));
  }
  void adopt() {
    sbid = std::abs(sbid);
  }
} __attribute__((packed));
// Space-efficient container to keep a set of sb_info structures,
// given that the majority of entries are appended in a proper id-sorted
// order. Hence one can keep them in a regular vector and apply binary search
// whenever a specific entry is to be found.
// For the rare occasions when out-of-order append takes place - an auxiliary
// regular map is used.
struct sb_info_space_efficient_map_t {
  // large array sorted by the user
  mempool::bluestore_fsck::vector<sb_info_t> items;
  // small additional set of items we maintain sorting ourselves;
  // this would never keep an entry with id > items.back().id
  mempool::bluestore_fsck::vector<sb_info_t> aux_items;

  sb_info_t& add_maybe_stray(uint64_t sbid) {
    return _add(-int64_t(sbid));
  }

  sb_info_t& add_or_adopt(uint64_t sbid) {
    auto& r = _add(sbid);
    r.adopt();
    return r;
  }

  auto find(uint64_t id) {
    if (items.size() != 0) {
      auto it = std::lower_bound(
        items.begin(),
        items.end(),
        id,
        [](const sb_info_t& a, const uint64_t& b) {
          return a.get_sbid() < b;
        });
      if (it != items.end()) {
        if (it->get_sbid() == id) {
          return it;
        }
      }
    }
    if (aux_items.size() != 0) {
      auto it = std::lower_bound(
        aux_items.begin(),
        aux_items.end(),
        id,
        [](const sb_info_t& a, const uint64_t& b) {
          return a.get_sbid() < b;
        });
      if (it != aux_items.end()) {
        if (it->get_sbid() == id) {
          return it;
        }
      }
    }
    return items.end();
  }

  // enumerates strays, order isn't guaranteed.
  void foreach_stray(std::function<void(const sb_info_t&)> cb) {
    for (auto& sbi : items) {
      if (sbi.is_stray()) {
        cb(sbi);
      }
    }
    for (auto& sbi : aux_items) {
      if (sbi.is_stray()) {
        cb(sbi);
      }
    }
  }

  auto end() {
    return items.end();
  }

  void shrink() {
    items.shrink_to_fit();
    aux_items.shrink_to_fit();
  }

  void clear() {
    items.clear();
    aux_items.clear();
    shrink();
  }

private:
  sb_info_t& _add(int64_t id) {
    uint64_t n_id = uint64_t(std::abs(id));
    if (items.size() == 0 || n_id > items.back().get_sbid()) {
      return items.emplace_back(id);
    }
    auto it = find(n_id);
    if (it != items.end()) {
      return *it;
    }
    if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
      return aux_items.emplace_back(id);
    }
    // do sorted insertion, may be expensive!
    it = std::upper_bound(
      aux_items.begin(),
      aux_items.end(),
      n_id,
      [](const uint64_t& a, const sb_info_t& b) {
        return a < b.get_sbid();
      });
    return *aux_items.emplace(it, id);
  }
};
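// Usage sketch (illustrative only): fsck typically learns about a shared
// blob from a blob record first and from its owning onode later:
//   sb_info_space_efficient_map_t sbs;
//   sbs.add_maybe_stray(123);           // stored with sbid == -123
//   auto& sbi = sbs.add_or_adopt(123);  // same entry, now sbid == +123
//   ceph_assert(!sbi.is_stray() && sbi.get_sbid() == 123);
// Entries that remain negative after the scan are reported via
// foreach_stray().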