// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */

#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H

#include <type_traits>

#include "include/mempool.h"
#include "include/types.h"
#include "include/interval_set.h"
#include "include/utime.h"
#include "common/hobject.h"
#include "compressor/Compressor.h"
#include "common/Checksummer.h"
#include "include/ceph_hash.h"
/// label for block device
struct bluestore_bdev_label_t {
  uuid_d osd_uuid;          ///< osd uuid
  uint64_t size = 0;        ///< device size
  utime_t btime;            ///< birth time
  std::string description;  ///< device description

  std::map<std::string,std::string> meta; ///< {read,write}_meta() content from ObjectStore

  void encode(ceph::buffer::list& bl) const;
  void decode(ceph::buffer::list::const_iterator& p);
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_bdev_label_t*>& o);
};
WRITE_CLASS_ENCODER(bluestore_bdev_label_t)

std::ostream& operator<<(std::ostream& out, const bluestore_bdev_label_t& l);
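
// Illustrative usage sketch (not part of the interface; variable names are
// hypothetical): the label round-trips through the encode/decode free
// functions generated by WRITE_CLASS_ENCODER above.
//
//   bluestore_bdev_label_t label;
//   label.size = 1ull << 40;
//   ceph::buffer::list bl;
//   encode(label, bl);       // serialize into a bufferlist
//   auto p = bl.cbegin();
//   bluestore_bdev_label_t out;
//   decode(out, p);          // yields an equivalent label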
/// collection metadata
struct bluestore_cnode_t {
  uint32_t bits;  ///< how many bits of coll pgid are significant

  explicit bluestore_cnode_t(int b=0) : bits(b) {}

  DENC(bluestore_cnode_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.bits, p);
    DENC_FINISH(p);
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_cnode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_cnode_t)

std::ostream& operator<<(std::ostream& out, const bluestore_cnode_t& l);
template <typename OFFS_TYPE, typename LEN_TYPE>
struct bluestore_interval_t
{
  static const uint64_t INVALID_OFFSET = ~0ull;

  OFFS_TYPE offset = OFFS_TYPE(INVALID_OFFSET);
  LEN_TYPE length = 0;

  bluestore_interval_t(){}
  bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}

  bool is_valid() const {
    return offset != INVALID_OFFSET;
  }
  uint64_t end() const {
    return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET;
  }

  bool operator==(const bluestore_interval_t& other) const {
    return offset == other.offset && length == other.length;
  }
};
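
// Semantics sketch (illustrative): an interval is "valid" iff its offset is
// not INVALID_OFFSET; end() degenerates to INVALID_OFFSET for invalid ones.
//
//   bluestore_interval_t<uint64_t, uint32_t> iv(0x1000, 0x2000);
//   // iv.is_valid() == true, iv.end() == 0x3000
//   bluestore_interval_t<uint64_t, uint32_t> hole;
//   // hole.is_valid() == false, hole.end() == INVALID_OFFSET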
/// pextent: physical extent
struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t>
{
  bluestore_pextent_t() {}
  bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {}
  bluestore_pextent_t(const bluestore_interval_t &ext) :
    bluestore_interval_t(ext.offset, ext.length) {}

  DENC(bluestore_pextent_t, v, p) {
    denc_lba(v.offset, p);
    denc_varint_lowz(v.length, p);
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_pextent_t*>& ls);
};
WRITE_CLASS_DENC(bluestore_pextent_t)

std::ostream& operator<<(std::ostream& out, const bluestore_pextent_t& o);
typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;
template<>
struct denc_traits<PExtentVector> {
  static constexpr bool supported = true;
  static constexpr bool bounded = false;
  static constexpr bool featured = false;
  static constexpr bool need_contiguous = true;
  static void bound_encode(const PExtentVector& v, size_t& p) {
    p += sizeof(uint32_t);
    const auto size = v.size();
    if (size) {
      size_t per = 0;
      denc(v.front(), per);
      p += per * size;
    }
  }
  static void encode(const PExtentVector& v,
                     ceph::buffer::list::contiguous_appender& p) {
    denc_varint(v.size(), p);
    for (auto& i : v) {
      denc(i, p);
    }
  }
  static void decode(PExtentVector& v, ceph::buffer::ptr::const_iterator& p) {
    unsigned num;
    denc_varint(num, p);
    v.clear();
    v.resize(num);
    for (unsigned i=0; i<num; ++i) {
      denc(v[i], p);
    }
  }
};
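
// Encoding sketch (illustrative, names hypothetical): with the
// specialization above, a PExtentVector round-trips through the generic
// denc() machinery like any other denc-supported type, as a varint element
// count followed by lba/varint_lowz pairs.
//
//   PExtentVector ev;
//   ev.emplace_back(0x10000, 0x1000);
//   size_t bound = 0;
//   denc(ev, bound);                    // bound_encode pass
//   ceph::buffer::list bl;
//   {
//     auto ap = bl.get_contiguous_appender(bound);
//     denc(ev, ap);                     // actual encode pass
//   }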
/// extent_map: a map of reference counted extents
struct bluestore_extent_ref_map_t {
  struct record_t {
    uint32_t length;
    uint32_t refs;
    record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
    DENC(bluestore_extent_ref_map_t::record_t, v, p) {
      denc_varint_lowz(v.length, p);
      denc_varint(v.refs, p);
    }
  };

  typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
  map_t ref_map;

  void _check() const;
  void _maybe_merge_left(map_t::iterator& p);

  void clear() {
    ref_map.clear();
  }
  bool empty() const {
    return ref_map.empty();
  }

  void get(uint64_t offset, uint32_t len);
  void put(uint64_t offset, uint32_t len, PExtentVector *release,
           bool *maybe_unshared);

  bool contains(uint64_t offset, uint32_t len) const;
  bool intersects(uint64_t offset, uint32_t len) const;

  void bound_encode(size_t& p) const {
    denc_varint((uint32_t)0, p);
    if (!ref_map.empty()) {
      size_t elem_size = 0;
      denc_varint_lowz((uint64_t)0, elem_size);
      ref_map.begin()->second.bound_encode(elem_size);
      p += elem_size * ref_map.size();
    }
  }
  void encode(ceph::buffer::list::contiguous_appender& p) const {
    const uint32_t n = ref_map.size();
    denc_varint(n, p);
    if (n) {
      auto i = ref_map.begin();
      denc_varint_lowz(i->first, p);
      i->second.encode(p);
      int64_t pos = i->first;
      while (++i != ref_map.end()) {
        denc_varint_lowz((int64_t)i->first - pos, p);
        i->second.encode(p);
        pos = i->first;
      }
    }
  }
  void decode(ceph::buffer::ptr::const_iterator& p) {
    uint32_t n;
    denc_varint(n, p);
    if (n) {
      int64_t pos;
      denc_varint_lowz(pos, p);
      ref_map[pos].decode(p);
      while (--n) {
        int64_t delta;
        denc_varint_lowz(delta, p);
        pos += delta;
        ref_map[pos].decode(p);
      }
    }
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_extent_ref_map_t*>& o);
};
WRITE_CLASS_DENC(bluestore_extent_ref_map_t)
std::ostream& operator<<(std::ostream& out, const bluestore_extent_ref_map_t& rm);
static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
                              const bluestore_extent_ref_map_t::record_t& r) {
  return l.length == r.length && l.refs == r.refs;
}
static inline bool operator==(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return l.ref_map == r.ref_map;
}
static inline bool operator!=(const bluestore_extent_ref_map_t& l,
                              const bluestore_extent_ref_map_t& r) {
  return !(l == r);
}
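
// Refcounting sketch (illustrative, values hypothetical): get() adds
// references over a byte range, put() drops them and reports subranges whose
// count fell to zero so the caller can deallocate them.
//
//   bluestore_extent_ref_map_t m;
//   m.get(0, 0x4000);                 // ref [0, 0x4000) once
//   m.get(0x1000, 0x1000);            // [0x1000, 0x2000) now has 2 refs
//   PExtentVector release;
//   bool maybe_unshared = false;
//   m.put(0, 0x4000, &release, &maybe_unshared);
//   // release now holds [0, 0x1000) and [0x2000, 0x4000); the doubly
//   // referenced middle chunk stays in the map with one ref.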
/// blob_use_tracker: a set of per-alloc unit ref buckets to track blob usage
struct bluestore_blob_use_tracker_t {
  // N.B.: There is no need to minimize au_size/num_au
  // as much as possible (e.g. have just a single byte for au_size) since:
  // 1) Struct isn't packed hence it's padded. And even if it's packed see 2)
  // 2) Mem manager has its own granularity, most probably >= 8 bytes
  //
  uint32_t au_size; // Allocation (=tracking) unit size,
                    // == 0 if uninitialized
  uint32_t num_au;  // Amount of allocation units tracked
                    // == 0 if single unit or the whole blob is tracked

  union {
    uint32_t* bytes_per_au;
    uint32_t total_bytes;
  };

  bluestore_blob_use_tracker_t()
    : au_size(0), num_au(0), bytes_per_au(nullptr) {
  }
  bluestore_blob_use_tracker_t(const bluestore_blob_use_tracker_t& tracker);
  bluestore_blob_use_tracker_t& operator=(const bluestore_blob_use_tracker_t& rhs);
  ~bluestore_blob_use_tracker_t() {
    clear();
  }

  void clear() {
    if (num_au != 0) {
      delete[] bytes_per_au;
      mempool::get_pool(
        mempool::pool_index_t(mempool::mempool_bluestore_cache_other)).
          adjust_count(-1, -sizeof(uint32_t) * num_au);
    }
    bytes_per_au = nullptr;
    au_size = 0;
    num_au = 0;
  }

  uint32_t get_referenced_bytes() const {
    uint32_t total = 0;
    if (!num_au) {
      total = total_bytes;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        total += bytes_per_au[i];
      }
    }
    return total;
  }
  bool is_not_empty() const {
    if (!num_au) {
      return total_bytes != 0;
    } else {
      for (size_t i = 0; i < num_au; ++i) {
        if (bytes_per_au[i]) {
          return true;
        }
      }
    }
    return false;
  }
  bool is_empty() const {
    return !is_not_empty();
  }
  void prune_tail(uint32_t new_len) {
    if (num_au) {
      new_len = round_up_to(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      ceph_assert(_num_au <= num_au);
      if (_num_au) {
        num_au = _num_au; // bytes_per_au array is left unmodified
      } else {
        clear();
      }
    }
  }
  void add_tail(uint32_t new_len, uint32_t _au_size) {
    auto full_size = au_size * (num_au ? num_au : 1);
    ceph_assert(new_len >= full_size);
    if (new_len == full_size) {
      return;
    }
    if (!num_au) {
      uint32_t old_total = total_bytes;
      total_bytes = 0;
      init(new_len, _au_size);
      ceph_assert(num_au);
      bytes_per_au[0] = old_total;
    } else {
      ceph_assert(_au_size == au_size);
      new_len = round_up_to(new_len, au_size);
      uint32_t _num_au = new_len / au_size;
      ceph_assert(_num_au >= num_au);
      if (_num_au > num_au) {
        auto old_bytes = bytes_per_au;
        auto old_num_au = num_au;
        num_au = _num_au;
        allocate();
        for (size_t i = 0; i < old_num_au; i++) {
          bytes_per_au[i] = old_bytes[i];
        }
        for (size_t i = old_num_au; i < num_au; i++) {
          bytes_per_au[i] = 0;
        }
        delete[] old_bytes;
      }
    }
  }

  void init(
    uint32_t full_length,
    uint32_t _au_size);

  void get(
    uint32_t offset,
    uint32_t len);

  /// put: return true if the blob has no references any more after the call,
  /// no release_units is filled for the sake of performance.
  /// return false if there are some references to the blob,
  /// in this case release_units contains pextents
  /// (identified by their offsets relative to the blob start)
  /// that are not used any more and can be safely deallocated.
  bool put(
    uint32_t offset,
    uint32_t len,
    PExtentVector *release);

  bool can_split() const;
  bool can_split_at(uint32_t blob_offset) const;
  void split(
    uint32_t blob_offset,
    bluestore_blob_use_tracker_t* r);

  bool equal(
    const bluestore_blob_use_tracker_t& other) const;

  void bound_encode(size_t& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        p += elem_size * num_au;
      }
    }
  }
  void encode(ceph::buffer::list::contiguous_appender& p) const {
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        size_t elem_size = 0;
        denc_varint((uint32_t)0, elem_size);
        for (size_t i = 0; i < num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }
  void decode(ceph::buffer::ptr::const_iterator& p) {
    clear();
    denc_varint(au_size, p);
    if (au_size) {
      denc_varint(num_au, p);
      if (!num_au) {
        denc_varint(total_bytes, p);
      } else {
        allocate();
        for (size_t i = 0; i < num_au; ++i) {
          denc_varint(bytes_per_au[i], p);
        }
      }
    }
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_blob_use_tracker_t*>& o);
private:
  void allocate();
};
WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
std::ostream& operator<<(std::ostream& out, const bluestore_blob_use_tracker_t& rm);
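
// Tracking sketch (illustrative, values hypothetical): with num_au == 0 the
// tracker keeps a single total_bytes counter for the whole blob; otherwise
// bytes_per_au[] holds one referenced-bytes counter per allocation unit.
//
//   bluestore_blob_use_tracker_t t;
//   t.init(0x10000, 0x10000);  // 64K blob tracked as a single unit
//   t.get(0, 0x8000);          // reference the first 32K
//   // t.get_referenced_bytes() == 0x8000; t.is_empty() == false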
/// blob: a piece of data on disk
struct bluestore_blob_t {
private:
  PExtentVector extents;          ///< raw data position on device
  uint32_t logical_length = 0;    ///< original length of data stored in the blob
  uint32_t compressed_length = 0; ///< compressed length if any

public:
  enum {
    LEGACY_FLAG_MUTABLE = 1, ///< [legacy] blob can be overwritten or split
    FLAG_COMPRESSED = 2,     ///< blob is compressed
    FLAG_CSUM = 4,           ///< blob has checksums
    FLAG_HAS_UNUSED = 8,     ///< blob has unused map
    FLAG_SHARED = 16,        ///< blob is shared; see external SharedBlob
  };
  static std::string get_flags_string(unsigned flags);

  uint32_t flags = 0; ///< FLAG_*

  typedef uint16_t unused_t;
  unused_t unused = 0; ///< portion that has never been written to (bitmap)

  uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_*
  uint8_t csum_chunk_order = 0; ///< csum block size is 1<<block_order bytes

  ceph::buffer::ptr csum_data; ///< opaque vector of csum data

  bluestore_blob_t(uint32_t f = 0) : flags(f) {}
  const PExtentVector& get_extents() const {
    return extents;
  }
  PExtentVector& dirty_extents() {
    return extents;
  }
  DENC_HELPERS;
  void bound_encode(size_t& p, uint64_t struct_v) const {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    denc_varint_lowz(logical_length, p);
    denc_varint_lowz(compressed_length, p);
    denc(csum_type, p);
    denc(csum_chunk_order, p);
    denc_varint(csum_data.length(), p);
    p += csum_data.length();
    p += sizeof(unused_t);
  }

  void encode(ceph::buffer::list::contiguous_appender& p, uint64_t struct_v) const {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      denc_varint(csum_data.length(), p);
      memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
             csum_data.length());
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }

  void decode(ceph::buffer::ptr::const_iterator& p, uint64_t struct_v) {
    ceph_assert(struct_v == 1 || struct_v == 2);
    denc(extents, p);
    denc_varint(flags, p);
    if (is_compressed()) {
      denc_varint_lowz(logical_length, p);
      denc_varint_lowz(compressed_length, p);
    } else {
      logical_length = get_ondisk_length();
    }
    if (has_csum()) {
      denc(csum_type, p);
      denc(csum_chunk_order, p);
      uint32_t len;
      denc_varint(len, p);
      csum_data = p.get_ptr(len);
      csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }
    if (has_unused()) {
      denc(unused, p);
    }
  }
  bool can_split() const {
    return
      !has_flag(FLAG_SHARED) &&
      !has_flag(FLAG_COMPRESSED) &&
      !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
  }
  bool can_split_at(uint32_t blob_offset) const {
    return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_blob_t*>& ls);

  bool has_flag(unsigned f) const {
    return flags & f;
  }
  void set_flag(unsigned f) {
    flags |= f;
  }
  void clear_flag(unsigned f) {
    flags &= ~f;
  }
  std::string get_flags_string() const {
    return get_flags_string(flags);
  }
  void set_compressed(uint64_t clen_orig, uint64_t clen) {
    set_flag(FLAG_COMPRESSED);
    logical_length = clen_orig;
    compressed_length = clen;
  }
  bool is_mutable() const {
    return !is_compressed() && !is_shared();
  }
  bool is_compressed() const {
    return has_flag(FLAG_COMPRESSED);
  }
  bool has_csum() const {
    return has_flag(FLAG_CSUM);
  }
  bool has_unused() const {
    return has_flag(FLAG_HAS_UNUSED);
  }
  bool is_shared() const {
    return has_flag(FLAG_SHARED);
  }

  /// return chunk (i.e. min readable block) size for the blob
  uint64_t get_chunk_size(uint64_t dev_block_size) const {
    return has_csum() ?
      std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size;
  }
  uint32_t get_csum_chunk_size() const {
    return 1 << csum_chunk_order;
  }
  uint32_t get_compressed_payload_length() const {
    return is_compressed() ? compressed_length : 0;
  }
  uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    if (plen)
      *plen = p->length - x_off;
    return p->offset + x_off;
  }
  // validate whether or not the status of pextents within the given range
  // meets the requirement (allocated or unallocated).
  bool _validate_range(uint64_t b_off, uint64_t b_len,
                       bool require_allocated) const {
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (b_off >= p->length) {
      b_off -= p->length;
      if (++p == extents.end())
        return false;
    }
    b_len += b_off;
    while (b_len) {
      if (require_allocated != p->is_valid()) {
        return false;
      }
      if (p->length >= b_len) {
        return true;
      }
      b_len -= p->length;
      if (++p == extents.end())
        return false;
    }
    ceph_abort_msg("we should not get here");
    return false;
  }

  /// return true if the entire range is allocated
  /// (mapped to extents on disk)
  bool is_allocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, true);
  }

  /// return true if the entire range is unallocated
  /// (not mapped to extents on disk)
  bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
    return _validate_range(b_off, b_len, false);
  }
  /// return true if the logical range has never been used
  bool is_unused(uint64_t offset, uint64_t length) const {
    if (!has_unused()) {
      return false;
    }
    ceph_assert(!is_compressed());
    uint64_t blob_len = get_logical_length();
    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
    ceph_assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = offset / chunk_size;
    uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
    auto i = start;
    while (i < end && (unused & (1u << i))) {
      i++;
    }
    return i >= end;
  }

  /// mark a range that has never been used
  void add_unused(uint64_t offset, uint64_t length) {
    ceph_assert(!is_compressed());
    uint64_t blob_len = get_logical_length();
    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
    ceph_assert(offset + length <= blob_len);
    uint64_t chunk_size = blob_len / (sizeof(unused)*8);
    uint64_t start = round_up_to(offset, chunk_size) / chunk_size;
    uint64_t end = (offset + length) / chunk_size;
    for (auto i = start; i < end; ++i) {
      unused |= (1u << i);
    }
    if (start != end) {
      set_flag(FLAG_HAS_UNUSED);
    }
  }

  /// indicate that a range has (now) been used.
  void mark_used(uint64_t offset, uint64_t length) {
    if (has_unused()) {
      ceph_assert(!is_compressed());
      uint64_t blob_len = get_logical_length();
      ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
      ceph_assert(offset + length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(unused)*8);
      uint64_t start = offset / chunk_size;
      uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
        unused &= ~(1u << i);
      }
      if (unused == 0) {
        clear_flag(FLAG_HAS_UNUSED);
      }
    }
  }
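
  // Worked example (illustrative): for a 64K blob, the 16-bit `unused` bitmap
  // gives chunk_size = 0x10000 / 16 = 0x1000, i.e. one bit per 4K chunk.
  // add_unused(0x2000, 0x2000) sets bits 2 and 3; a later
  // mark_used(0x2000, 0x1000) clears bit 2, so is_unused(0x3000, 0x1000)
  // still returns true while is_unused(0x2000, 0x2000) now returns false.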
  // map_f_invoke templates intended to mask parameters which are not expected
  // by the provided callback
  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    uint64_t,
    uint64_t>>::type* = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const{
    return f(o, l);
  }

  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    uint64_t,
    uint64_t,
    uint64_t>>::type* = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const {
    return f(lo, o, l);
  }

  template<class F, typename std::enable_if<std::is_invocable_r_v<
    int,
    F,
    const bluestore_pextent_t&,
    uint64_t,
    uint64_t>>::type* = nullptr>
  int map_f_invoke(uint64_t lo,
                   const bluestore_pextent_t& p,
                   uint64_t o,
                   uint64_t l, F&& f) const {
    return f(p, o, l);
  }

  template<class F>
  int map(uint64_t x_off, uint64_t x_len, F&& f) const {
    auto x_off0 = x_off;
    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    while (x_len > 0 && p != extents.end()) {
      uint64_t l = std::min(p->length - x_off, x_len);
      int r = map_f_invoke(x_off0, *p, p->offset + x_off, l, f);
      if (r < 0)
        return r;
      x_off = 0;
      x_len -= l;
      x_off0 += l;
      ++p;
    }
    return 0;
  }
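
  // Callback sketch (illustrative): thanks to the map_f_invoke overloads
  // above, map() accepts lambdas of several arities and simply masks the
  // parameters a callback does not declare.
  //
  //   blob.map(0, 0x2000, [&](uint64_t offset, uint64_t length) {
  //     // offset/length are device-relative; return 0 to continue
  //     return 0;
  //   });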
  template<class F>
  void map_bl(uint64_t x_off,
              ceph::buffer::list& bl,
              F&& f) const {
    static_assert(std::is_invocable_v<F, uint64_t, ceph::buffer::list&>);

    auto p = extents.begin();
    ceph_assert(p != extents.end());
    while (x_off >= p->length) {
      x_off -= p->length;
      ++p;
      ceph_assert(p != extents.end());
    }
    ceph::buffer::list::iterator it = bl.begin();
    uint64_t x_len = bl.length();
    while (x_len > 0) {
      ceph_assert(p != extents.end());
      uint64_t l = std::min(p->length - x_off, x_len);
      ceph::buffer::list t;
      it.copy(l, t);
      f(p->offset + x_off, t);
      x_off = 0;
      x_len -= l;
      ++p;
    }
  }
  uint32_t get_ondisk_length() const {
    uint32_t len = 0;
    for (auto &p : extents) {
      len += p.length;
    }
    return len;
  }

  uint32_t get_logical_length() const {
    return logical_length;
  }
  size_t get_csum_value_size() const;

  size_t get_csum_count() const {
    size_t vs = get_csum_value_size();
    if (!vs)
      return 0;
    return csum_data.length() / vs;
  }
  uint64_t get_csum_item(unsigned i) const {
    size_t cs = get_csum_value_size();
    const char *p = csum_data.c_str();
    switch (cs) {
    case 0:
      ceph_abort_msg("no csum data, bad index");
    case 1:
      return reinterpret_cast<const uint8_t*>(p)[i];
    case 2:
      return reinterpret_cast<const ceph_le16*>(p)[i];
    case 4:
      return reinterpret_cast<const ceph_le32*>(p)[i];
    case 8:
      return reinterpret_cast<const ceph_le64*>(p)[i];
    default:
      ceph_abort_msg("unrecognized csum word size");
    }
  }
  const char *get_csum_item_ptr(unsigned i) const {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }
  char *get_csum_item_ptr(unsigned i) {
    size_t cs = get_csum_value_size();
    return csum_data.c_str() + (cs * i);
  }

  void init_csum(unsigned type, unsigned order, unsigned len) {
    flags |= FLAG_CSUM;
    csum_type = type;
    csum_chunk_order = order;
    csum_data = ceph::buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
    csum_data.zero();
    csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  }
  /// calculate csum for the buffer at the given b_off
  void calc_csum(uint64_t b_off, const ceph::buffer::list& bl);

  /// verify csum: return -EOPNOTSUPP for unsupported checksum type;
  /// return -1 and valid (nonnegative) b_bad_off for checksum error;
  /// return 0 if all is well.
  int verify_csum(uint64_t b_off, const ceph::buffer::list& bl, int* b_bad_off,
                  uint64_t *bad_csum) const;
  bool can_prune_tail() const {
    return
      extents.size() > 1 && // if it's all invalid it's not pruning.
      !extents.back().is_valid() &&
      !has_unused();
  }
  void prune_tail() {
    const auto &p = extents.back();
    logical_length -= p.length;
    extents.pop_back();
    if (has_csum()) {
      ceph::buffer::ptr t;
      t.swap(csum_data);
      csum_data = ceph::buffer::ptr(t.c_str(),
                                    get_logical_length() / get_csum_chunk_size() *
                                    get_csum_value_size());
    }
  }
  void add_tail(uint32_t new_len) {
    ceph_assert(is_mutable());
    ceph_assert(!has_unused());
    ceph_assert(new_len > logical_length);
    extents.emplace_back(
      bluestore_pextent_t(
        bluestore_pextent_t::INVALID_OFFSET,
        new_len - logical_length));
    logical_length = new_len;
    if (has_csum()) {
      ceph::buffer::ptr t;
      t.swap(csum_data);
      csum_data = ceph::buffer::create(
        get_csum_value_size() * logical_length / get_csum_chunk_size());
      csum_data.copy_in(0, t.length(), t.c_str());
      csum_data.zero(t.length(), csum_data.length() - t.length());
    }
  }
  uint32_t get_release_size(uint32_t min_alloc_size) const {
    if (is_compressed()) {
      return get_logical_length();
    }
    uint32_t res = get_csum_chunk_size();
    if (!has_csum() || res < min_alloc_size) {
      res = min_alloc_size;
    }
    return res;
  }

  void split(uint32_t blob_offset, bluestore_blob_t& rb);
  void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
  void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only

  /// updates blob's pextents container and return unused pextents eligible
  /// for release.
  /// all - indicates that the whole blob to be released.
  /// logical - specifies set of logical extents within blob's
  /// pextents to be released
  /// Returns true if blob has no more valid pextents
  bool release_extents(
    bool all,
    const PExtentVector& logical,
    PExtentVector* r);
};
WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)

std::ostream& operator<<(std::ostream& out, const bluestore_blob_t& o);
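
// Checksum sketch (illustrative, values hypothetical): a blob configured for
// crc32c over 4K chunks stores one 4-byte csum value per chunk in csum_data.
//
//   bluestore_blob_t b;
//   b.init_csum(Checksummer::CSUM_CRC32C, 12, 0x10000); // order 12 -> 4K chunks
//   // b.get_csum_chunk_size() == 0x1000, b.get_csum_count() == 16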
/// shared blob state
struct bluestore_shared_blob_t {
  MEMPOOL_CLASS_HELPERS();
  uint64_t sbid;                      ///< shared blob id
  bluestore_extent_ref_map_t ref_map; ///< shared blob extents

  bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
  bluestore_shared_blob_t(uint64_t _sbid,
                          bluestore_extent_ref_map_t&& _ref_map)
    : sbid(_sbid), ref_map(std::move(_ref_map)) {}

  DENC(bluestore_shared_blob_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.ref_map, p);
    DENC_FINISH(p);
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_shared_blob_t*>& ls);

  bool empty() const {
    return ref_map.empty();
  }
};
WRITE_CLASS_DENC(bluestore_shared_blob_t)

std::ostream& operator<<(std::ostream& out, const bluestore_shared_blob_t& o);
/// onode: per-object metadata
struct bluestore_onode_t {
  uint64_t nid = 0;  ///< numeric id (locally unique)
  uint64_t size = 0; ///< object size
  // mempool to be assigned to buffer::ptr manually
  std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs;

  struct shard_info {
    uint32_t offset = 0; ///< logical offset for start of shard
    uint32_t bytes = 0;  ///< encoded bytes
    DENC(shard_info, v, p) {
      denc_varint(v.offset, p);
      denc_varint(v.bytes, p);
    }
    void dump(ceph::Formatter *f) const;
  };
  std::vector<shard_info> extent_map_shards; ///< extent map shards (if any)

  uint32_t expected_object_size = 0;
  uint32_t expected_write_size = 0;
  uint32_t alloc_hint_flags = 0;

  uint8_t flags = 0;

  std::map<uint32_t, uint64_t> zone_offset_refs; ///< (zone, offset) refs to this onode

  enum {
    FLAG_OMAP = 1,         ///< object may have omap data
    FLAG_PGMETA_OMAP = 2,  ///< omap data is in meta omap prefix
    FLAG_PERPOOL_OMAP = 4, ///< omap data is in per-pool prefix; per-pool keys
    FLAG_PERPG_OMAP = 8,   ///< omap data is in per-pg prefix; per-pg keys
  };

  std::string get_flags_string() const {
    std::string s;
    if (flags & FLAG_OMAP) {
      s = "omap";
    }
    if (flags & FLAG_PGMETA_OMAP) {
      s += "+pgmeta_omap";
    }
    if (flags & FLAG_PERPOOL_OMAP) {
      s += "+per_pool_omap";
    }
    if (flags & FLAG_PERPG_OMAP) {
      s += "+per_pg_omap";
    }
    return s;
  }

  bool has_flag(unsigned f) const {
    return flags & f;
  }
  void set_flag(unsigned f) {
    flags |= f;
  }
  void clear_flag(unsigned f) {
    flags &= ~f;
  }
  bool has_omap() const {
    return has_flag(FLAG_OMAP);
  }

  static bool is_pgmeta_omap(uint8_t flags) {
    return flags & FLAG_PGMETA_OMAP;
  }
  static bool is_perpool_omap(uint8_t flags) {
    return flags & FLAG_PERPOOL_OMAP;
  }
  static bool is_perpg_omap(uint8_t flags) {
    return flags & FLAG_PERPG_OMAP;
  }
  bool is_pgmeta_omap() const {
    return has_flag(FLAG_PGMETA_OMAP);
  }
  bool is_perpool_omap() const {
    return has_flag(FLAG_PERPOOL_OMAP);
  }
  bool is_perpg_omap() const {
    return has_flag(FLAG_PERPG_OMAP);
  }

  void set_omap_flags(bool legacy) {
    set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP)));
  }
  void set_omap_flags_pgmeta() {
    set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP);
  }

  void clear_omap_flag() {
    clear_flag(FLAG_OMAP |
               FLAG_PGMETA_OMAP |
               FLAG_PERPOOL_OMAP |
               FLAG_PERPG_OMAP);
  }

  DENC(bluestore_onode_t, v, p) {
    DENC_START(2, 1, p);
    denc_varint(v.nid, p);
    denc_varint(v.size, p);
    denc(v.attrs, p);
    denc(v.flags, p);
    denc(v.extent_map_shards, p);
    denc_varint(v.expected_object_size, p);
    denc_varint(v.expected_write_size, p);
    denc_varint(v.alloc_hint_flags, p);
    if (struct_v >= 2) {
      denc(v.zone_offset_refs, p);
    }
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_onode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
WRITE_CLASS_DENC(bluestore_onode_t)

std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);
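
// Flag usage sketch (illustrative): new (non-legacy) onodes record omap data
// under per-pool/per-pg key prefixes, while pg metadata objects keep the
// dedicated pgmeta prefix.
//
//   bluestore_onode_t o;
//   o.set_omap_flags(false);  // FLAG_OMAP | FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP
//   // o.get_flags_string() == "omap+per_pool_omap+per_pg_omap"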
/// writeahead-logged op
struct bluestore_deferred_op_t {
  typedef enum {
    OP_WRITE = 1,
  } type_t;
  __u8 op = 0;

  PExtentVector extents;
  ceph::buffer::list data;

  DENC(bluestore_deferred_op_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.op, p);
    denc(v.extents, p);
    denc(v.data, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_deferred_op_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_op_t)
/// writeahead-logged transaction
struct bluestore_deferred_transaction_t {
  uint64_t seq = 0;
  std::list<bluestore_deferred_op_t> ops;
  interval_set<uint64_t> released; ///< allocations to release after tx

  bluestore_deferred_transaction_t() : seq(0) {}

  DENC(bluestore_deferred_transaction_t, v, p) {
    DENC_START(1, 1, p);
    denc(v.seq, p);
    denc(v.ops, p);
    denc(v.released, p);
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_deferred_transaction_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
struct bluestore_compression_header_t {
  uint8_t type = Compressor::COMP_ALG_NONE;
  uint32_t length = 0;
  boost::optional<int32_t> compressor_message;

  bluestore_compression_header_t() {}
  bluestore_compression_header_t(uint8_t _type)
    : type(_type) {}

  DENC(bluestore_compression_header_t, v, p) {
    DENC_START(2, 1, p);
    denc(v.type, p);
    denc(v.length, p);
    if (struct_v >= 2) {
      denc(v.compressor_message, p);
    }
    DENC_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<bluestore_compression_header_t*>& o);
};
WRITE_CLASS_DENC(bluestore_compression_header_t)
template <template <typename> typename V, class COUNTER_TYPE = int32_t>
class ref_counter_2hash_tracker_t {
  size_t num_non_zero = 0;
  size_t num_buckets = 0;
  V<COUNTER_TYPE> buckets1;
  V<COUNTER_TYPE> buckets2;

public:
  ref_counter_2hash_tracker_t(uint64_t mem_cap) {
    num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2;
    ceph_assert(num_buckets);
    buckets1.resize(num_buckets);
    buckets2.resize(num_buckets);
    reset();
  }

  size_t get_num_buckets() const {
    return num_buckets;
  }

  void inc(const char* hash_val, size_t hash_val_len, int n) {
    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len) %
      num_buckets;
    if (buckets1[h] == 0 && n) {
      ++num_non_zero;
    } else if (buckets1[h] == -n) {
      --num_non_zero;
    }
    buckets1[h] += n;
    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len) % num_buckets;
    if (buckets2[h] == 0 && n) {
      ++num_non_zero;
    } else if (buckets2[h] == -n) {
      --num_non_zero;
    }
    buckets2[h] += n;
  }

  bool test_hash_conflict(
    const char* hash_val1,
    const char* hash_val2,
    size_t hash_val_len) const {
    auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len);
    auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len);
    auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len);
    auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len);
    return ((h1 % num_buckets) == (h2 % num_buckets)) &&
      ((h3 % num_buckets) == (h4 % num_buckets));
  }

  bool test_all_zero(const char* hash_val, size_t hash_val_len) const {
    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len);
    if (buckets1[h % num_buckets] != 0) {
      return false;
    }
    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len);
    return buckets2[h % num_buckets] == 0;
  }

  // returns number of mismatching buckets
  size_t count_non_zero() const {
    return num_non_zero;
  }
  void reset() {
    for (size_t i = 0; i < num_buckets; i++) {
      buckets1[i] = 0;
      buckets2[i] = 0;
    }
    num_non_zero = 0;
  }
};
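
// Design note (illustrative, names hypothetical): the tracker keeps
// approximate refcounts in two bucket arrays indexed by independent hashes
// (rjenkins and linux). A balanced set of inc(+n)/inc(-n) calls for a key
// drives both of its buckets back to zero, so count_non_zero() == 0 strongly
// suggests all references matched, without storing any per-key state; a
// non-zero result flags a mismatch (modulo hash collisions).
//
//   ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> t(1 << 20);
//   t.inc(key, key_len, 1);   // expected reference
//   t.inc(key, key_len, -1);  // observed reference
//   // t.count_non_zero() == 0 -> expectations met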
class shared_blob_2hash_tracker_t
  : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {

  static const size_t hash_input_len = 3;

  typedef std::array<uint64_t, hash_input_len> hash_input_t;

  static size_t get_hash_input_size() {
    return hash_input_len * sizeof(hash_input_t::value_type);
  }

  inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;

  size_t au_void_bits = 0;

public:
  shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
    : ref_counter_2hash_tracker_t(mem_cap) {
    ceph_assert(alloc_unit);
    ceph_assert(isp2(alloc_unit));
    au_void_bits = ctz(alloc_unit);
  }
  void inc(uint64_t sbid, uint64_t offset, int n);
  void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n);

  bool test_hash_conflict(
    uint64_t sbid1,
    uint64_t offset1,
    uint64_t sbid2,
    uint64_t offset2) const;
  bool test_all_zero(
    uint64_t sbid,
    uint64_t offset) const;
  bool test_all_zero_range(
    uint64_t sbid,
    uint64_t offset,
    uint32_t len) const;
};
struct sb_info_t {
  // subzero value indicates (potentially) stray blob,
  // i.e. blob that has got no real references from onodes
  int64_t sbid = 0;

  enum {
    INVALID_POOL_ID = INT64_MIN
  };

  int64_t pool_id = INVALID_POOL_ID;
  // subzero value indicates compressed_allocated as well
  int32_t allocated_chunks = 0;

  sb_info_t(int64_t _sbid = 0) : sbid(_sbid)
  {}
  bool operator< (const sb_info_t& other) const {
    return std::abs(sbid) < std::abs(other.sbid);
  }
  bool operator< (const uint64_t& other_sbid) const {
    return uint64_t(std::abs(sbid)) < other_sbid;
  }
  bool is_stray() const {
    return sbid < 0;
  }
  uint64_t get_sbid() const {
    return uint64_t(std::abs(sbid));
  }
  void adopt() {
    sbid = std::abs(sbid);
  }
} __attribute__((packed));
// Space-efficient container to keep a set of sb_info structures
// given that the majority of entries are appended in a proper id-sorted
// order. Hence one can keep them in a regular vector and apply binary search
// whenever specific entry to be found.
// For the rare occasions when out-of-order append takes place - an auxiliary
// regular map is used.
struct sb_info_space_efficient_map_t {
  // large array sorted by the user
  mempool::bluestore_fsck::vector<sb_info_t> items;
  // small additional set of items we maintain sorting ourselves
  // this would never keep an entry with id > items.back().id
  mempool::bluestore_fsck::vector<sb_info_t> aux_items;

  sb_info_t& add_maybe_stray(uint64_t sbid) {
    return _add(-int64_t(sbid));
  }
  sb_info_t& add_or_adopt(uint64_t sbid) {
    auto& r = _add(sbid);
    r.adopt();
    return r;
  }
  auto find(uint64_t id) {
    if (items.size() != 0) {
      auto it = std::lower_bound(
        items.begin(),
        items.end(),
        id,
        [](const sb_info_t& a, const uint64_t& b) {
          return a.get_sbid() < b;
        });
      if (it != items.end() &&
          it->get_sbid() == id) {
        return it;
      }
    }
    if (aux_items.size() != 0) {
      auto it = std::lower_bound(
        aux_items.begin(),
        aux_items.end(),
        id,
        [](const sb_info_t& a, const uint64_t& b) {
          return a.get_sbid() < b;
        });
      if (it != aux_items.end() &&
          it->get_sbid() == id) {
        return it;
      }
    }
    return items.end();
  }
  // enumerates strays, order isn't guaranteed.
  void foreach_stray(std::function<void(const sb_info_t&)> cb) {
    for (auto& sbi : items) {
      if (sbi.is_stray()) {
        cb(sbi);
      }
    }
    for (auto& sbi : aux_items) {
      if (sbi.is_stray()) {
        cb(sbi);
      }
    }
  }
  auto end() {
    return items.end();
  }

  void shrink() {
    items.shrink_to_fit();
    aux_items.shrink_to_fit();
  }
  void clear() {
    items.clear();
    aux_items.clear();
    shrink();
  }
private:
  sb_info_t& _add(int64_t id) {
    uint64_t n_id = uint64_t(std::abs(id));
    if (items.size() == 0 || n_id > items.back().get_sbid()) {
      return items.emplace_back(id);
    }
    auto it = find(n_id);
    if (it != items.end()) {
      return *it;
    }
    if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
      return aux_items.emplace_back(id);
    }
    // do sorted insertion, may be expensive!
    it = std::upper_bound(
      aux_items.begin(),
      aux_items.end(),
      n_id,
      [](const uint64_t& a, const sb_info_t& b) {
        return a < b.get_sbid();
      });
    return *aux_items.emplace(it, id);
  }
};