// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_OSD_BLUESTORE_H
#define CEPH_OSD_BLUESTORE_H

#include <condition_variable>

#include <boost/intrusive/list.hpp>
#include <boost/intrusive/unordered_set.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/functional/hash.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/circular_buffer.hpp>

#include "include/cpp-btree/btree_set.h"

#include "include/ceph_assert.h"
#include "include/unordered_map.h"
#include "include/mempool.h"
#include "include/hash.h"
#include "common/bloom_filter.hpp"
#include "common/Finisher.h"
#include "common/ceph_mutex.h"
#include "common/Throttle.h"
#include "common/perf_counters.h"
#include "common/PriorityCache.h"
#include "compressor/Compressor.h"
#include "os/ObjectStore.h"

#include "bluestore_types.h"
#include "BlockDevice.h"
#include "BlueFS.h"
#include "common/EventTrace.h"
class Allocator;
class FreelistManager;
class BlueStoreRepairer;

//#define DEBUG_DEFERRED

// constants for Buffer::optimize()
#define MAX_BUFFER_SLOP_RATIO_DEN 8  // so actually 1/N
enum {
  l_bluestore_first = 732430,
  l_bluestore_kv_flush_lat,
  l_bluestore_kv_commit_lat,
  l_bluestore_kv_sync_lat,
  l_bluestore_kv_final_lat,
  l_bluestore_state_prepare_lat,
  l_bluestore_state_aio_wait_lat,
  l_bluestore_state_io_done_lat,
  l_bluestore_state_kv_queued_lat,
  l_bluestore_state_kv_committing_lat,
  l_bluestore_state_kv_done_lat,
  l_bluestore_state_deferred_queued_lat,
  l_bluestore_state_deferred_aio_wait_lat,
  l_bluestore_state_deferred_cleanup_lat,
  l_bluestore_state_finishing_lat,
  l_bluestore_state_done_lat,
  l_bluestore_throttle_lat,
  l_bluestore_submit_lat,
  l_bluestore_commit_lat,
  l_bluestore_read_lat,
  l_bluestore_read_onode_meta_lat,
  l_bluestore_read_wait_aio_lat,
  l_bluestore_compress_lat,
  l_bluestore_decompress_lat,
  l_bluestore_csum_lat,
  l_bluestore_compress_success_count,
  l_bluestore_compress_rejected_count,
  l_bluestore_write_pad_bytes,
  l_bluestore_deferred_write_ops,
  l_bluestore_deferred_write_bytes,
  l_bluestore_write_penalty_read_ops,
  l_bluestore_allocated,
  l_bluestore_stored,
  l_bluestore_compressed,
  l_bluestore_compressed_allocated,
  l_bluestore_compressed_original,
  l_bluestore_onodes,
  l_bluestore_pinned_onodes,
  l_bluestore_onode_hits,
  l_bluestore_onode_misses,
  l_bluestore_onode_shard_hits,
  l_bluestore_onode_shard_misses,
  l_bluestore_extents,
  l_bluestore_blobs,
  l_bluestore_buffers,
  l_bluestore_buffer_bytes,
  l_bluestore_buffer_hit_bytes,
  l_bluestore_buffer_miss_bytes,
  l_bluestore_write_big,
  l_bluestore_write_big_bytes,
  l_bluestore_write_big_blobs,
  l_bluestore_write_small,
  l_bluestore_write_small_bytes,
  l_bluestore_write_small_unused,
  l_bluestore_write_small_deferred,
  l_bluestore_write_small_pre_read,
  l_bluestore_write_small_new,
  l_bluestore_txc,
  l_bluestore_onode_reshard,
  l_bluestore_blob_split,
  l_bluestore_extent_compress,
  l_bluestore_gc_merged,
  l_bluestore_read_eio,
  l_bluestore_reads_with_retries,
  l_bluestore_fragmentation,
  l_bluestore_omap_seek_to_first_lat,
  l_bluestore_omap_upper_bound_lat,
  l_bluestore_omap_lower_bound_lat,
  l_bluestore_omap_next_lat,
  l_bluestore_omap_get_keys_lat,
  l_bluestore_omap_get_values_lat,
  l_bluestore_clist_lat,
  l_bluestore_remove_lat,
  l_bluestore_last
};
#define META_POOL_ID ((uint64_t)-1ull)

class BlueStore : public ObjectStore,
                  public BlueFSDeviceExpander,
                  public md_config_obs_t {
  // -----------------------------------------------------
  // types
public:
  // config observer
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string>& changed) override;

  // handler for discard event
  void handle_discard(interval_set<uint64_t>& to_release);

  void _set_compression();
  void _set_throttle_params();
  int _set_cache_sizes();
  void _set_max_defer_interval() {
    max_defer_interval =
      cct->_conf.get_val<double>("bluestore_max_defer_interval");
  }
  typedef map<uint64_t, bufferlist> ready_regions_t;

  struct BufferSpace;
  struct Collection;
  typedef boost::intrusive_ptr<Collection> CollectionRef;

  struct AioContext {
    virtual void aio_finish(BlueStore *store) = 0;
    virtual ~AioContext() {}
  };
  /// cached buffer
  struct Buffer {
    MEMPOOL_CLASS_HELPERS();

    enum {
      STATE_EMPTY,    ///< empty buffer -- used for cache history
      STATE_CLEAN,    ///< clean data that is up to date
      STATE_WRITING,  ///< data that is being written (io not yet complete)
    };
    static const char *get_state_name(int s) {
      switch (s) {
      case STATE_EMPTY: return "empty";
      case STATE_CLEAN: return "clean";
      case STATE_WRITING: return "writing";
      default: return "???";
      }
    }
    enum {
      FLAG_NOCACHE = 1,  ///< trim when done WRITING (do not become CLEAN)
      // NOTE: fix operator<< when you define a second flag
    };
    static const char *get_flag_name(int s) {
      switch (s) {
      case FLAG_NOCACHE: return "nocache";
      default: return "???";
      }
    }

    BufferSpace *space;
    uint16_t state;              ///< STATE_*
    uint16_t cache_private = 0;  ///< opaque (to us) value used by Cache impl
    uint32_t flags;              ///< FLAG_*
    uint64_t seq;
    uint32_t offset, length;
    bufferlist data;

    boost::intrusive::list_member_hook<> lru_item;
    boost::intrusive::list_member_hook<> state_item;

    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o),
        length(b.length()), data(b) {}

    bool is_empty() const {
      return state == STATE_EMPTY;
    }
    bool is_clean() const {
      return state == STATE_CLEAN;
    }
    bool is_writing() const {
      return state == STATE_WRITING;
    }

    uint32_t end() const {
      return offset + length;
    }

    void truncate(uint32_t newlen) {
      ceph_assert(newlen < length);
      if (data.length()) {
        bufferlist t;
        t.substr_of(data, 0, newlen);
        data.claim(t);
      }
      length = newlen;
    }
    void maybe_rebuild() {
      if (data.length() &&
          (data.get_num_buffers() > 1 ||
           data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
        data.rebuild();
      }
    }
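    // Illustration (not from the original source): with
    // MAX_BUFFER_SLOP_RATIO_DEN == 8, a 4096-byte buffer is rebuilt once
    // its front segment wastes more than 4096/8 == 512 bytes, or as soon
    // as the data is fragmented across more than one segment.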
    void dump(Formatter *f) const {
      f->dump_string("state", get_state_name(state));
      f->dump_unsigned("seq", seq);
      f->dump_unsigned("offset", offset);
      f->dump_unsigned("length", length);
      f->dump_unsigned("data_length", data.length());
    }
  };
  struct BufferCacheShard;

  /// map logical extent range (object) onto buffers
  struct BufferSpace {
    enum {
      BYPASS_CLEAN_CACHE = 0x1,  // bypass clean cache
    };

    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
        Buffer,
        boost::intrusive::list_member_hook<>,
        &Buffer::state_item> > state_list_t;
    mempool::bluestore_cache_meta::map<uint32_t, std::unique_ptr<Buffer>>
      buffer_map;

    // we use a bare intrusive list here instead of std::map because
    // it uses less memory and we expect this to be very small (very
    // few IOs in flight to the same Blob at the same time).
    state_list_t writing;  ///< writing buffers, sorted by seq, ascending

    ~BufferSpace() {
      ceph_assert(buffer_map.empty());
      ceph_assert(writing.empty());
    }
    void _add_buffer(BufferCacheShard* cache, Buffer *b, int level, Buffer *near) {
      cache->_audit("_add_buffer start");
      buffer_map[b->offset].reset(b);
      if (b->is_writing()) {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
        if (writing.empty() || writing.rbegin()->seq <= b->seq) {
          writing.push_back(*b);
        } else {
          auto it = writing.begin();
          while (it->seq < b->seq) {
            ++it;
          }

          ceph_assert(it->seq >= b->seq);
          // note that this will insert b before it
          // hence the order is maintained
          writing.insert(it, *b);
        }
      } else {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
        cache->_add(b, level, near);
      }
      cache->_audit("_add_buffer end");
    }
    void _rm_buffer(BufferCacheShard* cache, Buffer *b) {
      _rm_buffer(cache, buffer_map.find(b->offset));
    }
    void _rm_buffer(BufferCacheShard* cache,
                    map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
      ceph_assert(p != buffer_map.end());
      cache->_audit("_rm_buffer start");
      if (p->second->is_writing()) {
        writing.erase(writing.iterator_to(*p->second));
      } else {
        cache->_rm(p->second.get());
      }
      buffer_map.erase(p);
      cache->_audit("_rm_buffer end");
    }
    map<uint32_t, std::unique_ptr<Buffer>>::iterator _data_lower_bound(
      uint32_t offset) {
      auto i = buffer_map.lower_bound(offset);
      if (i != buffer_map.begin()) {
        --i;
        if (i->first + i->second->length <= offset)
          ++i;
      }
      return i;
    }
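    // Sketch of the intent (illustrative values, not from the source):
    // with buffers at offsets {0 (len 4096), 8192 (len 4096)},
    // _data_lower_bound(8000) returns the buffer at 8192, while
    // _data_lower_bound(2048) steps back to the buffer at 0 because that
    // buffer still overlaps the queried offset.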
    // must be called under protection of the Cache lock
    void _clear(BufferCacheShard* cache);

    // return value is the highest cache_private of a trimmed buffer, or 0.
    int discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) {
      std::lock_guard l(cache->lock);
      int ret = _discard(cache, offset, length);
      cache->_trim();
      return ret;
    }
    int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length);
    void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset,
               bufferlist& bl, unsigned flags) {
      std::lock_guard l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
                             flags);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
      cache->_trim();
    }
    void _finish_write(BufferCacheShard* cache, uint64_t seq);
    void did_read(BufferCacheShard* cache, uint32_t offset, bufferlist& bl) {
      std::lock_guard l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, 1, nullptr);
      cache->_trim();
    }
    void read(BufferCacheShard* cache, uint32_t offset, uint32_t length,
              BlueStore::ready_regions_t& res,
              interval_set<uint32_t>& res_intervals,
              int flags = 0);

    void truncate(BufferCacheShard* cache, uint32_t offset) {
      discard(cache, offset, (uint32_t)-1 - offset);
    }
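    // note: the (uint32_t)-1 - offset length simply means "discard
    // everything from offset to the end of the 32-bit offset space".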
    void split(BufferCacheShard* cache, size_t pos, BufferSpace &r);

    void dump(BufferCacheShard* cache, Formatter *f) const {
      std::lock_guard l(cache->lock);
      f->open_array_section("buffers");
      for (auto& i : buffer_map) {
        f->open_object_section("buffer");
        ceph_assert(i.first == i.second->offset);
        i.second->dump(f);
        f->close_section();
      }
      f->close_section();
    }
  };
  struct SharedBlobSet;

  /// in-memory shared blob state (incl cached buffers)
  struct SharedBlob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};  ///< reference count
    bool loaded = false;

    CollectionRef coll;
    union {
      uint64_t sbid_unloaded;               ///< sbid if persistent isn't loaded
      bluestore_shared_blob_t *persistent;  ///< persistent part of the shared blob if any
    };
    BufferSpace bc;  ///< buffer cache

    SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
      if (get_cache()) {
        get_cache()->add_blob();
      }
    }
    SharedBlob(uint64_t i, Collection *_coll);
    ~SharedBlob();

    uint64_t get_sbid() const {
      return loaded ? persistent->sbid : sbid_unloaded;
    }

    friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
    friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }

    void dump(Formatter* f) const;
    friend ostream& operator<<(ostream& out, const SharedBlob& sb);
    void get() {
      ++nref;
    }
    void put();

    /// get logical references
    void get_ref(uint64_t offset, uint32_t length);

    /// put logical references, and get back any released extents
    void put_ref(uint64_t offset, uint32_t length,
                 PExtentVector *r, bool *unshare);

    void finish_write(uint64_t seq);

    friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
      return l.get_sbid() == r.get_sbid();
    }
    inline BufferCacheShard* get_cache() {
      return coll ? coll->cache : nullptr;
    }
    inline SharedBlobSet* get_parent() {
      return coll ? &(coll->shared_blob_set) : nullptr;
    }
    inline bool is_loaded() const {
      return loaded;
    }
  };
  typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
  /// a lookup table of SharedBlobs
  struct SharedBlobSet {
    /// protect lookup, insertion, removal
    ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");

    // we use a bare pointer because we don't want to affect the ref
    // count
    mempool::bluestore_cache_meta::unordered_map<uint64_t,SharedBlob*> sb_map;

    SharedBlobRef lookup(uint64_t sbid) {
      std::lock_guard l(lock);
      auto p = sb_map.find(sbid);
      if (p == sb_map.end() ||
          p->second->nref == 0) {
        return nullptr;
      }
      return p->second;
    }

    void add(Collection* coll, SharedBlob *sb) {
      std::lock_guard l(lock);
      sb_map[sb->get_sbid()] = sb;
      sb->coll = coll;
    }

    bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
      std::lock_guard l(lock);
      ceph_assert(sb->get_parent() == this);
      if (verify_nref_is_zero && sb->nref != 0) {
        return false;
      }
      // only remove if it still points to us
      auto p = sb_map.find(sb->get_sbid());
      if (p != sb_map.end() &&
          p->second == sb) {
        sb_map.erase(p);
      }
      return true;
    }

    bool empty() {
      std::lock_guard l(lock);
      return sb_map.empty();
    }

    template <int LogLevelV>
    void dump(CephContext *cct);
  };
//#define CACHE_BLOB_BL  // not sure if this is a win yet or not... :/

  /// in-memory blob metadata and associated cached buffers (if any)
  struct Blob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};    ///< reference count
    int16_t id = -1;               ///< id, for spanning blobs only, >= 0
    int16_t last_encoded_id = -1;  ///< (ephemeral) used during encoding only
    SharedBlobRef shared_blob;     ///< shared blob state (if any)

  private:
    mutable bluestore_blob_t blob;  ///< decoded blob metadata
#ifdef CACHE_BLOB_BL
    mutable bufferlist blob_bl;  ///< cached encoded blob, blob is dirty if empty
#endif
    /// refs from this shard.  ephemeral if id<0, persisted if spanning.
    bluestore_blob_use_tracker_t used_in_blob;

  public:

    friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
    friend void intrusive_ptr_release(Blob *b) { b->put(); }

    void dump(Formatter* f) const;
    friend ostream& operator<<(ostream& out, const Blob &b);

    const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
      return used_in_blob;
    }
    bool is_referenced() const {
      return used_in_blob.is_not_empty();
    }
    uint32_t get_referenced_bytes() const {
      return used_in_blob.get_referenced_bytes();
    }

    bool is_spanning() const {
      return id >= 0;
    }

    bool can_split() const {
      std::lock_guard l(shared_blob->get_cache()->lock);
      // splitting a BufferSpace writing list is too hard; don't try.
      return shared_blob->bc.writing.empty() &&
             used_in_blob.can_split() &&
             get_blob().can_split();
    }

    bool can_split_at(uint32_t blob_offset) const {
      return used_in_blob.can_split_at(blob_offset) &&
             get_blob().can_split_at(blob_offset);
    }

    bool can_reuse_blob(uint32_t min_alloc_size,
                        uint32_t target_blob_size,
                        uint32_t b_offset,
                        uint32_t *length0);

    void dup(Blob& o) {
      o.shared_blob = shared_blob;
      o.blob = blob;
#ifdef CACHE_BLOB_BL
      o.blob_bl = blob_bl;
#endif
    }
    inline const bluestore_blob_t& get_blob() const {
      return blob;
    }
    inline bluestore_blob_t& dirty_blob() {
#ifdef CACHE_BLOB_BL
      blob_bl.clear();
#endif
      return blob;
    }

    /// discard buffers for unallocated regions
    void discard_unallocated(Collection *coll);

    /// get logical references
    void get_ref(Collection *coll, uint32_t offset, uint32_t length);
    /// put logical references, and get back any released extents
    bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
                 PExtentVector *r);

    /// split the blob
    void split(Collection *coll, uint32_t blob_offset, Blob *o);

    void get() {
      ++nref;
    }
    void put() {
      if (--nref == 0)
        delete this;
    }

#ifdef CACHE_BLOB_BL
    void _encode() const {
      if (blob_bl.length() == 0) {
        encode(blob, blob_bl);
      } else {
        ceph_assert(blob_bl.length());
      }
    }
    void bound_encode(size_t& p,
                      bool include_ref_map) const {
      _encode();
      p += blob_bl.length();
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      bufferlist::contiguous_appender& p,
      bool include_ref_map) const {
      _encode();
      p.append(blob_bl);
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection */*coll*/,
      bufferptr::const_iterator& p,
      bool include_ref_map) {
      const char *start = p.get_pos();
      denc(blob, p);
      const char *end = p.get_pos();
      blob_bl.clear();
      blob_bl.append(start, end - start);
      if (include_ref_map) {
        used_in_blob.decode(p);
      }
    }
#else
    void bound_encode(
      size_t& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      bufferlist::contiguous_appender& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection *coll,
      bufferptr::const_iterator& p,
      uint64_t struct_v,
      uint64_t* sbid,
      bool include_ref_map);
#endif
  };
  typedef boost::intrusive_ptr<Blob> BlobRef;
  typedef mempool::bluestore_cache_meta::map<int,BlobRef> blob_map_t;
  /// a logical extent, pointing to (some portion of) a blob
  typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> >
    ExtentBase;  // making an alias to avoid build warnings
  struct Extent : public ExtentBase {
    MEMPOOL_CLASS_HELPERS();

    uint32_t logical_offset = 0;  ///< logical offset
    uint32_t blob_offset = 0;     ///< blob offset
    uint32_t length = 0;          ///< length
    BlobRef blob;                 ///< the blob with our data

    /// ctor for lookup only
    explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
    /// ctor for delayed initialization (see decode_some())
    explicit Extent() : ExtentBase() {
    }
    /// ctor for general usage
    Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : ExtentBase(),
        logical_offset(lo), blob_offset(o), length(l) {
      assign_blob(b);
    }
    ~Extent() {
      if (blob) {
        blob->shared_blob->get_cache()->rm_extent();
      }
    }

    void dump(Formatter* f) const;

    void assign_blob(const BlobRef& b) {
      ceph_assert(!blob);
      blob = b;
      blob->shared_blob->get_cache()->add_extent();
    }

    // comparators for intrusive_set
    friend bool operator<(const Extent &a, const Extent &b) {
      return a.logical_offset < b.logical_offset;
    }
    friend bool operator>(const Extent &a, const Extent &b) {
      return a.logical_offset > b.logical_offset;
    }
    friend bool operator==(const Extent &a, const Extent &b) {
      return a.logical_offset == b.logical_offset;
    }

    uint32_t blob_start() const {
      return logical_offset - blob_offset;
    }

    uint32_t blob_end() const {
      return blob_start() + blob->get_blob().get_logical_length();
    }

    uint32_t logical_end() const {
      return logical_offset + length;
    }

    // return true if any piece of the blob is out of
    // the given range [o, o + l].
    bool blob_escapes_range(uint32_t o, uint32_t l) const {
      return blob_start() < o || blob_end() > o + l;
    }
  };
  typedef boost::intrusive::set<Extent> extent_map_t;

  friend ostream& operator<<(ostream& out, const Extent& e);
  struct OldExtent {
    boost::intrusive::list_member_hook<> old_extent_item;
    Extent e;
    PExtentVector r;
    bool blob_empty;  // flag to track the last removed extent that makes blob
                      // empty - required to update compression stat properly
    OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : e(lo, o, l, b), blob_empty(false) {
    }
    static OldExtent* create(CollectionRef c,
                             uint32_t lo, uint32_t o, uint32_t l, BlobRef& b);
  };
  typedef boost::intrusive::list<
    OldExtent,
    boost::intrusive::member_hook<
      OldExtent,
      boost::intrusive::list_member_hook<>,
      &OldExtent::old_extent_item> > old_extent_map_t;
  struct Onode;

  /// a sharded extent map, mapping offsets to lextents to blobs
  struct ExtentMap {
    Onode *onode;
    extent_map_t extent_map;       ///< map of Extents to Blobs
    blob_map_t spanning_blob_map;  ///< blobs that span shards
    typedef boost::intrusive_ptr<Onode> OnodeRef;

    struct Shard {
      bluestore_onode_t::shard_info *shard_info = nullptr;
      unsigned extents = 0;  ///< count extents in this shard
      bool loaded = false;   ///< true if shard is loaded
      bool dirty = false;    ///< true if shard is dirty and needs reencoding
    };
    mempool::bluestore_cache_meta::vector<Shard> shards;  ///< shards

    bufferlist inline_bl;  ///< cached encoded map, if unsharded; empty=>dirty

    uint32_t needs_reshard_begin = 0;
    uint32_t needs_reshard_end = 0;

    void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
             uint64_t&, uint64_t&, uint64_t&);
    bool needs_reshard() const {
      return needs_reshard_end > needs_reshard_begin;
    }
    void clear_needs_reshard() {
      needs_reshard_begin = needs_reshard_end = 0;
    }
    void request_reshard(uint32_t begin, uint32_t end) {
      if (begin < needs_reshard_begin) {
        needs_reshard_begin = begin;
      }
      if (end > needs_reshard_end) {
        needs_reshard_end = end;
      }
    }

    struct DeleteDisposer {
      void operator()(Extent *e) { delete e; }
    };

    ExtentMap(Onode *o);
    ~ExtentMap() {
      extent_map.clear_and_dispose(DeleteDisposer());
    }

    void clear() {
      extent_map.clear_and_dispose(DeleteDisposer());
      shards.clear();
      inline_bl.clear();
      clear_needs_reshard();
    }
    void dump(Formatter* f) const;

    bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
                     unsigned *pn);
    unsigned decode_some(bufferlist& bl);

    void bound_encode_spanning_blobs(size_t& p);
    void encode_spanning_blobs(bufferlist::contiguous_appender& p);
    void decode_spanning_blobs(bufferptr::const_iterator& p);

    BlobRef get_spanning_blob(int id) {
      auto p = spanning_blob_map.find(id);
      ceph_assert(p != spanning_blob_map.end());
      return p->second;
    }

    void update(KeyValueDB::Transaction t, bool force);
    decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
    void reshard(
      KeyValueDB *db,
      KeyValueDB::Transaction t);
    /// initialize Shards from the onode
    void init_shards(bool loaded, bool dirty);

    /// return index of shard containing offset
    /// or -1 if not found
    int seek_shard(uint32_t offset) {
      size_t end = shards.size();
      size_t mid, left = 0;
      size_t right = end;  // one past the right end

      while (left < right) {
        mid = left + (right - left) / 2;
        if (offset >= shards[mid].shard_info->offset) {
          size_t next = mid + 1;
          if (next >= end || offset < shards[next].shard_info->offset)
            return mid;
          // continue to search forwards
          left = next;
        } else {
          // continue to search backwards
          right = mid;
        }
      }

      return -1;  // not found
    }
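    // Example (illustrative values, not from the source): for shard
    // offsets {0, 0x10000, 0x20000}, seek_shard(0x18000) == 1 and
    // seek_shard(0x20000) == 2; any offset below shards[0].offset
    // falls off the left end of the binary search and yields -1.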
    /// check if a range spans a shard
    bool spans_shard(uint32_t offset, uint32_t length) {
      if (shards.empty()) {
        return false;
      }
      int s = seek_shard(offset);
      ceph_assert(s >= 0);
      if (s == (int)shards.size() - 1) {
        return false;  // last shard
      }
      if (offset + length <= shards[s+1].shard_info->offset) {
        return false;
      }
      return true;
    }
    /// ensure that a range of the map is loaded
    void fault_range(KeyValueDB *db,
                     uint32_t offset, uint32_t length);

    /// ensure a range of the map is marked dirty
    void dirty_range(uint32_t offset, uint32_t length);

    /// for seek_lextent test
    extent_map_t::iterator find(uint64_t offset);

    /// seek to the first lextent including or after offset
    extent_map_t::iterator seek_lextent(uint64_t offset);
    extent_map_t::const_iterator seek_lextent(uint64_t offset) const;

    /// add a new Extent
    void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
      extent_map.insert(*new Extent(lo, o, l, b));
    }

    /// remove (and delete) an Extent
    void rm(extent_map_t::iterator p) {
      extent_map.erase_and_dispose(p, DeleteDisposer());
    }

    bool has_any_lextents(uint64_t offset, uint64_t length);

    /// consolidate adjacent lextents in extent_map
    int compress_extent_map(uint64_t offset, uint64_t length);

    /// punch a logical hole.  add lextents to deref to target list.
    void punch_hole(CollectionRef &c,
                    uint64_t offset, uint64_t length,
                    old_extent_map_t *old_extents);

    /// put new lextent into lextent_map overwriting existing ones if
    /// any and update references accordingly
    Extent *set_lextent(CollectionRef &c,
                        uint64_t logical_offset,
                        uint64_t offset, uint64_t length,
                        BlobRef b,
                        old_extent_map_t *old_extents);

    /// split a blob (and referring extents)
    BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
  };
  /// Compressed Blob Garbage collector
  /*
    The primary idea of the collector is to estimate the difference between
    the allocation units (AUs) currently occupied by compressed blobs and
    the new AUs required to store that data uncompressed.
    The estimation is performed for protrusive extents within a logical
    range determined by a concatenation of the old_extents collection and
    the specific (current) write request.
    The root cause for using old_extents is the need to handle blob ref
    counts properly. Old extents still hold blob refs, hence we need to
    traverse the collection to determine whether a blob is to be released.
    Protrusive extents are extents that fit into the blob set in action
    (ones that are below the logical range from above) but are not removed
    totally due to the current write.
    E.g. for
    extent1 <loffs = 100, boffs = 100, len = 100> ->
      blob1<compressed, len_on_disk=4096, logical_len=8192>
    extent2 <loffs = 200, boffs = 200, len = 100> ->
      blob2<raw, len_on_disk=4096, llen=4096>
    extent3 <loffs = 300, boffs = 300, len = 100> ->
      blob1<compressed, len_on_disk=4096, llen=8192>
    extent4 <loffs = 4096, boffs = 0, len = 100> ->
      blob3<raw, len_on_disk=4096, llen=4096>
    write(300~100)
    the protrusive extents are within the ranges <0~300, 400~8192-400>.
    In this case the existing AUs that might be released due to GC
    (i.e. blob1) occupy 2x4K bytes, while the new AUs expected after GC = 0
    since extent1 is to be merged into blob2. Hence we should collect.
  */
  class GarbageCollector
  {
  public:
    /// return the amount of allocation units that might be saved due to GC
    int64_t estimate(
      uint64_t offset,
      uint64_t length,
      const ExtentMap& extent_map,
      const old_extent_map_t& old_extents,
      uint64_t min_alloc_size);

    /// return a collection of extents to perform GC on
    const interval_set<uint64_t>& get_extents_to_collect() const {
      return extents_to_collect;
    }
    GarbageCollector(CephContext* _cct) : cct(_cct) {}
  private:
    struct BlobInfo {
      uint64_t referenced_bytes = 0;     ///< amount of bytes referenced in blob
      int64_t expected_allocations = 0;  ///< new alloc units required
                                         ///< in case of gc fulfilled
      bool collect_candidate = false;    ///< indicate if blob has any extents
                                         ///< eligible for GC.
      extent_map_t::const_iterator first_lextent;  ///< points to the first
                                                   ///< lextent referring to
                                                   ///< the blob if any.
                                                   ///< collect_candidate flag
                                                   ///< determines the validity
      extent_map_t::const_iterator last_lextent;   ///< points to the last
                                                   ///< lextent referring to
                                                   ///< the blob if any.

      BlobInfo(uint64_t ref_bytes) :
        referenced_bytes(ref_bytes) {
      }
    };

    CephContext* cct;
    map<Blob*, BlobInfo> affected_blobs;  ///< compressed blobs and their ref_map
                                          ///< copies that are affected by the
                                          ///< specific write

    ///< protrusive extents that should be collected if GC takes place
    interval_set<uint64_t> extents_to_collect;

    boost::optional<uint64_t > used_alloc_unit;  ///< last processed allocation
                                                 ///< unit when traversing
                                                 ///< protrusive extents.
                                                 ///< Other extents mapped to
                                                 ///< this AU to be ignored
                                                 ///< (except the case where an
                                                 ///< uncompressed extent follows
                                                 ///< a compressed one - see below).
    BlobInfo* blob_info_counted = nullptr;  ///< set if the previous allocation
                                            ///< unit caused an
                                            ///< expected_allocations counter
                                            ///< increment at this blob; if an
                                            ///< uncompressed extent follows,
                                            ///< a decrement for the
                                            ///< expected_allocations counter
                                            ///< is needed
    int64_t expected_allocations = 0;   ///< new alloc units required in case
                                        ///< of gc fulfilled
    int64_t expected_for_release = 0;   ///< alloc units currently used by
                                        ///< compressed blobs that might
                                        ///< be released after GC

    void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
                                    uint64_t start_offset,
                                    uint64_t end_offset,
                                    uint64_t start_touch_offset,
                                    uint64_t end_touch_offset,
                                    uint64_t min_alloc_size);
  };
  /// an in-memory object
  struct Onode {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref;  ///< reference count
    Collection *c;
    ghobject_t oid;

    /// key under PREFIX_OBJ where we are stored
    mempool::bluestore_cache_meta::string key;

    boost::intrusive::list_member_hook<> lru_item;

    bluestore_onode_t onode;  ///< metadata stored as value in kv store
    bool exists;              ///< true if object logically exists
    bool cached;              ///< Onode is logically in the cache
                              ///  (it can be pinned and hence physically out
                              ///  of it at the moment though)
    std::atomic_bool pinned;  ///< Onode is pinned
                              ///  (or should be pinned when cached)
    ExtentMap extent_map;

    // track txc's that have not been committed to kv store (and whose
    // effects cannot be read via the kvdb read methods)
    std::atomic<int> flushing_count = {0};
    std::atomic<int> waiting_count = {0};
    /// protect flush_txns
    ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
    ceph::condition_variable flush_cond;  ///< wait here for uncommitted txns

    Onode(Collection *c, const ghobject_t& o,
          const mempool::bluestore_cache_meta::string& k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        cached(false),
        pinned(false),
        extent_map(this) {
    }
    Onode(Collection* c, const ghobject_t& o,
          const std::string& k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        cached(false),
        pinned(false),
        extent_map(this) {
    }
    Onode(Collection* c, const ghobject_t& o,
          const char* k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        cached(false),
        pinned(false),
        extent_map(this) {
    }

    static Onode* decode(
      CollectionRef c,
      const ghobject_t& oid,
      const string& key,
      const bufferlist& v);

    void dump(Formatter* f) const;
    void flush();
    void get();
    void put();

    inline bool put_cache() {
      ceph_assert(!cached);
      cached = true;
      return !pinned;
    }
    inline bool pop_cache() {
      ceph_assert(cached);
      cached = false;
      return !pinned;
    }

    const string& get_omap_prefix();
    void get_omap_header(string *out);
    void get_omap_key(const string& key, string *out);
    void rewrite_omap_key(const string& old, string *out);
    void get_omap_tail(string *out);
    void decode_omap_key(const string& key, string *user_key);
  };
  typedef boost::intrusive_ptr<Onode> OnodeRef;
  /// A generic Cache Shard
  struct CacheShard {
    CephContext *cct;
    PerfCounters *logger;

    /// protect lru and other structures
    ceph::recursive_mutex lock = {
      ceph::make_recursive_mutex("BlueStore::CacheShard::lock") };

    std::atomic<uint64_t> max = {0};
    std::atomic<uint64_t> num = {0};

    CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {}
    virtual ~CacheShard() {}

    void set_max(uint64_t max_) {
      max = max_;
    }

    uint64_t _get_num() {
      return num;
    }

    virtual void _trim_to(uint64_t new_size) = 0;
    void _trim() {
      if (cct->_conf->objectstore_blackhole) {
        // do not trim if we are throwing away IOs a layer down
        return;
      }
      _trim_to(max);
    }

    void trim() {
      std::lock_guard l(lock);
      _trim();
    }
    void flush() {
      std::lock_guard l(lock);
      // we should not be shutting down after the blackhole is enabled
      assert(!cct->_conf->objectstore_blackhole);
      _trim_to(0);
    }

#ifdef DEBUG_CACHE
    virtual void _audit(const char *s) = 0;
#else
    void _audit(const char *s) { /* no-op */ }
#endif
  };
  /// A Generic onode Cache Shard
  struct OnodeCacheShard : public CacheShard {
    std::atomic<uint64_t> num_pinned = {0};

    std::array<std::pair<ghobject_t, mono_clock::time_point>, 64> dumped_onodes;

    virtual void _pin(Onode* o) = 0;
    virtual void _unpin(Onode* o) = 0;

  public:
    OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
    static OnodeCacheShard *create(CephContext* cct, string type,
                                   PerfCounters *logger);
    virtual void _add(Onode* o, int level) = 0;
    virtual void _rm(Onode* o) = 0;
    virtual void _unpin_and_rm(Onode* o) = 0;

    virtual void move_pinned(OnodeCacheShard *to, Onode *o) = 0;
    virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0;
    bool empty() {
      return _get_num() == 0;
    }
  };
  /// A Generic buffer Cache Shard
  struct BufferCacheShard : public CacheShard {
    std::atomic<uint64_t> num_extents = {0};
    std::atomic<uint64_t> num_blobs = {0};
    uint64_t buffer_bytes = 0;

  public:
    BufferCacheShard(CephContext* cct) : CacheShard(cct) {}
    static BufferCacheShard *create(CephContext* cct, string type,
                                    PerfCounters *logger);
    virtual void _add(Buffer *b, int level, Buffer *near) = 0;
    virtual void _rm(Buffer *b) = 0;
    virtual void _move(BufferCacheShard *src, Buffer *b) = 0;
    virtual void _touch(Buffer *b) = 0;
    virtual void _adjust_size(Buffer *b, int64_t delta) = 0;

    uint64_t _get_bytes() {
      return buffer_bytes;
    }

    void add_extent() {
      ++num_extents;
    }
    void rm_extent() {
      --num_extents;
    }

    void add_blob() {
      ++num_blobs;
    }
    void rm_blob() {
      --num_blobs;
    }

    virtual void add_stats(uint64_t *extents,
                           uint64_t *blobs,
                           uint64_t *buffers,
                           uint64_t *bytes) = 0;

    bool empty() {
      std::lock_guard l(lock);
      return _get_bytes() == 0;
    }
  };
  struct OnodeSpace {
    OnodeCacheShard *cache;

  private:
    /// forward lookups
    mempool::bluestore_cache_meta::unordered_map<ghobject_t,OnodeRef> onode_map;

    friend struct Collection;  // for split_cache()
    friend struct Onode;       // for put()
    friend struct LruOnodeCacheShard;
    void _remove(const ghobject_t& oid);

  public:
    OnodeSpace(OnodeCacheShard *c) : cache(c) {}
    ~OnodeSpace() {
      clear();
    }

    OnodeRef add(const ghobject_t& oid, OnodeRef& o);
    OnodeRef lookup(const ghobject_t& o);
    void rename(OnodeRef& o, const ghobject_t& old_oid,
                const ghobject_t& new_oid,
                const mempool::bluestore_cache_meta::string& new_okey);
    void clear();
    bool empty();

    template <int LogLevelV>
    void dump(CephContext *cct);

    /// return true if f true for any item
    bool map_any(std::function<bool(Onode*)> f);
  };
  class OpSequencer;
  using OpSequencerRef = ceph::ref_t<OpSequencer>;
  struct Collection : public CollectionImpl {
    BlueStore *store;
    OpSequencerRef osr;
    BufferCacheShard *cache;  ///< our cache shard
    bluestore_cnode_t cnode;
    ceph::shared_mutex lock =
      ceph::make_shared_mutex("BlueStore::Collection::lock", true, false);

    bool exists;

    SharedBlobSet shared_blob_set;  ///< open SharedBlobs

    // cache onodes on a per-collection basis to avoid lock
    // contention.
    OnodeSpace onode_map;

    // pool options
    pool_opts_t pool_opts;
    ContextQueue *commit_queue;

    OnodeCacheShard* get_onode_cache() const {
      return onode_map.cache;
    }
    OnodeRef get_onode(const ghobject_t& oid, bool create, bool is_createop=false);
    // the terminology is confusing here, sorry!
    //
    //  blob_t     shared_blob_t
    //  !shared    unused                -> open
    //  shared     !loaded               -> open + shared
    //  shared     loaded                -> open + shared + loaded
    //
    // i.e.,
    //  open = SharedBlob is instantiated
    //  shared = blob_t shared flag is set; SharedBlob is hashed.
    //  loaded = SharedBlob::shared_blob_t is loaded from kv store
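    // Illustrative walk-through (not from the original source): a blob
    // starts merely "open"; make_blob_shared() below sets the shared flag
    // and hashes the SharedBlob ("open + shared"); the persistent
    // bluestore_shared_blob_t is only fetched from the kv store once
    // load_shared_blob() runs ("open + shared + loaded").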
    void open_shared_blob(uint64_t sbid, BlobRef b);
    void load_shared_blob(SharedBlobRef sb);
    void make_blob_shared(uint64_t sbid, BlobRef b);
    uint64_t make_blob_unshared(SharedBlob *sb);

    BlobRef new_blob() {
      BlobRef b = new Blob();
      b->shared_blob = new SharedBlob(this);
      return b;
    }

    bool contains(const ghobject_t& oid) {
      if (cid.is_meta())
        return oid.hobj.pool == -1;
      spg_t spgid;
      if (cid.is_pg(&spgid))
        return
          spgid.pgid.contains(cnode.bits, oid) &&
          oid.shard_id == spgid.shard;
      return false;
    }

    int64_t pool() const {
      return cid.pool();
    }

    void split_cache(Collection *dest);

    bool flush_commit(Context *c) override;
    void flush() override;
    void flush_all_but_last();

    Collection(BlueStore *ns, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t c);
  };
  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
    CollectionRef c;
    OnodeRef o;
    KeyValueDB::Iterator it;
    string head, tail;

    string _stringify() const;

  public:
    OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
    int seek_to_first() override;
    int upper_bound(const string &after) override;
    int lower_bound(const string &to) override;
    bool valid() override;
    int next() override;
    string key() override;
    bufferlist value() override;
    std::string tail_key() {
      return tail;
    }

    int status() override {
      return 0;
    }
  };
  struct volatile_statfs {
    enum {
      STATFS_ALLOCATED = 0,
      STATFS_STORED,
      STATFS_COMPRESSED_ORIGINAL,
      STATFS_COMPRESSED,
      STATFS_COMPRESSED_ALLOCATED,
      STATFS_LAST
    };
    int64_t values[STATFS_LAST];
    volatile_statfs() {
      memset(this, 0, sizeof(volatile_statfs));
    }
    void reset() {
      *this = volatile_statfs();
    }
    void publish(store_statfs_t* buf) const {
      buf->allocated = allocated();
      buf->data_stored = stored();
      buf->data_compressed = compressed();
      buf->data_compressed_original = compressed_original();
      buf->data_compressed_allocated = compressed_allocated();
    }

    volatile_statfs& operator+=(const volatile_statfs& other) {
      for (size_t i = 0; i < STATFS_LAST; ++i) {
        values[i] += other.values[i];
      }
      return *this;
    }
    int64_t& allocated() {
      return values[STATFS_ALLOCATED];
    }
    int64_t& stored() {
      return values[STATFS_STORED];
    }
    int64_t& compressed_original() {
      return values[STATFS_COMPRESSED_ORIGINAL];
    }
    int64_t& compressed() {
      return values[STATFS_COMPRESSED];
    }
    int64_t& compressed_allocated() {
      return values[STATFS_COMPRESSED_ALLOCATED];
    }
    int64_t allocated() const {
      return values[STATFS_ALLOCATED];
    }
    int64_t stored() const {
      return values[STATFS_STORED];
    }
    int64_t compressed_original() const {
      return values[STATFS_COMPRESSED_ORIGINAL];
    }
    int64_t compressed() const {
      return values[STATFS_COMPRESSED];
    }
    int64_t compressed_allocated() const {
      return values[STATFS_COMPRESSED_ALLOCATED];
    }
    volatile_statfs& operator=(const store_statfs_t& st) {
      values[STATFS_ALLOCATED] = st.allocated;
      values[STATFS_STORED] = st.data_stored;
      values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
      values[STATFS_COMPRESSED] = st.data_compressed;
      values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
      return *this;
    }
    bool is_empty() {
      return values[STATFS_ALLOCATED] == 0 &&
        values[STATFS_STORED] == 0 &&
        values[STATFS_COMPRESSED] == 0 &&
        values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
        values[STATFS_COMPRESSED_ALLOCATED] == 0;
    }
    void decode(bufferlist::const_iterator& it) {
      using ceph::decode;
      for (size_t i = 0; i < STATFS_LAST; i++) {
        decode(values[i], it);
      }
    }

    void encode(bufferlist& bl) {
      using ceph::encode;
      for (size_t i = 0; i < STATFS_LAST; i++) {
        encode(values[i], bl);
      }
    }
  };
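  // Usage sketch (illustrative, not from the original source): each
  // TransContext accumulates a statfs_delta while it executes, e.g.
  //   txc->statfs_delta.allocated() += 0x10000;  // one 64K AU allocated
  //   txc->statfs_delta.stored()    += 4096;     // logical bytes written
  // and the delta is folded into the global vstatfs (and the per-pool
  // entry in osd_pools) under vstatfs_lock when the txn commits.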
  struct TransContext final : public AioContext {
    MEMPOOL_CLASS_HELPERS();

    typedef enum {
      STATE_PREPARE,
      STATE_AIO_WAIT,
      STATE_IO_DONE,
      STATE_KV_QUEUED,     // queued for kv_sync_thread submission
      STATE_KV_SUBMITTED,  // submitted to kv; not yet synced
      STATE_KV_DONE,
      STATE_DEFERRED_QUEUED,   // in deferred_queue (pending or running)
      STATE_DEFERRED_CLEANUP,  // remove deferred kv record
      STATE_DEFERRED_DONE,
      STATE_FINISHING,
      STATE_DONE,
    } state_t;

    state_t state = STATE_PREPARE;

    const char *get_state_name() {
      switch (state) {
      case STATE_PREPARE: return "prepare";
      case STATE_AIO_WAIT: return "aio_wait";
      case STATE_IO_DONE: return "io_done";
      case STATE_KV_QUEUED: return "kv_queued";
      case STATE_KV_SUBMITTED: return "kv_submitted";
      case STATE_KV_DONE: return "kv_done";
      case STATE_DEFERRED_QUEUED: return "deferred_queued";
      case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
      case STATE_DEFERRED_DONE: return "deferred_done";
      case STATE_FINISHING: return "finishing";
      case STATE_DONE: return "done";
      }
      return "???";
    }
#if defined(WITH_LTTNG)
    const char *get_state_latency_name(int state) {
      switch (state) {
      case l_bluestore_state_prepare_lat: return "prepare";
      case l_bluestore_state_aio_wait_lat: return "aio_wait";
      case l_bluestore_state_io_done_lat: return "io_done";
      case l_bluestore_state_kv_queued_lat: return "kv_queued";
      case l_bluestore_state_kv_committing_lat: return "kv_committing";
      case l_bluestore_state_kv_done_lat: return "kv_done";
      case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
      case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
      case l_bluestore_state_finishing_lat: return "finishing";
      case l_bluestore_state_done_lat: return "done";
      }
      return "???";
    }
#endif
    CollectionRef ch;
    OpSequencerRef osr;  // this should be ch->osr
    boost::intrusive::list_member_hook<> sequencer_item;

    uint64_t bytes = 0, ios = 0, cost = 0;

    set<OnodeRef> onodes;             ///< these need to be updated/written
    set<OnodeRef> modified_objects;   ///< objects we modified (and need a ref)
    set<SharedBlobRef> shared_blobs;  ///< these need to be updated/written
    set<SharedBlobRef> shared_blobs_written;  ///< update these on io completion

    KeyValueDB::Transaction t;  ///< then we will commit this
    list<Context*> oncommits;   ///< more commit completions
    list<CollectionRef> removed_collections;  ///< colls we removed

    boost::intrusive::list_member_hook<> deferred_queue_item;
    bluestore_deferred_transaction_t *deferred_txn = nullptr;  ///< if any

    interval_set<uint64_t> allocated, released;
    volatile_statfs statfs_delta;         ///< overall store statistics delta
    uint64_t osd_pool_id = META_POOL_ID;  ///< osd pool id we're operating on

    IOContext ioc;
    bool had_ios = false;  ///< true if we submitted IOs before our kv txn

    uint64_t seq = 0;
    mono_clock::time_point start;
    mono_clock::time_point last_stamp;

    uint64_t last_nid = 0;     ///< if non-zero, highest new nid we allocated
    uint64_t last_blobid = 0;  ///< if non-zero, highest new blobid we allocated

#if defined(WITH_LTTNG)
    bool tracing = false;
#endif

    explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
                          list<Context*> *on_commits)
      : ch(c),
        osr(o),
        ioc(cct, this),
        start(mono_clock::now()) {
      last_stamp = start;
      if (on_commits) {
        oncommits.swap(*on_commits);
      }
    }
    ~TransContext() {
      delete deferred_txn;
    }

    void write_onode(OnodeRef &o) {
      onodes.insert(o);
    }
    void write_shared_blob(SharedBlobRef &sb) {
      shared_blobs.insert(sb);
    }
    void unshare_blob(SharedBlob *sb) {
      shared_blobs.erase(sb);
    }

    /// note we logically modified object (when onode itself is unmodified)
    void note_modified_object(OnodeRef &o) {
      // onode itself isn't written, though
      modified_objects.insert(o);
    }
    void note_removed_object(OnodeRef& o) {
      onodes.erase(o);
      modified_objects.insert(o);
    }

    void aio_finish(BlueStore *store) override {
      store->txc_aio_finish(this);
    }
  };
  class BlueStoreThrottle {
#if defined(WITH_LTTNG)
    const std::chrono::time_point<mono_clock> time_base = mono_clock::now();

    // Time of last chosen io (microseconds)
    std::atomic<uint64_t> previous_emitted_tp_time_mono_mcs = {0};
    std::atomic<uint64_t> ios_started_since_last_traced = {0};
    std::atomic<uint64_t> ios_completed_since_last_traced = {0};

    std::atomic_uint pending_kv_ios = {0};
    std::atomic_uint pending_deferred_ios = {0};

    // Min period between trace points (microseconds)
    std::atomic<uint64_t> trace_period_mcs = {0};

    bool should_trace(
      uint64_t *started,
      uint64_t *completed) {
      uint64_t min_period_mcs = trace_period_mcs.load(
        std::memory_order_relaxed);

      if (min_period_mcs == 0) {
        *started = 1;
        *completed = ios_completed_since_last_traced.exchange(0);
        return true;
      } else {
        ios_started_since_last_traced++;
        auto now_mcs = ceph::to_microseconds<uint64_t>(
          mono_clock::now() - time_base);
        uint64_t previous_mcs = previous_emitted_tp_time_mono_mcs;
        uint64_t period_mcs = now_mcs - previous_mcs;
        if (period_mcs > min_period_mcs) {
          if (previous_emitted_tp_time_mono_mcs.compare_exchange_strong(
                previous_mcs, now_mcs)) {
            // This would be racy at a sufficiently extreme trace rate, but isn't
            // worth the overhead of doing it more carefully.
            *started = ios_started_since_last_traced.exchange(0);
            *completed = ios_completed_since_last_traced.exchange(0);
            return true;
          }
        }
        return false;
      }
    }
#endif

#if defined(WITH_LTTNG)
    void emit_initial_tracepoint(
      KeyValueDB &db,
      TransContext &txc,
      mono_clock::time_point);
#else
    void emit_initial_tracepoint(
      KeyValueDB &db,
      TransContext &txc,
      mono_clock::time_point) {}
#endif

  public:
    Throttle throttle_bytes;           ///< submit to commit
    Throttle throttle_deferred_bytes;  ///< submit to deferred complete

    BlueStoreThrottle(CephContext *cct) :
      throttle_bytes(cct, "bluestore_throttle_bytes", 0),
      throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0)
    {
      reset_throttle(cct->_conf);
    }

#if defined(WITH_LTTNG)
    void complete_kv(TransContext &txc);
    void complete(TransContext &txc);
#else
    void complete_kv(TransContext &txc) {}
    void complete(TransContext &txc) {}
#endif

    mono_clock::duration log_state_latency(
      TransContext &txc, PerfCounters *logger, int state);
    bool try_start_transaction(
      KeyValueDB &db,
      TransContext &txc,
      mono_clock::time_point);
    void finish_start_transaction(
      KeyValueDB &db,
      TransContext &txc,
      mono_clock::time_point);
    void release_kv_throttle(uint64_t cost) {
      throttle_bytes.put(cost);
    }
    void release_deferred_throttle(uint64_t cost) {
      throttle_deferred_bytes.put(cost);
    }
    bool should_submit_deferred() {
      return throttle_deferred_bytes.past_midpoint();
    }
    void reset_throttle(const ConfigProxy &conf) {
      throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
      throttle_deferred_bytes.reset_max(
        conf->bluestore_throttle_bytes +
        conf->bluestore_throttle_deferred_bytes);
#if defined(WITH_LTTNG)
      double rate = conf.get_val<double>("bluestore_throttle_trace_rate");
      trace_period_mcs = rate > 0 ? floor((1/rate) * 1000000.0) : 0;
#endif
    }
  };
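  // Sketch (illustrative numbers, not from the source): with
  // bluestore_throttle_bytes = 64 MiB and
  // bluestore_throttle_deferred_bytes = 128 MiB, throttle_deferred_bytes
  // is capped at 192 MiB, so should_submit_deferred() starts returning
  // true once more than 96 MiB of throttled work is outstanding.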
  typedef boost::intrusive::list<
    TransContext,
    boost::intrusive::member_hook<
      TransContext,
      boost::intrusive::list_member_hook<>,
      &TransContext::deferred_queue_item> > deferred_queue_t;

  struct DeferredBatch final : public AioContext {
    OpSequencer *osr;
    struct deferred_io {
      bufferlist bl;  ///< data
      uint64_t seq;   ///< deferred transaction seq
    };
    map<uint64_t,deferred_io> iomap;  ///< map of ios in this batch
    deferred_queue_t txcs;            ///< txcs in this batch
    IOContext ioc;                    ///< our aios
    /// bytes of pending io for each deferred seq (may be 0)
    map<uint64_t,int> seq_bytes;

    void _discard(CephContext *cct, uint64_t offset, uint64_t length);
    void _audit(CephContext *cct);

    DeferredBatch(CephContext *cct, OpSequencer *osr)
      : osr(osr), ioc(cct, this) {}

    /// prepare a write
    void prepare_write(CephContext *cct,
                       uint64_t seq, uint64_t offset, uint64_t length,
                       bufferlist::const_iterator& p);

    void aio_finish(BlueStore *store) override {
      store->_deferred_aio_finish(osr);
    }
  };
  class OpSequencer : public RefCountedObject {
  public:
    ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
    ceph::condition_variable qcond;
    typedef boost::intrusive::list<
      TransContext,
      boost::intrusive::member_hook<
        TransContext,
        boost::intrusive::list_member_hook<>,
        &TransContext::sequencer_item> > q_list_t;
    q_list_t q;  ///< transactions

    boost::intrusive::list_member_hook<> deferred_osr_queue_item;

    DeferredBatch *deferred_running = nullptr;
    DeferredBatch *deferred_pending = nullptr;

    BlueStore *store;
    coll_t cid;

    uint64_t last_seq = 0;

    std::atomic_int txc_with_unstable_io = {0};  ///< num txcs with unstable io

    std::atomic_int kv_committing_serially = {0};

    std::atomic_int kv_submitted_waiters = {0};

    std::atomic_bool zombie = {false};  ///< in zombie_osr set (collection going away)

    const uint32_t sequencer_id;

    uint32_t get_sequencer_id() const {
      return sequencer_id;
    }

    void queue_new(TransContext *txc) {
      std::lock_guard l(qlock);
      txc->seq = ++last_seq;
      q.push_back(*txc);
    }

    void drain() {
      std::unique_lock l(qlock);
      while (!q.empty())
        qcond.wait(l);
    }

    void drain_preceding(TransContext *txc) {
      std::unique_lock l(qlock);
      while (&q.front() != txc)
        qcond.wait(l);
    }

    bool _is_all_kv_submitted() {
      // caller must hold qlock; q must not be empty
      ceph_assert(!q.empty());
      TransContext *txc = &q.back();
      if (txc->state >= TransContext::STATE_KV_SUBMITTED) {
        return true;
      }
      return false;
    }

    void flush() {
      std::unique_lock l(qlock);
      while (true) {
        // set flag before the check because the condition
        // may become true outside qlock, and we need to make
        // sure those threads see waiters and signal qcond.
        ++kv_submitted_waiters;
        if (q.empty() || _is_all_kv_submitted()) {
          --kv_submitted_waiters;
          return;
        }
        qcond.wait(l);
        --kv_submitted_waiters;
      }
    }

    void flush_all_but_last() {
      std::unique_lock l(qlock);
      assert (q.size() >= 1);
      while (true) {
        // set flag before the check because the condition
        // may become true outside qlock, and we need to make
        // sure those threads see waiters and signal qcond.
        ++kv_submitted_waiters;
        if (q.size() <= 1) {
          --kv_submitted_waiters;
          return;
        } else {
          auto it = q.rbegin();
          ++it;
          if (it->state >= TransContext::STATE_KV_SUBMITTED) {
            --kv_submitted_waiters;
            return;
          }
        }
        qcond.wait(l);
        --kv_submitted_waiters;
      }
    }

    bool flush_commit(Context *c) {
      std::lock_guard l(qlock);
      if (q.empty()) {
        return true;
      }
      TransContext *txc = &q.back();
      if (txc->state >= TransContext::STATE_KV_DONE) {
        return true;
      }
      txc->oncommits.push_back(c);
      return false;
    }

  private:
    FRIEND_MAKE_REF(OpSequencer);
    OpSequencer(BlueStore *store, uint32_t sequencer_id, const coll_t& c)
      : RefCountedObject(store->cct),
        store(store), cid(c), sequencer_id(sequencer_id) {
    }
    ~OpSequencer() {
      ceph_assert(q.empty());
    }
  };
  typedef boost::intrusive::list<
    OpSequencer,
    boost::intrusive::member_hook<
      OpSequencer,
      boost::intrusive::list_member_hook<>,
      &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;

  struct KVSyncThread : public Thread {
    BlueStore *store;
    explicit KVSyncThread(BlueStore *s) : store(s) {}
    void *entry() override {
      store->_kv_sync_thread();
      return nullptr;
    }
  };
  struct KVFinalizeThread : public Thread {
    BlueStore *store;
    explicit KVFinalizeThread(BlueStore *s) : store(s) {}
    void *entry() override {
      store->_kv_finalize_thread();
      return nullptr;
    }
  };
{
1926 map
<int, struct value_dist
> val_map
; ///< slab id to count, max length of value and key
1929 map
<string
, map
<int, struct key_dist
> > key_hist
;
1930 map
<int, uint64_t> value_hist
;
1931 int get_key_slab(size_t sz
);
1932 string
get_key_slab_to_range(int slab
);
1933 int get_value_slab(size_t sz
);
1934 string
get_value_slab_to_range(int slab
);
1935 void update_hist_entry(map
<string
, map
<int, struct key_dist
> > &key_hist
,
1936 const string
&prefix
, size_t key_size
, size_t value_size
);
1937 void dump(Formatter
*f
);
  // --------------------------------------------------------
  // members
private:
  BlueFS *bluefs = nullptr;
  bluefs_layout_t bluefs_layout;
  mono_time bluefs_last_balance;
  utime_t next_dump_on_bluefs_alloc_failure;

  KeyValueDB *db = nullptr;
  BlockDevice *bdev = nullptr;
  std::string freelist_type;
  FreelistManager *fm = nullptr;
  Allocator *alloc = nullptr;
  uuid_d fsid;
  int path_fd = -1;  ///< open handle to $path
  int fsid_fd = -1;  ///< open handle (locked) to $path/fsid
  bool mounted = false;

  ceph::shared_mutex coll_lock = ceph::make_shared_mutex("BlueStore::coll_lock");  ///< rwlock to protect coll_map
  mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
  bool collections_had_errors = false;
  map<coll_t,CollectionRef> new_coll_map;

  vector<OnodeCacheShard*> onode_cache_shards;
  vector<BufferCacheShard*> buffer_cache_shards;

  /// protect zombie_osr_set
  ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
  uint32_t next_sequencer_id = 0;
  std::map<coll_t,OpSequencerRef> zombie_osr_set;  ///< set of OpSequencers for deleted collections

  std::atomic<uint64_t> nid_last = {0};
  std::atomic<uint64_t> nid_max = {0};
  std::atomic<uint64_t> blobid_last = {0};
  std::atomic<uint64_t> blobid_max = {0};

  interval_set<uint64_t> bluefs_extents;             ///< block extents owned by bluefs
  interval_set<uint64_t> bluefs_extents_reclaiming;  ///< currently reclaiming

  ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
  std::atomic<uint64_t> deferred_seq = {0};
  deferred_osr_queue_t deferred_queue;   ///< osr's with deferred io pending
  int deferred_queue_size = 0;           ///< num txc's queued across all osrs
  atomic_int deferred_aggressive = {0};  ///< aggressive wakeup of kv thread
  Finisher deferred_finisher, finisher;
  utime_t deferred_last_submitted = utime_t();

  KVSyncThread kv_sync_thread;
  ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
  ceph::condition_variable kv_cond;
  bool _kv_only = false;
  bool kv_sync_started = false;
  bool kv_stop = false;
  bool kv_finalize_started = false;
  bool kv_finalize_stop = false;
  deque<TransContext*> kv_queue;              ///< ready, already submitted
  deque<TransContext*> kv_queue_unsubmitted;  ///< ready, need submit by kv thread
  deque<TransContext*> kv_committing;         ///< currently syncing
  deque<DeferredBatch*> deferred_done_queue;  ///< deferred ios done
  bool kv_sync_in_progress = false;

  KVFinalizeThread kv_finalize_thread;
  ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
  ceph::condition_variable kv_finalize_cond;
  deque<TransContext*> kv_committing_to_finalize;     ///< pending finalization
  deque<DeferredBatch*> deferred_stable_to_finalize;  ///< pending finalization
  bool kv_finalize_in_progress = false;

  PerfCounters *logger = nullptr;

  list<CollectionRef> removed_collections;

  ceph::shared_mutex debug_read_error_lock =
    ceph::make_shared_mutex("BlueStore::debug_read_error_lock");
  set<ghobject_t> debug_data_error_objects;
  set<ghobject_t> debug_mdata_error_objects;

  std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};

  uint64_t block_size = 0;      ///< block size of block device (power of 2)
  uint64_t block_mask = 0;      ///< mask to get just the block offset
  size_t block_size_order = 0;  ///< bits to shift to get block size

  uint64_t min_alloc_size;  ///< minimum allocation unit (power of 2)
  ///< bits for min_alloc_size
  uint8_t min_alloc_size_order = 0;
  static_assert(std::numeric_limits<uint8_t>::max() >
                std::numeric_limits<decltype(min_alloc_size)>::digits,
                "not enough bits for min_alloc_size");

  bool per_pool_omap = false;

  ///< maximum allocation unit (power of 2)
  std::atomic<uint64_t> max_alloc_size = {0};

  ///< number threshold for forced deferred writes
  std::atomic<int> deferred_batch_ops = {0};

  ///< size threshold for forced deferred writes
  std::atomic<uint64_t> prefer_deferred_size = {0};

  ///< approx cost per io, in bytes
  std::atomic<uint64_t> throttle_cost_per_io = {0};

  std::atomic<Compressor::CompressionMode> comp_mode =
    {Compressor::COMP_NONE};  ///< compression mode
  CompressorRef compressor;
  std::atomic<uint64_t> comp_min_blob_size = {0};
  std::atomic<uint64_t> comp_max_blob_size = {0};

  std::atomic<uint64_t> max_blob_size = {0};  ///< maximum blob size

  uint64_t kv_ios = 0;
  uint64_t kv_throttle_costs = 0;

  // cache trim control
  uint64_t cache_size = 0;       ///< total cache size
  double cache_meta_ratio = 0;   ///< cache ratio dedicated to metadata
  double cache_kv_ratio = 0;     ///< cache ratio dedicated to kv (e.g., rocksdb)
  double cache_data_ratio = 0;   ///< cache ratio dedicated to object data
  bool cache_autotune = false;   ///< cache autotune setting
  double cache_autotune_interval = 0;  ///< time to wait between cache rebalancing
  uint64_t osd_memory_target = 0;      ///< OSD memory target when autotuning cache
  uint64_t osd_memory_base = 0;        ///< OSD base memory when autotuning cache
  double osd_memory_expected_fragmentation = 0;  ///< expected memory fragmentation
  uint64_t osd_memory_cache_min = 0;             ///< Min memory to assign when autotuning cache
  double osd_memory_cache_resize_interval = 0;   ///< Time to wait between cache resizing
  double max_defer_interval = 0;  ///< Time to wait between last deferred submit
  std::atomic<uint32_t> config_changed = {0};  ///< Counter to determine if there is a configuration change.

  typedef map<uint64_t, volatile_statfs> osd_pools_map;

  ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
  volatile_statfs vstatfs;
  osd_pools_map osd_pools;  // protected by vstatfs_lock as well

  bool per_pool_stat_collection = true;
2078 struct MempoolThread
: public Thread
{
2082 ceph::condition_variable cond
;
2083 ceph::mutex lock
= ceph::make_mutex("BlueStore::MempoolThread::lock");
2085 std::shared_ptr
<PriorityCache::PriCache
> binned_kv_cache
= nullptr;
2086 std::shared_ptr
<PriorityCache::Manager
> pcm
= nullptr;
2088 struct MempoolCache
: public PriorityCache::PriCache
{
2090 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
2091 int64_t committed_bytes
= 0;
2092 double cache_ratio
= 0;
2094 MempoolCache(BlueStore
*s
) : store(s
) {};
2096 virtual uint64_t _get_used_bytes() const = 0;
2098 virtual int64_t request_cache_bytes(
2099 PriorityCache::Priority pri
, uint64_t total_cache
) const {
2100 int64_t assigned
= get_cache_bytes(pri
);
2103 // All cache items are currently shoved into the PRI1 priority
2104 case PriorityCache::Priority::PRI1
:
2106 int64_t request
= _get_used_bytes();
2107 return(request
> assigned
) ? request
- assigned
: 0;
2115 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
2116 return cache_bytes
[pri
];
2118 virtual int64_t get_cache_bytes() const {
2121 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
2122 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
2123 total
+= get_cache_bytes(pri
);
2127 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
2128 cache_bytes
[pri
] = bytes
;
2130 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
2131 cache_bytes
[pri
] += bytes
;
2133 virtual int64_t commit_cache_size(uint64_t total_cache
) {
2134 committed_bytes
= PriorityCache::get_chunk(
2135 get_cache_bytes(), total_cache
);
2136 return committed_bytes
;
2138 virtual int64_t get_committed_size() const {
2139 return committed_bytes
;
2141 virtual double get_cache_ratio() const {
2144 virtual void set_cache_ratio(double ratio
) {
2145 cache_ratio
= ratio
;
2147 virtual string
get_cache_name() const = 0;
    struct MetaCache : public MempoolCache {
      MetaCache(BlueStore *s) : MempoolCache(s) {};

      virtual uint64_t _get_used_bytes() const {
        return mempool::bluestore_Buffer::allocated_bytes() +
          mempool::bluestore_Blob::allocated_bytes() +
          mempool::bluestore_Extent::allocated_bytes() +
          mempool::bluestore_cache_meta::allocated_bytes() +
          mempool::bluestore_cache_other::allocated_bytes() +
          mempool::bluestore_cache_onode::allocated_bytes() +
          mempool::bluestore_SharedBlob::allocated_bytes() +
          mempool::bluestore_inline_bl::allocated_bytes();
      }

      virtual string get_cache_name() const {
        return "BlueStore Meta Cache";
      }

      uint64_t _get_num_onodes() const {
        uint64_t onode_num =
          mempool::bluestore_cache_onode::allocated_items();
        return (2 > onode_num) ? 2 : onode_num;
      }

      double get_bytes_per_onode() const {
        return (double)_get_used_bytes() / (double)_get_num_onodes();
      }
    };
    std::shared_ptr<MetaCache> meta_cache;
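    // Illustrative example (assumed numbers): if the metadata mempools hold
    // 96 MiB across 200k onodes, get_bytes_per_onode() is ~503 bytes; the
    // autotuner uses this average to translate a byte budget into an onode
    // count when resizing the onode cache shards. The floor of 2 in
    // _get_num_onodes() avoids a divide-by-zero on an empty store.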
    struct DataCache : public MempoolCache {
      DataCache(BlueStore *s) : MempoolCache(s) {};

      virtual uint64_t _get_used_bytes() const {
        uint64_t bytes = 0;
        for (auto i : store->buffer_cache_shards) {
          bytes += i->_get_bytes();
        }
        return bytes;
      }
      virtual string get_cache_name() const {
        return "BlueStore Data Cache";
      }
    };
    std::shared_ptr<DataCache> data_cache;

  public:
    explicit MempoolThread(BlueStore *s)
      : store(s),
        meta_cache(new MetaCache(s)),
        data_cache(new DataCache(s)) {}

    void *entry() override;
    void init() {
      ceph_assert(stop == false);
      create("bstore_mempool");
    }
    void shutdown() {
      lock.lock();
      stop = true;
      cond.notify_all();
      lock.unlock();
      join();
    }

  private:
    void _adjust_cache_settings();
    void _update_cache_settings();
    void _resize_shards(bool interval_stats);
  } mempool_thread;
private:
  // --------------------------------------------------------
  // private methods

  void _init_logger();
  void _shutdown_logger();
  int _reload_logger();

  int _open_fsid(bool create);
  int _read_fsid(uuid_d *f);

  void _set_alloc_sizes();
  void _set_blob_size();
  void _set_finisher_num();
  void _set_per_pool_omap();
  void _update_osd_memory_options();

  int _open_bdev(bool create);
  // Verifies if disk space is enough for reserved + min bluefs
  // and alters the latter if needed.
  // Depends on min_alloc_size hence should be called after
  // its initialization (and outside of _open_bdev).
  void _validate_bdev();
  int _minimal_open_bluefs(bool create);
  void _minimal_close_bluefs();
  int _open_bluefs(bool create);
  void _close_bluefs(bool cold_close);

  // Limited (u)mount intended for BlueFS operations only
  int _mount_for_bluefs();
  void _umount_for_bluefs();

  int _is_bluefs(bool create, bool* ret);
  /*
   * opens both DB and dependent super_meta, FreelistManager and allocator
   * in the proper order
   */
  int _open_db_and_around(bool read_only);
  void _close_db_and_around(bool read_only);

  // updates legacy bluefs related recs in DB to a state valid for
  // downgrades from nautilus.
  void _sync_bluefs_and_fm();

  /*
   * @warning to_repair_db means that we open this db to repair it and will
   * not hold rocksdb's file lock.
   */
  int _open_db(bool create,
	       bool to_repair_db=false,
	       bool read_only = false);
  void _close_db(bool read_only);
  int _open_fm(KeyValueDB::Transaction t, bool read_only);
  int _write_out_fm_meta(uint64_t target_size,
			 bool update_root_size = false,
			 bluestore_bdev_label_t* res_label = nullptr);
  void _close_alloc();
  int _open_collections();
  void _fsck_collections(int64_t* errors);
  void _close_collections();
  int _setup_block_symlink_or_file(string name, string path, uint64_t size,
				   bool create);

public:
  utime_t get_deferred_last_submitted() {
    std::lock_guard l(deferred_lock);
    return deferred_last_submitted;
  }

  static int _write_bdev_label(CephContext* cct,
			       string path, bluestore_bdev_label_t label);
  static int _read_bdev_label(CephContext* cct, string path,
			      bluestore_bdev_label_t *label);
private:
  int _check_or_set_bdev_label(string path, uint64_t size, string desc,
			       bool create);
  int _open_super_meta();

  void _open_statfs();
  void _get_statfs_overall(struct store_statfs_t *buf);

  void _dump_alloc_on_failure();

  int64_t _get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total);
  int _balance_bluefs_freespace();

  CollectionRef _get_collection(const coll_t& cid);
  void _queue_reap_collection(CollectionRef& c);
  void _reap_collections();
  void _update_cache_logger();
  void _assign_nid(TransContext *txc, OnodeRef o);
  uint64_t _assign_blobid(TransContext *txc);

  template <int LogLevelV>
  friend void _dump_onode(CephContext *cct, const Onode& o);
  template <int LogLevelV>
  friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
  template <int LogLevelV>
  friend void _dump_transaction(CephContext *cct, Transaction *t);
  TransContext *_txc_create(Collection *c, OpSequencer *osr,
			    list<Context*> *on_commits);
  void _txc_update_store_statfs(TransContext *txc);
  void _txc_add_transaction(TransContext *txc, Transaction *t);
  void _txc_calc_cost(TransContext *txc);
  void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
  void _txc_state_proc(TransContext *txc);
  void _txc_aio_submit(TransContext *txc);
public:
  void txc_aio_finish(void *p) {
    _txc_state_proc(static_cast<TransContext*>(p));
  }
private:
  void _txc_finish_io(TransContext *txc);
  void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
  void _txc_apply_kv(TransContext *txc, bool sync_submit_transaction);
  void _txc_committed_kv(TransContext *txc);
  void _txc_finish(TransContext *txc);
  void _txc_release_alloc(TransContext *txc);

  void _osr_attach(Collection *c);
  void _osr_register_zombie(OpSequencer *osr);
  void _osr_drain(OpSequencer *osr);
  void _osr_drain_preceding(TransContext *txc);
  void _osr_drain_all();

  void _kv_start();
  void _kv_stop();
  void _kv_sync_thread();
  void _kv_finalize_thread();

  bluestore_deferred_op_t *_get_deferred_op(TransContext *txc);
  void _deferred_queue(TransContext *txc);
public:
  void deferred_try_submit();
private:
  void _deferred_submit_unlock(OpSequencer *osr);
  void _deferred_aio_finish(OpSequencer *osr);
  int _deferred_replay();
public:
  using mempool_dynamic_bitset =
    boost::dynamic_bitset<uint64_t,
			  mempool::bluestore_fsck::pool_allocator<uint64_t>>;
  using per_pool_statfs =
    mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;

  enum FSCKDepth {
    FSCK_REGULAR,
    FSCK_DEEP,
    FSCK_SHALLOW
  };

private:
  enum {
    MAX_FSCK_ERROR_LINES = 100,
  };
  int _fsck_check_extents(
    const coll_t& cid,
    const ghobject_t& oid,
    const PExtentVector& extents,
    bool compressed,
    mempool_dynamic_bitset &used_blocks,
    uint64_t granularity,
    BlueStoreRepairer* repairer,
    store_statfs_t& expected_statfs,
    FSCKDepth depth);

  void _fsck_check_pool_statfs(
    per_pool_statfs& expected_pool_statfs,
    int64_t& errors,
    int64_t& warnings,
    BlueStoreRepairer* repairer);

  int _fsck(FSCKDepth depth, bool repair);
  int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
  void _buffer_cache_write(
    TransContext *txc,
    BlobRef b,
    uint64_t offset,
    bufferlist& bl,
    unsigned flags) {
    b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
			     flags);
    txc->shared_blobs_written.insert(b->shared_blob);
  }
  int _collection_list(
    Collection *c, const ghobject_t& start, const ghobject_t& end,
    int max, bool legacy, vector<ghobject_t> *ls, ghobject_t *next);
  template <typename T, typename F>
  T select_option(const std::string& opt_name, T val1, F f) {
    //NB: opt_name reserved for future use
    boost::optional<T> val2 = f();
    if (val2) {
      return *val2;
    }
    return val1;
  }
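  // Usage sketch (illustrative only; get_pool_override() is a hypothetical
  // helper, not part of this class): the functor returns an engaged
  // boost::optional to override the default, or an empty one to keep it.
  //
  //   uint64_t blob_size = select_option(
  //     "target_blob_size", default_blob_size,
  //     [&]() -> boost::optional<uint64_t> {
  //       return get_pool_override();  // engaged => per-pool value wins
  //     });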
  void _apply_padding(uint64_t head_pad,
		      uint64_t tail_pad,
		      bufferlist& padded);

  void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
  // -- ondisk version ---
public:
  const int32_t latest_ondisk_format = 4;        ///< our version
  const int32_t min_readable_ondisk_format = 1;  ///< what we can read
  const int32_t min_compat_ondisk_format = 3;    ///< who can read us

private:
  int32_t ondisk_format = 0;  ///< value detected on mount

  int _upgrade_super();  ///< upgrade (called during open_super)
  uint64_t _get_ondisk_reserved() const;
  void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
  // --- public interface ---
public:
  BlueStore(CephContext *cct, const string& path);
  BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
  ~BlueStore() override;

  string get_type() override {
    return "bluestore";
  }

  bool needs_journal() override { return false; };
  bool wants_journal() override { return false; };
  bool allows_journal() override { return false; };

  uint64_t get_min_alloc_size() const override {
    return min_alloc_size;
  }
  int get_devices(set<string> *ls) override;

  bool is_rotational() override;
  bool is_journal_rotational() override;

  string get_default_device_class() override {
    string device_class;
    map<string, string> metadata;
    collect_metadata(&metadata);
    auto it = metadata.find("bluestore_bdev_type");
    if (it != metadata.end()) {
      device_class = it->second;
    }
    return device_class;
  }
  int get_numa_node(
    int *numa_node,
    set<int> *nodes,
    set<string> *failed) override;

  static int get_block_device_fsid(CephContext* cct, const string& path,
				   uuid_d *fsid);

  bool test_mount_in_use() override;

private:
  int _mount(bool kv_only, bool open_db=true);
public:
  int mount() override {
    return _mount(false);
  }
  int umount() override;

  int start_kv_only(KeyValueDB **pdb, bool open_db=true) {
    int r = _mount(true, open_db);
    if (r < 0)
      return r;
    *pdb = db;
    return 0;
  }
  int write_meta(const std::string& key, const std::string& value) override;
  int read_meta(const std::string& key, std::string *value) override;

  int fsck(bool deep) override {
    return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
  }
  int repair(bool deep) override {
    return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
  }
  int quick_fix() override {
    return _fsck(FSCK_SHALLOW, true);
  }
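  // How the public entry points map onto FSCKDepth (per the calls above):
  //   fsck(false)  -> FSCK_REGULAR, read-only consistency check
  //   fsck(true)   -> FSCK_DEEP, additionally reads and verifies object data
  //   repair(deep) -> same depths, with repair enabled
  //   quick_fix()  -> FSCK_SHALLOW with repair, the fast path used to fix
  //                   known issues (e.g., legacy statfs/omap) after upgrades.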
  void set_cache_shards(unsigned num) override;
  void dump_cache_stats(Formatter *f) override {
    int onode_count = 0, buffers_bytes = 0;
    for (auto i: onode_cache_shards) {
      onode_count += i->_get_num();
    }
    for (auto i: buffer_cache_shards) {
      buffers_bytes += i->_get_bytes();
    }
    f->dump_int("bluestore_onode", onode_count);
    f->dump_int("bluestore_buffers", buffers_bytes);
  }
  void dump_cache_stats(ostream& ss) override {
    int onode_count = 0, buffers_bytes = 0;
    for (auto i: onode_cache_shards) {
      onode_count += i->_get_num();
    }
    for (auto i: buffer_cache_shards) {
      buffers_bytes += i->_get_bytes();
    }
    ss << "bluestore_onode: " << onode_count;
    ss << "bluestore_buffers: " << buffers_bytes;
  }
  int validate_hobject_key(const hobject_t &obj) const override {
    return 0;
  }
  unsigned get_max_attr_name_length() override {
    return 256; // arbitrary; there is no real limit internally
  }

  int mkfs() override;
  int mkjournal() override {
    return 0;
  }

  void get_db_statistics(Formatter *f) override;
  void generate_db_histogram(Formatter *f) override;
  void _shutdown_cache();
  int flush_cache(ostream *os = NULL) override;
  void dump_perf_counters(Formatter *f) override {
    f->open_object_section("perf_counters");
    logger->dump_formatted(f, false);
    f->close_section();
  }
  int add_new_bluefs_device(int id, const string& path);
  int migrate_to_existing_bluefs_device(const set<int>& devs_source,
					int id);
  int migrate_to_new_bluefs_device(const set<int>& devs_source,
				   int id,
				   const string& path);
  int expand_devices(ostream& out);
  string get_device_path(unsigned id);

  int dump_bluefs_sizes(ostream& out);
  int statfs(struct store_statfs_t *buf,
	     osd_alert_list_t* alerts = nullptr) override;
  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
		  bool *per_pool_omap) override;

  void collect_metadata(map<string,string> *pm) override;
  bool exists(CollectionHandle &c, const ghobject_t& oid) override;
  int set_collection_opts(
    CollectionHandle& c,
    const pool_opts_t& opts) override;
  int stat(
    CollectionHandle &c,
    const ghobject_t& oid,
    struct stat *st,
    bool allow_eio = false) override;
  int read(
    CollectionHandle &c,
    const ghobject_t& oid,
    uint64_t offset,
    size_t len,
    bufferlist& bl,
    uint32_t op_flags = 0) override;
private:
  // --------------------------------------------------------
  // intermediate data structures used while reading
  struct region_t {
    uint64_t logical_offset;
    uint64_t blob_xoffset;   //region offset within the blob
    uint64_t length;
    bufferlist bl;           // used later in read process
    uint64_t front = 0;

    region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
      : logical_offset(offset),
	blob_xoffset(b_offs),
	length(len),
	front(front) {}
    region_t(const region_t& from)
      : logical_offset(from.logical_offset),
	blob_xoffset(from.blob_xoffset),
	length(from.length),
	front(from.front) {}

    friend ostream& operator<<(ostream& out, const region_t& r) {
      return out << "0x" << std::hex << r.logical_offset << ":"
		 << r.blob_xoffset << "~" << r.length << std::dec;
    }
  };
  // merged blob read request
  struct read_req_t {
    uint64_t r_off = 0;
    uint64_t r_len = 0;
    bufferlist bl;
    std::list<region_t> regs; // original read regions

    read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}

    friend ostream& operator<<(ostream& out, const read_req_t& r) {
      out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
      for (const auto& reg : r.regs)
	out << reg;
      return out << "]}" << std::dec;
    }
  };
  typedef list<read_req_t> regions2read_t;
  typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
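  // Shape of the mapping (illustrative example, assumed offsets): a logical
  // read of 0x0~0x9000 over an object laid out across two blobs becomes
  //
  //   blobs2read_t {
  //     blobA -> [ read_req_t{r_off=0x0, r_len=0x4000, regs=[...]} ],
  //     blobB -> [ read_req_t{r_off=0x0, r_len=0x5000, regs=[...]} ],
  //   }
  //
  // where each read_req_t merges adjacent region_t entries that fall into
  // the same blob, so each blob is read with as few IOs as possible.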
  void _read_cache(
    OnodeRef o,
    uint64_t offset,
    size_t length,
    int read_cache_policy,
    ready_regions_t& ready_regions,
    blobs2read_t& blobs2read);

  int _prepare_read_ioc(
    blobs2read_t& blobs2read,
    vector<bufferlist>* compressed_blob_bls,
    IOContext* ioc);

  int _generate_read_result_bl(
    OnodeRef o,
    uint64_t offset,
    size_t length,
    ready_regions_t& ready_regions,
    vector<bufferlist>& compressed_blob_bls,
    blobs2read_t& blobs2read,
    bool buffered,
    bool* csum_error,
    bufferlist& bl);

  int _do_read(
    Collection *c,
    OnodeRef o,
    uint64_t offset,
    size_t len,
    bufferlist& bl,
    uint32_t op_flags = 0,
    uint64_t retry_count = 0);

  int _do_readv(
    Collection *c,
    OnodeRef o,
    const interval_set<uint64_t>& m,
    bufferlist& bl,
    uint32_t op_flags = 0,
    uint64_t retry_count = 0);
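  // Read pipeline, as implied by the helper declarations above: _do_read()
  // first consults the buffer cache via _read_cache(), which fills
  // ready_regions with cache hits and collects the misses into blobs2read;
  // _prepare_read_ioc() then issues the device IOs, and once they complete
  // _generate_read_result_bl() stitches cached and freshly read (and, if
  // needed, decompressed) regions back into the caller's bufferlist,
  // verifying checksums along the way.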
  int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
	      uint64_t offset, size_t len, interval_set<uint64_t>& destset);

public:
  int fiemap(CollectionHandle &c, const ghobject_t& oid,
	     uint64_t offset, size_t len, bufferlist& bl) override;
  int fiemap(CollectionHandle &c, const ghobject_t& oid,
	     uint64_t offset, size_t len,
	     map<uint64_t, uint64_t>& destmap) override;

  int readv(
    CollectionHandle &c_,
    const ghobject_t& oid,
    interval_set<uint64_t>& m,
    bufferlist& bl,
    uint32_t op_flags) override;

  int dump_onode(CollectionHandle &c, const ghobject_t& oid,
		 const string& section_name, Formatter *f) override;
  int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
	      bufferptr& value) override;

  int getattrs(CollectionHandle &c, const ghobject_t& oid,
	       map<string,bufferptr>& aset) override;

  int list_collections(vector<coll_t>& ls) override;
  CollectionHandle open_collection(const coll_t &c) override;
  CollectionHandle create_new_collection(const coll_t& cid) override;
  void set_collection_commit_queue(const coll_t& cid,
				   ContextQueue *commit_queue) override;

  bool collection_exists(const coll_t& c) override;
  int collection_empty(CollectionHandle& c, bool *empty) override;
  int collection_bits(CollectionHandle& c) override;

  int collection_list(CollectionHandle &c,
		      const ghobject_t& start,
		      const ghobject_t& end,
		      int max,
		      vector<ghobject_t> *ls, ghobject_t *next) override;

  int collection_list_legacy(CollectionHandle &c,
			     const ghobject_t& start,
			     const ghobject_t& end,
			     int max,
			     vector<ghobject_t> *ls,
			     ghobject_t *next) override;
  int omap_get(
    CollectionHandle &c,         ///< [in] Collection containing oid
    const ghobject_t &oid,       ///< [in] Object containing omap
    bufferlist *header,          ///< [out] omap header
    map<string, bufferlist> *out ///< [out] Key to value map
    ) override;
  int _omap_get(
    Collection *c,               ///< [in] Collection containing oid
    const ghobject_t &oid,       ///< [in] Object containing omap
    bufferlist *header,          ///< [out] omap header
    map<string, bufferlist> *out ///< [out] Key to value map
    );
  int _onode_omap_get(
    const OnodeRef &o,           ///< [in] Object containing omap
    bufferlist *header,          ///< [out] omap header
    map<string, bufferlist> *out ///< [out] Key to value map
    );
  /// Get omap header
  int omap_get_header(
    CollectionHandle &c,   ///< [in] Collection containing oid
    const ghobject_t &oid, ///< [in] Object containing omap
    bufferlist *header,    ///< [out] omap header
    bool allow_eio = false ///< [in] don't assert on eio
    ) override;

  /// Get keys defined on oid
  int omap_get_keys(
    CollectionHandle &c,   ///< [in] Collection containing oid
    const ghobject_t &oid, ///< [in] Object containing omap
    set<string> *keys      ///< [out] Keys defined on oid
    ) override;
  /// Get key values
  int omap_get_values(
    CollectionHandle &c,         ///< [in] Collection containing oid
    const ghobject_t &oid,       ///< [in] Object containing omap
    const set<string> &keys,     ///< [in] Keys to get
    map<string, bufferlist> *out ///< [out] Returned keys and values
    ) override;

  /// Get key values, resuming iteration after a given key
  int omap_get_values(
    CollectionHandle &c,         ///< [in] Collection containing oid
    const ghobject_t &oid,       ///< [in] Object containing omap
    const std::optional<string> &start_after, ///< [in] Resume after this key
    map<string, bufferlist> *out ///< [out] Returned keys and values
    ) override;
  /// Filters keys into out which are defined on oid
  int omap_check_keys(
    CollectionHandle &c,     ///< [in] Collection containing oid
    const ghobject_t &oid,   ///< [in] Object containing omap
    const set<string> &keys, ///< [in] Keys to check
    set<string> *out         ///< [out] Subset of keys defined on oid
    ) override;

  ObjectMap::ObjectMapIterator get_omap_iterator(
    CollectionHandle &c,   ///< [in] collection
    const ghobject_t &oid  ///< [in] object
    ) override;
  void set_fsid(uuid_d u) override {
    fsid = u;
  }
  uuid_d get_fsid() override {
    return fsid;
  }

  uint64_t estimate_objects_overhead(uint64_t num_objects) override {
    return num_objects * 300; // assuming per-object metadata overhead is 300 bytes
  }
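  // Back-of-envelope example for the constant above (assumed workload):
  // 1M objects * 300 bytes ~= 300 MB of estimated onode/metadata overhead.
  // This is a deliberately rough planning figure, not a measured value.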
  struct BSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
      ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
      return ret;
    }

    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;

  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }
  const PerfCounters* get_perf_counters() const override {
    return logger;
  }
  const PerfCounters* get_bluefs_perf_counters() const {
    return bluefs->get_perf_counters();
  }
  int queue_transactions(
    CollectionHandle& ch,
    vector<Transaction>& tls,
    TrackedOpRef op = TrackedOpRef(),
    ThreadPool::TPHandle *handle = NULL) override;
  void inject_data_error(const ghobject_t& o) override {
    std::unique_lock l(debug_read_error_lock);
    debug_data_error_objects.insert(o);
  }
  void inject_mdata_error(const ghobject_t& o) override {
    std::unique_lock l(debug_read_error_lock);
    debug_mdata_error_objects.insert(o);
  }
  /// methods to inject various errors fsck can repair
  void inject_broken_shared_blob_key(const string& key,
				     const bufferlist& bl);
  void inject_leaked(uint64_t len);
  void inject_false_free(coll_t cid, ghobject_t oid);
  void inject_statfs(const string& key, const store_statfs_t& new_statfs);
  void inject_global_statfs(const store_statfs_t& new_statfs);
  void inject_misreference(coll_t cid1, ghobject_t oid1,
			   coll_t cid2, ghobject_t oid2,
			   uint64_t offset);
  void inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, int16_t blob_id);
  // resets global per_pool_omap in DB
  void inject_legacy_omap();
  // resets per_pool_omap | pgmeta_omap for onode
  void inject_legacy_omap(coll_t cid, ghobject_t oid);
  void compact() override {
    ceph_assert(db);
    db->compact();
  }
  bool has_builtin_csum() const override {
    return true;
  }

  /*
   * Allocates space for BlueFS from the slow device. Either automatically
   * applies the allocated extents to the underlying BlueFS (extents ==
   * nullptr) or just returns them via the provided non-null extents vector.
   */
  int allocate_bluefs_freespace(
    uint64_t min_size,
    uint64_t size,
    PExtentVector* extents);
  inline void log_latency(const char* name,
			  int idx,
			  const ceph::timespan& lat,
			  double lat_threshold,
			  const char* info = "") const;

  inline void log_latency_fn(const char* name,
			     int idx,
			     const ceph::timespan& lat,
			     double lat_threshold,
			     std::function<string (const ceph::timespan& lat)> fn) const;
  bool _debug_data_eio(const ghobject_t& o) {
    if (!cct->_conf->bluestore_debug_inject_read_err) {
      return false;
    }
    std::shared_lock l(debug_read_error_lock);
    return debug_data_error_objects.count(o);
  }
  bool _debug_mdata_eio(const ghobject_t& o) {
    if (!cct->_conf->bluestore_debug_inject_read_err) {
      return false;
    }
    std::shared_lock l(debug_read_error_lock);
    return debug_mdata_error_objects.count(o);
  }
  void _debug_obj_on_delete(const ghobject_t& o) {
    if (cct->_conf->bluestore_debug_inject_read_err) {
      std::unique_lock l(debug_read_error_lock);
      debug_data_error_objects.erase(o);
      debug_mdata_error_objects.erase(o);
    }
  }
  ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
  string failed_cmode;
  set<string> failed_compressors;
  string spillover_alert;
  string legacy_statfs_alert;
  string no_per_pool_omap_alert;
  string disk_size_mismatch_alert;

  void _log_alerts(osd_alert_list_t& alerts);
  bool _set_compression_alert(bool cmode, const char* s) {
    std::lock_guard l(qlock);
    if (cmode) {
      bool ret = failed_cmode.empty();
      failed_cmode = s;
      return ret;
    }
    return failed_compressors.emplace(s).second;
  }
  void _clear_compression_alert() {
    std::lock_guard l(qlock);
    failed_compressors.clear();
    failed_cmode.clear();
  }

  void _set_spillover_alert(const string& s) {
    std::lock_guard l(qlock);
    spillover_alert = s;
  }
  void _clear_spillover_alert() {
    std::lock_guard l(qlock);
    spillover_alert.clear();
  }

  void _check_legacy_statfs_alert();
  void _check_no_per_pool_omap_alert();
  void _set_disk_size_mismatch_alert(const string& s) {
    std::lock_guard l(qlock);
    disk_size_mismatch_alert = s;
  }
  // --------------------------------------------------------
  // read processing internal methods
  int _verify_csum(
    OnodeRef& o,
    const bluestore_blob_t* blob,
    uint64_t blob_xoffset,
    const bufferlist& bl,
    uint64_t logical_offset) const;
  int _decompress(bufferlist& source, bufferlist* result);
  // --------------------------------------------------------
  // write ops

  struct WriteContext {
    bool buffered = false;          ///< buffered write
    bool compress = false;          ///< compressed write
    uint64_t target_blob_size = 0;  ///< target (max) blob size
    unsigned csum_order = 0;        ///< target checksum chunk order

    old_extent_map_t old_extents;   ///< must deref these blobs
    interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection

    struct write_item {
      uint64_t logical_offset;      ///< write logical offset
      BlobRef b;
      uint64_t blob_length;
      uint64_t b_off;
      bufferlist bl;
      uint64_t b_off0;  ///< original offset in a blob prior to padding
      uint64_t length0; ///< original data length prior to padding

      bool mark_unused;
      bool new_blob;    ///< whether new blob was created

      bool compressed = false;
      bufferlist compressed_bl;
      size_t compressed_len = 0;

      write_item(
	uint64_t logical_offs,
	BlobRef b,
	uint64_t blob_len,
	uint64_t o,
	bufferlist& bl,
	uint64_t o0,
	uint64_t l0,
	bool _mark_unused,
	bool _new_blob)
	:
	logical_offset(logical_offs),
	b(b),
	blob_length(blob_len),
	b_off(o),
	bl(bl),
	b_off0(o0),
	length0(l0),
	mark_unused(_mark_unused),
	new_blob(_new_blob) {}
    };
    vector<write_item> writes;      ///< blobs we're writing

    /// partial clone of the context: copies the write settings only,
    /// not the accumulated writes
    void fork(const WriteContext& other) {
      buffered = other.buffered;
      compress = other.compress;
      target_blob_size = other.target_blob_size;
      csum_order = other.csum_order;
    }
    void write(
      uint64_t loffs,
      BlobRef b,
      uint64_t blob_len,
      uint64_t o,
      bufferlist& bl,
      uint64_t o0,
      uint64_t len0,
      bool _mark_unused,
      bool _new_blob) {
      writes.emplace_back(loffs,
			  b,
			  blob_len,
			  o,
			  bl,
			  o0,
			  len0,
			  _mark_unused,
			  _new_blob);
    }
    /// Checks for writes to the same pextent within a blob
    bool has_conflict(
      BlobRef b,
      uint64_t loffs,
      uint64_t loffs_end,
      uint64_t min_alloc_size);
  };
  void _do_write_small(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx);
  void _do_write_big(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx);
  int _do_alloc_write(
    TransContext *txc,
    CollectionRef c,
    OnodeRef o,
    WriteContext *wctx);
  void _wctx_finish(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef o,
    WriteContext *wctx,
    set<SharedBlob*> *maybe_unshared_blobs=0);

  int _write(TransContext *txc,
	     CollectionRef& c,
	     OnodeRef& o,
	     uint64_t offset, size_t len,
	     bufferlist& bl,
	     uint32_t fadvise_flags);
  void _pad_zeros(bufferlist *bl, uint64_t *offset,
		  uint64_t chunk_size);
  void _choose_write_options(CollectionRef& c,
			     OnodeRef o,
			     uint32_t fadvise_flags,
			     WriteContext *wctx);

  int _do_gc(TransContext *txc,
	     CollectionRef& c,
	     OnodeRef o,
	     const WriteContext& wctx,
	     uint64_t *dirty_start,
	     uint64_t *dirty_end);

  int _do_write(TransContext *txc,
		CollectionRef &c,
		OnodeRef o,
		uint64_t offset, uint64_t length,
		bufferlist& bl,
		uint32_t fadvise_flags);
  void _do_write_data(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef o,
		      uint64_t offset,
		      uint64_t length,
		      bufferlist& bl,
		      WriteContext *wctx);
  int _touch(TransContext *txc,
	     CollectionRef& c,
	     OnodeRef& o);
  int _do_zero(TransContext *txc,
	       CollectionRef& c,
	       OnodeRef& o,
	       uint64_t offset, size_t len);
  int _zero(TransContext *txc,
	    CollectionRef& c,
	    OnodeRef& o,
	    uint64_t offset, size_t len);
  void _do_truncate(TransContext *txc,
		    CollectionRef& c,
		    OnodeRef o,
		    uint64_t offset,
		    set<SharedBlob*> *maybe_unshared_blobs=0);
  int _truncate(TransContext *txc,
		CollectionRef& c,
		OnodeRef& o,
		uint64_t offset);
  int _remove(TransContext *txc,
	      CollectionRef& c,
	      OnodeRef& o);
  int _do_remove(TransContext *txc,
		 CollectionRef& c,
		 OnodeRef o);
  int _setattr(TransContext *txc,
	       CollectionRef& c,
	       OnodeRef& o,
	       const string& name,
	       bufferptr& val);
  int _setattrs(TransContext *txc,
		CollectionRef& c,
		OnodeRef& o,
		const map<string,bufferptr>& aset);
  int _rmattr(TransContext *txc,
	      CollectionRef& c,
	      OnodeRef& o,
	      const string& name);
  int _rmattrs(TransContext *txc,
	       CollectionRef& c,
	       OnodeRef& o);
  void _do_omap_clear(TransContext *txc, OnodeRef &o);
  int _omap_clear(TransContext *txc,
		  CollectionRef& c,
		  OnodeRef& o);
  int _omap_setkeys(TransContext *txc,
		    CollectionRef& c,
		    OnodeRef& o,
		    bufferlist& bl);
  int _omap_setheader(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef& o,
		      bufferlist& header);
  int _omap_rmkeys(TransContext *txc,
		   CollectionRef& c,
		   OnodeRef& o,
		   bufferlist& bl);
  int _omap_rmkey_range(TransContext *txc,
			CollectionRef& c,
			OnodeRef& o,
			const string& first, const string& last);
  int _set_alloc_hint(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o,
    uint64_t expected_object_size,
    uint64_t expected_write_size,
    uint32_t flags);
  int _do_clone_range(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef& oldo,
		      OnodeRef& newo,
		      uint64_t srcoff, uint64_t length, uint64_t dstoff);
  int _clone(TransContext *txc,
	     CollectionRef& c,
	     OnodeRef& oldo,
	     OnodeRef& newo);
  int _clone_range(TransContext *txc,
		   CollectionRef& c,
		   OnodeRef& oldo,
		   OnodeRef& newo,
		   uint64_t srcoff, uint64_t length, uint64_t dstoff);
  int _rename(TransContext *txc,
	      CollectionRef& c,
	      OnodeRef& oldo,
	      OnodeRef& newo,
	      const ghobject_t& new_oid);
  int _create_collection(TransContext *txc, const coll_t &cid,
			 unsigned bits, CollectionRef *c);
  int _remove_collection(TransContext *txc, const coll_t &cid,
			 CollectionRef *c);
  void _do_remove_collection(TransContext *txc, CollectionRef *c);
  int _split_collection(TransContext *txc,
			CollectionRef& c,
			CollectionRef& d,
			unsigned bits, int rem);
  int _merge_collection(TransContext *txc,
			CollectionRef *c,
			CollectionRef& d,
			unsigned bits);

  void _collect_allocation_stats(uint64_t need, uint32_t alloc_size,
				 size_t extents);
  void _record_allocation_stats();
  uint64_t probe_count = 0;
  std::atomic<uint64_t> alloc_stats_count = {0};
  std::atomic<uint64_t> alloc_stats_fragments = {0};
  std::atomic<uint64_t> alloc_stats_size = {0};

  std::array<std::tuple<uint64_t, uint64_t, uint64_t>, 5> alloc_stats_history =
    { std::make_tuple(0ul, 0ul, 0ul) };

  std::atomic<uint64_t> out_of_sync_fm = {0};
  // --------------------------------------------------------
  // BlueFSDeviceExpander implementation
  uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
					   uint64_t bluefs_total) override {
    auto delta = _get_bluefs_size_delta(bluefs_free, bluefs_total);
    return delta > 0 ? delta : 0;
  }
  int allocate_freespace(
    uint64_t min_size,
    uint64_t size,
    PExtentVector& extents) override {
    return allocate_bluefs_freespace(min_size, size, &extents);
  }
  uint64_t available_freespace(uint64_t alloc_size) override;
  inline bool _use_rotational_settings();
3264 inline bool _use_rotational_settings();
3269 int64_t pool_id
= INT64_MIN
;
3270 list
<ghobject_t
> oids
;
3271 BlueStore::SharedBlobRef sb
;
3272 bluestore_extent_ref_map_t ref_map
;
3273 bool compressed
= false;
3274 bool passed
= false;
3275 bool updated
= false;
3277 typedef btree::btree_set
<
3278 uint64_t, std::less
<uint64_t>,
3279 mempool::bluestore_fsck::pool_allocator
<uint64_t>> uint64_t_btree_t
;
3281 typedef mempool::bluestore_fsck::map
<uint64_t, sb_info_t
> sb_info_map_t
;
  struct FSCK_ObjectCtx {
    int64_t& errors;
    int64_t& warnings;
    uint64_t& num_objects;
    uint64_t& num_extents;
    uint64_t& num_blobs;
    uint64_t& num_sharded_objects;
    uint64_t& num_spanning_blobs;

    mempool_dynamic_bitset* used_blocks;
    uint64_t_btree_t* used_omap_head;

    ceph::mutex* sb_info_lock;
    sb_info_map_t& sb_info;

    store_statfs_t& expected_store_statfs;
    per_pool_statfs& expected_pool_statfs;
    BlueStoreRepairer* repairer;

    FSCK_ObjectCtx(int64_t& e,
		   int64_t& w,
		   uint64_t& _num_objects,
		   uint64_t& _num_extents,
		   uint64_t& _num_blobs,
		   uint64_t& _num_sharded_objects,
		   uint64_t& _num_spanning_blobs,
		   mempool_dynamic_bitset* _ub,
		   uint64_t_btree_t* _used_omap_head,
		   ceph::mutex* _sb_info_lock,
		   sb_info_map_t& _sb_info,
		   store_statfs_t& _store_statfs,
		   per_pool_statfs& _pool_statfs,
		   BlueStoreRepairer* _repairer) :
      errors(e),
      warnings(w),
      num_objects(_num_objects),
      num_extents(_num_extents),
      num_blobs(_num_blobs),
      num_sharded_objects(_num_sharded_objects),
      num_spanning_blobs(_num_spanning_blobs),
      used_blocks(_ub),
      used_omap_head(_used_omap_head),
      sb_info_lock(_sb_info_lock),
      sb_info(_sb_info),
      expected_store_statfs(_store_statfs),
      expected_pool_statfs(_pool_statfs),
      repairer(_repairer) {
    }
  };
  OnodeRef fsck_check_objects_shallow(
    FSCKDepth depth,
    int64_t pool_id,
    CollectionRef c,
    const ghobject_t& oid,
    const string& key,
    const bufferlist& value,
    mempool::bluestore_fsck::list<string>* expecting_shards,
    map<BlobRef, bluestore_blob_t::unused_t>* referenced,
    const BlueStore::FSCK_ObjectCtx& ctx);

private:
  void _fsck_check_object_omap(FSCKDepth depth,
			       OnodeRef& o,
			       const BlueStore::FSCK_ObjectCtx& ctx);

  void _fsck_check_objects(FSCKDepth depth,
			   FSCK_ObjectCtx& ctx);
};
inline ostream& operator<<(ostream& out, const BlueStore::volatile_statfs& s) {
  return out
    << " allocated:"
    << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
    << " stored:"
    << s.values[BlueStore::volatile_statfs::STATFS_STORED]
    << " compressed:"
    << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
    << " compressed_orig:"
    << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
    << " compressed_alloc:"
    << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
}
static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
  o->get();
}
static inline void intrusive_ptr_release(BlueStore::Onode *o) {
  o->put();
}

static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
  o->get();
}
static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
  o->put();
}
class BlueStoreRepairer
{
public:
  // to simplify future potential migration to mempools
  using fsck_interval = interval_set<uint64_t>;

  // Structure to track which pextents are used by a specific cid/oid.
  // As with a Bloom filter, positive and false-positive matches are
  // possible, but false negatives are not.
  // Maintains two lists of bloom filters, one for cids and one for oids,
  // where each list entry is a BF for a specific disk pextent.
  // The length of the extent per filter is measured on init.
  // Allows filtering out 'uninteresting' pextents to speed up subsequent
  // 'is_used' access.
  struct StoreSpaceTracker {
    const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
    const uint64_t BLOOM_FILTER_TABLE_SIZE = 32;     // bytes per single filter
    const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrarily selected
    static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;

    typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
    bloom_vector collections_bfs;
    bloom_vector objects_bfs;

    bool was_filtered_out = false;
    uint64_t granularity = 0; // extent length for a single filter

    StoreSpaceTracker() {
    }
    StoreSpaceTracker(const StoreSpaceTracker& from) :
      collections_bfs(from.collections_bfs),
      objects_bfs(from.objects_bfs),
      granularity(from.granularity) {
    }
    void init(uint64_t total,
	      uint64_t min_alloc_size,
	      uint64_t mem_cap = DEF_MEM_CAP) {
      ceph_assert(!granularity); // not initialized yet
      ceph_assert(min_alloc_size && isp2(min_alloc_size));
      ceph_assert(mem_cap);

      total = round_up_to(total, min_alloc_size);
      granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;

      if (!granularity) {
	granularity = min_alloc_size;
      } else {
	granularity = round_up_to(granularity, min_alloc_size);
      }

      uint64_t entries = round_up_to(total, granularity) / granularity;
      collections_bfs.resize(entries,
	bloom_filter(BLOOM_FILTER_SALT_COUNT,
		     BLOOM_FILTER_TABLE_SIZE,
		     0,
		     BLOOM_FILTER_EXPECTED_COUNT));
      objects_bfs.resize(entries,
	bloom_filter(BLOOM_FILTER_SALT_COUNT,
		     BLOOM_FILTER_TABLE_SIZE,
		     0,
		     BLOOM_FILTER_EXPECTED_COUNT));
    }
    inline uint32_t get_hash(const coll_t& cid) const {
      return cid.hash_to_shard(1);
    }
    inline void set_used(uint64_t offset, uint64_t len,
			 const coll_t& cid, const ghobject_t& oid) {
      ceph_assert(granularity); // initialized

      // can't call this func after filter_out has been applied
      ceph_assert(!was_filtered_out);
      len = len ? len : 1;
      auto pos = offset / granularity;
      auto end_pos = (offset + len - 1) / granularity;
      while (pos <= end_pos) {
	collections_bfs[pos].insert(get_hash(cid));
	objects_bfs[pos].insert(oid.hobj.get_hash());
	++pos;
      }
    }
    // filter out entries unrelated to the specified (broken) extents;
    // only 'is_used' calls are permitted afterwards
    size_t filter_out(const fsck_interval& extents);

    // determines if the collection is present after filtering out
    inline bool is_used(const coll_t& cid) const {
      ceph_assert(was_filtered_out);
      for (auto& bf : collections_bfs) {
	if (bf.contains(get_hash(cid))) {
	  return true;
	}
      }
      return false;
    }
    // determines if the object is present after filtering out
    inline bool is_used(const ghobject_t& oid) const {
      ceph_assert(was_filtered_out);
      for (auto& bf : objects_bfs) {
	if (bf.contains(oid.hobj.get_hash())) {
	  return true;
	}
      }
      return false;
    }
    // determines if the collection is present before filtering out
    inline bool is_used(const coll_t& cid, uint64_t offs) const {
      ceph_assert(granularity); // initialized
      ceph_assert(!was_filtered_out);
      auto &bf = collections_bfs[offs / granularity];
      return bf.contains(get_hash(cid));
    }
    // determines if the object is present before filtering out
    inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
      ceph_assert(granularity); // initialized
      ceph_assert(!was_filtered_out);
      auto &bf = objects_bfs[offs / granularity];
      return bf.contains(oid.hobj.get_hash());
    }
  };
public:
  void fix_per_pool_omap(KeyValueDB *db);
  bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
  bool fix_shared_blob(KeyValueDB *db,
		       uint64_t sbid,
		       const bufferlist* bl);
  bool fix_statfs(KeyValueDB *db, const string& key,
		  const store_statfs_t& new_statfs);

  bool fix_leaked(KeyValueDB *db,
		  FreelistManager* fm,
		  uint64_t offset, uint64_t len);
  bool fix_false_free(KeyValueDB *db,
		      FreelistManager* fm,
		      uint64_t offset, uint64_t len);
  bool fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag);
  KeyValueDB::Transaction fix_spanning_blobs(KeyValueDB* db);

  void init(uint64_t total_space, uint64_t lres_tracking_unit_size);

  bool preprocess_misreference(KeyValueDB *db);

  unsigned apply(KeyValueDB* db);

  void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
    misreferenced_extents.union_insert(offs, len);
    if (inc_error) {
      ++to_repair_cnt;
    }
  }
  // In fact this is the only repairer method that is thread-safe!
  void inc_repaired() {
    ++to_repair_cnt;
  }
3539 void inc_repaired() {
3543 StoreSpaceTracker
& get_space_usage_tracker() {
3544 return space_usage_tracker
;
3546 const fsck_interval
& get_misreferences() const {
3547 return misreferenced_extents
;
3549 KeyValueDB::Transaction
get_fix_misreferences_txn() {
3550 return fix_misreferences_txn
;
private:
  std::atomic<unsigned> to_repair_cnt = { 0 };
  KeyValueDB::Transaction fix_per_pool_omap_txn;
  KeyValueDB::Transaction fix_fm_leaked_txn;
  KeyValueDB::Transaction fix_fm_false_free_txn;
  KeyValueDB::Transaction remove_key_txn;
  KeyValueDB::Transaction fix_statfs_txn;
  KeyValueDB::Transaction fix_shared_blob_txn;

  KeyValueDB::Transaction fix_misreferences_txn;
  KeyValueDB::Transaction fix_onode_txn;

  StoreSpaceTracker space_usage_tracker;

  // non-shared extents with multiple references
  fsck_interval misreferenced_extents;
};
class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
{
  template <class T, size_t MaxX, size_t MaxY>
  class matrix_2d {
    T values[MaxX][MaxY];
  public:
    matrix_2d() {
      clear();
    }
    T& at(size_t x, size_t y) {
      ceph_assert(x < MaxX);
      ceph_assert(y < MaxY);
      return values[x][y];
    }
    size_t get_max_x() const {
      return MaxX;
    }
    size_t get_max_y() const {
      return MaxY;
    }
    void clear() {
      memset(values, 0, sizeof(values));
    }
  };
  enum {
    // use 0/nullptr as unset indication
    LEVEL_FIRST = 1,
    LEVEL_LOG = LEVEL_FIRST, // BlueFS log
    LEVEL_WAL,
    LEVEL_DB,
    LEVEL_SLOW,
    LEVEL_MAX
  };
  // add +1 row for corresponding per-device totals
  // add +1 column for per-level actual (taken from file size) total
  typedef matrix_2d<uint64_t,
		    BlueFS::MAX_BDEV + 1,
		    LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;

  per_level_per_dev_usage_t per_level_per_dev_usage;
  // file count per level, add +1 to keep total file count
  uint64_t per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };
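  // Indexing sketch (illustrative): bytes of DB-level files residing on the
  // WAL device are
  //   per_level_per_dev_usage.at(BlueFS::BDEV_WAL, LEVEL_DB - LEVEL_FIRST);
  // the extra device index (BlueFS::MAX_BDEV) accumulates per-level actual
  // totals and the extra level index (LEVEL_MAX - LEVEL_FIRST) accumulates
  // per-device totals, as maintained by add_usage()/sub_usage() below.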
  // Note: maximum per-device totals below might be smaller than corresponding
  // perf counters by up to a single alloc unit (1M) due to the superblock
  // extent. The latter is not accounted for here.
  per_level_per_dev_usage_t per_level_per_dev_max;

  uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
  uint64_t db_avail4slow = 0;
public:
  RocksDBBlueFSVolumeSelector(
    uint64_t _wal_total,
    uint64_t _db_total,
    uint64_t _slow_total,
    uint64_t _level0_size,
    uint64_t _level_base,
    uint64_t _level_multiplier,
    double reserved_factor,
    uint64_t reserved,
    bool new_pol)
  {
    l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
    l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
    l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
    l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;

    if (!new_pol) {
      return;
    }

    // Calculate how much extra space is available at the DB volume.
    // Depending on the presence of an explicit reserved size specification
    // it is either
    // * DB volume size - reserved
    // or
    // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
    if (!reserved) {
      uint64_t prev_levels = _level0_size;
      uint64_t cur_level = _level_base;
      uint64_t cur_threshold = 0;
      do {
	uint64_t next_level = cur_level * _level_multiplier;
	uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
	if (_db_total <= next_threshold) {
	  db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
	  break;
	} else {
	  prev_levels += cur_level;
	  cur_level = next_level;
	  cur_threshold = next_threshold;
	}
      } while (true);
    } else {
      db_avail4slow = _db_total - reserved;
    }
  }
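  // Worked example for the implicit-reservation branch (assumed inputs):
  // _db_total = 60 GiB, _level0_size = 256 MiB, _level_base = 256 MiB,
  // _level_multiplier = 10, reserved_factor = 2, reserved = 0.
  //   pass 1: next = 2.5 GiB, threshold = 0.25 + 0.25 + 5   = 5.5 GiB -> continue
  //   pass 2: next = 25 GiB,  threshold = 0.5  + 2.5  + 50  = 53 GiB  -> continue
  //   pass 3: threshold = 528 GiB >= 60 GiB -> stop,
  //           db_avail4slow = 60 GiB - 53 GiB = 7 GiB
  // i.e. DB volume space beyond what the projected LSM levels could need
  // is made available for data that would otherwise go to the slow device.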
  void* get_hint_for_log() const override {
    return reinterpret_cast<void*>(LEVEL_LOG);
  }
  void* get_hint_by_dir(const string& dirname) const override;
  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
    if (hint == nullptr)
      return;
    size_t pos = (size_t)hint - LEVEL_FIRST;
    for (auto& p : fnode.extents) {
      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
      auto& max = per_level_per_dev_max.at(p.bdev, pos);
      cur += p.length;
      if (cur > max) {
	max = cur;
      }
      {
	//update per-device totals
	auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
	auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
	cur += p.length;
	if (cur > max) {
	  max = cur;
	}
      }
    }
    {
      //update per-level actual totals
      auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
      auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
      cur += fnode.size;
      if (cur > max) {
	max = cur;
      }
    }
    ++per_level_files[pos];
    ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
  }
  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
    if (hint == nullptr)
      return;
    size_t pos = (size_t)hint - LEVEL_FIRST;
    for (auto& p : fnode.extents) {
      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
      ceph_assert(cur >= p.length);
      cur -= p.length;

      //update per-device totals
      auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
      ceph_assert(cur2 >= p.length);
      cur2 -= p.length;
    }
    {
      //update per-level actual totals
      auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
      ceph_assert(cur >= fnode.size);
      cur -= fnode.size;
    }
    ceph_assert(per_level_files[pos] > 0);
    --per_level_files[pos];
    ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
    --per_level_files[LEVEL_MAX - LEVEL_FIRST];
  }
  void add_usage(void* hint, uint64_t fsize) override {
    if (hint == nullptr)
      return;
    size_t pos = (size_t)hint - LEVEL_FIRST;
    //update per-level actual totals
    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
    auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
    cur += fsize;
    if (cur > max) {
      max = cur;
    }
  }
  void sub_usage(void* hint, uint64_t fsize) override {
    if (hint == nullptr)
      return;
    size_t pos = (size_t)hint - LEVEL_FIRST;
    //update per-level actual totals
    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
    ceph_assert(cur >= fsize);
    per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos) -= fsize;
  }
  uint8_t select_prefer_bdev(void* h) override;
  void get_paths(
    const std::string& base,
    BlueFSVolumeSelector::paths& res) const override;
  void dump(ostream& sout) override;