// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */

#ifndef CEPH_OSD_BLUESTORE_H
#define CEPH_OSD_BLUESTORE_H

#include <condition_variable>

#include <boost/intrusive/list.hpp>
#include <boost/intrusive/unordered_set.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/functional/hash.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/circular_buffer.hpp>

#include "include/cpp-btree/btree_set.h"

#include "include/ceph_assert.h"
#include "include/interval_set.h"
#include "include/unordered_map.h"
#include "include/mempool.h"
#include "include/hash.h"
#include "common/bloom_filter.hpp"
#include "common/Finisher.h"
#include "common/ceph_mutex.h"
#include "common/Throttle.h"
#include "common/perf_counters.h"
#include "common/PriorityCache.h"
#include "compressor/Compressor.h"
#include "os/ObjectStore.h"

#include "bluestore_types.h"

#include "common/EventTrace.h"
#include "common/zipkin_trace.h"
class FreelistManager;
class BlueStoreRepairer;

//#define DEBUG_DEFERRED

// constants for Buffer::optimize()
#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N

#define CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
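// Worked example for the slop ratio above: Buffer::maybe_rebuild() (defined
// below) rebuilds a cached buffer's bufferlist when it has more than one
// segment, or when the wasted space in its first segment exceeds
// length / MAX_BUFFER_SLOP_RATIO_DEN.  With the default denominator of 8, a
// 64 KiB cached buffer is rebuilt once more than 8 KiB of its backing
// allocation is slack.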
enum {
  l_bluestore_first = 732430,

  // space utilization stats
  //****************************************
  l_bluestore_allocated,
  l_bluestore_fragmentation,
  l_bluestore_alloc_unit,
  //****************************************

  // Update op processing state latencies
  //****************************************
  l_bluestore_state_prepare_lat,
  l_bluestore_state_aio_wait_lat,
  l_bluestore_state_io_done_lat,
  l_bluestore_state_kv_queued_lat,
  l_bluestore_state_kv_committing_lat,
  l_bluestore_state_kv_done_lat,
  l_bluestore_state_finishing_lat,
  l_bluestore_state_done_lat,

  l_bluestore_state_deferred_queued_lat,
  l_bluestore_state_deferred_aio_wait_lat,
  l_bluestore_state_deferred_cleanup_lat,

  l_bluestore_commit_lat,
  //****************************************

  // Update Transaction stats
  //****************************************
  l_bluestore_throttle_lat,
  l_bluestore_submit_lat,
  //****************************************

  // read op stats
  //****************************************
  l_bluestore_read_onode_meta_lat,
  l_bluestore_read_wait_aio_lat,
  l_bluestore_csum_lat,
  l_bluestore_read_eio,
  l_bluestore_reads_with_retries,
  l_bluestore_read_lat,
  //****************************************

  // kv_thread latencies
  //****************************************
  l_bluestore_kv_flush_lat,
  l_bluestore_kv_commit_lat,
  l_bluestore_kv_sync_lat,
  l_bluestore_kv_final_lat,
  //****************************************

  // write op stats
  //****************************************
  l_bluestore_write_big,
  l_bluestore_write_big_bytes,
  l_bluestore_write_big_blobs,
  l_bluestore_write_big_deferred,

  l_bluestore_write_small,
  l_bluestore_write_small_bytes,
  l_bluestore_write_small_unused,
  l_bluestore_write_small_pre_read,

  l_bluestore_write_pad_bytes,
  l_bluestore_write_penalty_read_ops,
  l_bluestore_write_new,

  l_bluestore_issued_deferred_writes,
  l_bluestore_issued_deferred_write_bytes,
  l_bluestore_submitted_deferred_writes,
  l_bluestore_submitted_deferred_write_bytes,

  l_bluestore_write_big_skipped_blobs,
  l_bluestore_write_big_skipped_bytes,
  l_bluestore_write_small_skipped,
  l_bluestore_write_small_skipped_bytes,
  //****************************************

  // compression stats
  //****************************************
  l_bluestore_compressed,
  l_bluestore_compressed_allocated,
  l_bluestore_compressed_original,
  l_bluestore_compress_lat,
  l_bluestore_decompress_lat,
  l_bluestore_compress_success_count,
  l_bluestore_compress_rejected_count,
  //****************************************

  // onode cache stats
  //****************************************
  l_bluestore_pinned_onodes,
  l_bluestore_onode_hits,
  l_bluestore_onode_misses,
  l_bluestore_onode_shard_hits,
  l_bluestore_onode_shard_misses,
  //****************************************

  // buffer cache stats
  //****************************************
  l_bluestore_buffer_bytes,
  l_bluestore_buffer_hit_bytes,
  l_bluestore_buffer_miss_bytes,
  //****************************************

  // internal stats
  //****************************************
  l_bluestore_onode_reshard,
  l_bluestore_blob_split,
  l_bluestore_extent_compress,
  l_bluestore_gc_merged,
  //****************************************

  // other client ops latencies
  //****************************************
  l_bluestore_omap_seek_to_first_lat,
  l_bluestore_omap_upper_bound_lat,
  l_bluestore_omap_lower_bound_lat,
  l_bluestore_omap_next_lat,
  l_bluestore_omap_get_keys_lat,
  l_bluestore_omap_get_values_lat,
  l_bluestore_omap_clear_lat,
  l_bluestore_clist_lat,
  l_bluestore_remove_lat,
  l_bluestore_truncate_lat,
  //****************************************

  // allocation stats
  //****************************************
  l_bluestore_allocate_hist,
  //****************************************
};
#define META_POOL_ID ((uint64_t)-1ull)

class BlueStore : public ObjectStore,
                  public md_config_obs_t {
public:
  // -----------------------------------------------------
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) override;

  //handler for discard event
  void handle_discard(interval_set<uint64_t>& to_release);

  void _set_compression();
  void _set_throttle_params();
  int _set_cache_sizes();
  void _set_max_defer_interval() {
    max_defer_interval =
      cct->_conf.get_val<double>("bluestore_max_defer_interval");
  }
  typedef std::map<uint64_t, ceph::buffer::list> ready_regions_t;

  struct Collection;
  typedef boost::intrusive_ptr<Collection> CollectionRef;

  struct AioContext {
    virtual void aio_finish(BlueStore *store) = 0;
    virtual ~AioContext() {}
  };
  struct Buffer {
    MEMPOOL_CLASS_HELPERS();

    enum {
      STATE_EMPTY,     ///< empty buffer -- used for cache history
      STATE_CLEAN,     ///< clean data that is up to date
      STATE_WRITING,   ///< data that is being written (io not yet complete)
    };
    static const char *get_state_name(int s) {
      switch (s) {
      case STATE_EMPTY: return "empty";
      case STATE_CLEAN: return "clean";
      case STATE_WRITING: return "writing";
      default: return "???";
      }
    }
    enum {
      FLAG_NOCACHE = 1,  ///< trim when done WRITING (do not become CLEAN)
      // NOTE: fix operator<< when you define a second flag
    };
    static const char *get_flag_name(int s) {
      switch (s) {
      case FLAG_NOCACHE: return "nocache";
      default: return "???";
      }
    }

    BufferSpace *space;
    uint16_t state;              ///< STATE_*
    uint16_t cache_private = 0;  ///< opaque (to us) value used by Cache impl
    uint32_t flags;              ///< FLAG_*
    uint64_t seq;
    uint32_t offset, length;
    ceph::buffer::list data;
    std::shared_ptr<int64_t> cache_age_bin;  ///< cache age bin

    boost::intrusive::list_member_hook<> lru_item;
    boost::intrusive::list_member_hook<> state_item;

    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, ceph::buffer::list& b,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o),
        length(b.length()), data(b) {}

    bool is_empty() const {
      return state == STATE_EMPTY;
    }
    bool is_clean() const {
      return state == STATE_CLEAN;
    }
    bool is_writing() const {
      return state == STATE_WRITING;
    }

    uint32_t end() const {
      return offset + length;
    }

    void truncate(uint32_t newlen) {
      ceph_assert(newlen < length);
      if (data.length()) {
        ceph::buffer::list t;
        t.substr_of(data, 0, newlen);
        data = std::move(t);
      }
      length = newlen;
    }

    void maybe_rebuild() {
      if (data.length() &&
          (data.get_num_buffers() > 1 ||
           data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
        data.rebuild();
      }
    }

    void dump(ceph::Formatter *f) const {
      f->dump_string("state", get_state_name(state));
      f->dump_unsigned("seq", seq);
      f->dump_unsigned("offset", offset);
      f->dump_unsigned("length", length);
      f->dump_unsigned("data_length", data.length());
    }
  };
  struct BufferCacheShard;

  /// map logical extent range (object) onto buffers
  struct BufferSpace {
    enum {
      BYPASS_CLEAN_CACHE = 0x1,  // bypass clean cache
    };

    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
        Buffer,
        boost::intrusive::list_member_hook<>,
        &Buffer::state_item> > state_list_t;

    mempool::bluestore_cache_meta::map<uint32_t, std::unique_ptr<Buffer>>
      buffer_map;

    // we use a bare intrusive list here instead of std::map because
    // it uses less memory and we expect this to be very small (very
    // few IOs in flight to the same Blob at the same time).
    state_list_t writing;   ///< writing buffers, sorted by seq, ascending

    ~BufferSpace() {
      ceph_assert(buffer_map.empty());
      ceph_assert(writing.empty());
    }
    void _add_buffer(BufferCacheShard* cache, Buffer* b, int level, Buffer* near) {
      cache->_audit("_add_buffer start");
      buffer_map[b->offset].reset(b);
      if (b->is_writing()) {
        // we might get already cached data for which resetting mempool is inappropriate,
        // hence calling try_assign_to_mempool
        b->data.try_assign_to_mempool(mempool::mempool_bluestore_writing);
        if (writing.empty() || writing.rbegin()->seq <= b->seq) {
          writing.push_back(*b);
        } else {
          auto it = writing.begin();
          while (it->seq < b->seq) {
            ++it;
          }
          ceph_assert(it->seq >= b->seq);
          // note that this will insert b before it
          // hence the order is maintained
          writing.insert(it, *b);
        }
      } else {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
        cache->_add(b, level, near);
      }
      cache->_audit("_add_buffer end");
    }
    void _rm_buffer(BufferCacheShard* cache, Buffer *b) {
      _rm_buffer(cache, buffer_map.find(b->offset));
    }
    void _rm_buffer(BufferCacheShard* cache,
                    std::map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
      ceph_assert(p != buffer_map.end());
      cache->_audit("_rm_buffer start");
      if (p->second->is_writing()) {
        writing.erase(writing.iterator_to(*p->second));
      }
      cache->_rm(p->second.get());
      buffer_map.erase(p);
      cache->_audit("_rm_buffer end");
    }

    std::map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
      uint32_t offset) {
      auto i = buffer_map.lower_bound(offset);
      if (i != buffer_map.begin()) {
        --i;
        if (i->first + i->second->length <= offset)
          ++i;
      }
      return i;
    }
    // must be called under protection of the Cache lock
    void _clear(BufferCacheShard* cache);

    // return value is the highest cache_private of a trimmed buffer, or 0.
    int discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) {
      std::lock_guard l(cache->lock);
      int ret = _discard(cache, offset, length);
      return ret;
    }
    int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length);
    void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, ceph::buffer::list& bl,
               unsigned flags) {
      std::lock_guard l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
                             flags);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
    }
    void _finish_write(BufferCacheShard* cache, uint64_t seq);
    void did_read(BufferCacheShard* cache, uint32_t offset, ceph::buffer::list& bl) {
      std::lock_guard l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, 1, nullptr);
    }
    void read(BufferCacheShard* cache, uint32_t offset, uint32_t length,
              BlueStore::ready_regions_t& res,
              interval_set<uint32_t>& res_intervals,
              int flags = 0);

    void truncate(BufferCacheShard* cache, uint32_t offset) {
      discard(cache, offset, (uint32_t)-1 - offset);
    }

    void split(BufferCacheShard* cache, size_t pos, BufferSpace &r);

    void dump(BufferCacheShard* cache, ceph::Formatter *f) const {
      std::lock_guard l(cache->lock);
      f->open_array_section("buffers");
      for (auto& i : buffer_map) {
        f->open_object_section("buffer");
        ceph_assert(i.first == i.second->offset);
        i.second->dump(f);
        f->close_section();
      }
      f->close_section();
    }
  };
  struct SharedBlobSet;

  /// in-memory shared blob state (incl cached buffers)
  struct SharedBlob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};  ///< reference count
    bool loaded = false;

    Collection *coll = nullptr;
    uint64_t sbid_unloaded;               ///< sbid if persistent isn't loaded
    bluestore_shared_blob_t *persistent;  ///< persistent part of the shared blob if any

    BufferSpace bc;   ///< buffer cache

    SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
      get_cache()->add_blob();
    }
    SharedBlob(uint64_t i, Collection *_coll);

    uint64_t get_sbid() const {
      return loaded ? persistent->sbid : sbid_unloaded;
    }

    friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
    friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }

    void dump(ceph::Formatter* f) const;
    friend std::ostream& operator<<(std::ostream& out, const SharedBlob& sb);
    /// get logical references
    void get_ref(uint64_t offset, uint32_t length);

    /// put logical references, and get back any released extents
    void put_ref(uint64_t offset, uint32_t length,
                 PExtentVector *r, bool *unshare);

    void finish_write(uint64_t seq);

    friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
      return l.get_sbid() == r.get_sbid();
    }
    inline BufferCacheShard* get_cache() {
      return coll ? coll->cache : nullptr;
    }
    inline SharedBlobSet* get_parent() {
      return coll ? &(coll->shared_blob_set) : nullptr;
    }
    inline bool is_loaded() const {
      return loaded;
    }
  };
  typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
  /// a lookup table of SharedBlobs
  struct SharedBlobSet {
    /// protect lookup, insertion, removal
    ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");

    // we use a bare pointer because we don't want to affect the ref
    // count
    mempool::bluestore_cache_meta::unordered_map<uint64_t,SharedBlob*> sb_map;

    SharedBlobRef lookup(uint64_t sbid) {
      std::lock_guard l(lock);
      auto p = sb_map.find(sbid);
      if (p == sb_map.end() ||
          p->second->nref == 0) {
        return nullptr;
      }
      return p->second;
    }

    void add(Collection* coll, SharedBlob *sb) {
      std::lock_guard l(lock);
      sb_map[sb->get_sbid()] = sb;
      sb->coll = coll;
    }

    bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
      std::lock_guard l(lock);
      ceph_assert(sb->get_parent() == this);
      if (verify_nref_is_zero && sb->nref != 0) {
        return false;
      }
      // only remove if it still points to us
      auto p = sb_map.find(sb->get_sbid());
      if (p != sb_map.end() &&
          p->second == sb) {
        sb_map.erase(p);
      }
      return true;
    }

    bool empty() {
      std::lock_guard l(lock);
      return sb_map.empty();
    }

    template <int LogLevelV>
    void dump(CephContext *cct);
  };
//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/

  /// in-memory blob metadata and associated cached buffers (if any)
  struct Blob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};    ///< reference count
    int16_t id = -1;               ///< id, for spanning blobs only, >= 0
    int16_t last_encoded_id = -1;  ///< (ephemeral) used during encoding only
    SharedBlobRef shared_blob;     ///< shared blob state (if any)

    mutable bluestore_blob_t blob;       ///< decoded blob metadata
    mutable ceph::buffer::list blob_bl;  ///< cached encoded blob, blob is dirty if empty

    /// refs from this shard.  ephemeral if id<0, persisted if spanning.
    bluestore_blob_use_tracker_t used_in_blob;

    friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
    friend void intrusive_ptr_release(Blob *b) { b->put(); }

    void dump(ceph::Formatter* f) const;
    friend std::ostream& operator<<(std::ostream& out, const Blob &b);
    const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
      return used_in_blob;
    }
    bool is_referenced() const {
      return used_in_blob.is_not_empty();
    }
    uint32_t get_referenced_bytes() const {
      return used_in_blob.get_referenced_bytes();
    }

    bool is_spanning() const {
      return id >= 0;
    }

    bool can_split() const {
      std::lock_guard l(shared_blob->get_cache()->lock);
      // splitting a BufferSpace writing list is too hard; don't try.
      return shared_blob->bc.writing.empty() &&
             used_in_blob.can_split() &&
             get_blob().can_split();
    }

    bool can_split_at(uint32_t blob_offset) const {
      return used_in_blob.can_split_at(blob_offset) &&
             get_blob().can_split_at(blob_offset);
    }

    bool can_reuse_blob(uint32_t min_alloc_size,
                        uint32_t target_blob_size,
                        uint32_t b_offset,
                        uint32_t *length0);

    void dup(Blob& o) {
      o.shared_blob = shared_blob;
      o.blob = blob;
    }

    inline const bluestore_blob_t& get_blob() const {
      return blob;
    }
    inline bluestore_blob_t& dirty_blob() {
      return blob;
    }

    /// discard buffers for unallocated regions
    void discard_unallocated(Collection *coll);

    /// get logical references
    void get_ref(Collection *coll, uint32_t offset, uint32_t length);
    /// put logical references, and get back any released extents
    bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
                 PExtentVector *r);

    void split(Collection *coll, uint32_t blob_offset, Blob *o);
#ifdef CACHE_BLOB_BL
    void _encode() const {
      if (blob_bl.length() == 0) {
        encode(blob, blob_bl);
      } else {
        ceph_assert(blob_bl.length());
      }
    }
    void bound_encode(
      size_t& p,
      bool include_ref_map) const {
      _encode();
      p += blob_bl.length();
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      ceph::buffer::list::contiguous_appender& p,
      bool include_ref_map) const {
      _encode();
      p.append(blob_bl);
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection */*coll*/,
      ceph::buffer::ptr::const_iterator& p,
      bool include_ref_map) {
      const char *start = p.get_pos();
      denc(blob, p);
      const char *end = p.get_pos();
      blob_bl.clear();
      blob_bl.append(start, end - start);
      if (include_ref_map) {
        used_in_blob.decode(p);
      }
    }
#else
    void bound_encode(
      size_t& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      ceph::buffer::list::contiguous_appender& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection *coll,
      ceph::buffer::ptr::const_iterator& p,
      uint64_t struct_v,
      uint64_t* sbid,
      bool include_ref_map);
#endif
  };
  typedef boost::intrusive_ptr<Blob> BlobRef;
  typedef mempool::bluestore_cache_meta::map<int,BlobRef> blob_map_t;

  /// a logical extent, pointing to (some portion of) a blob
  typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> >
    ExtentBase; //making an alias to avoid build warnings
  struct Extent : public ExtentBase {
    MEMPOOL_CLASS_HELPERS();

    uint32_t logical_offset = 0;  ///< logical offset
    uint32_t blob_offset = 0;     ///< blob offset
    uint32_t length = 0;          ///< length
    BlobRef blob;                 ///< the blob with our data

    /// ctor for lookup only
    explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
    /// ctor for delayed initialization (see decode_some())
    explicit Extent() : ExtentBase() {
    }
    /// ctor for general usage
    Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : ExtentBase(),
        logical_offset(lo), blob_offset(o), length(l) {
      assign_blob(b);
    }
    ~Extent() {
      if (blob) {
        blob->shared_blob->get_cache()->rm_extent();
      }
    }

    void dump(ceph::Formatter* f) const;

    void assign_blob(const BlobRef& b) {
      ceph_assert(!blob);
      blob = b;
      blob->shared_blob->get_cache()->add_extent();
    }

    // comparators for intrusive_set
    friend bool operator<(const Extent &a, const Extent &b) {
      return a.logical_offset < b.logical_offset;
    }
    friend bool operator>(const Extent &a, const Extent &b) {
      return a.logical_offset > b.logical_offset;
    }
    friend bool operator==(const Extent &a, const Extent &b) {
      return a.logical_offset == b.logical_offset;
    }

    uint32_t blob_start() const {
      return logical_offset - blob_offset;
    }

    uint32_t blob_end() const {
      return blob_start() + blob->get_blob().get_logical_length();
    }

    uint32_t logical_end() const {
      return logical_offset + length;
    }

    // return true if any piece of the blob is out of
    // the given range [o, o + l].
    bool blob_escapes_range(uint32_t o, uint32_t l) const {
      return blob_start() < o || blob_end() > o + l;
    }
  };
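  // Worked example for Extent::blob_escapes_range(): an extent with
  // logical_offset=0x2000 and blob_offset=0x1000 has blob_start()==0x1000; if
  // its blob's logical length is 0x8000 then blob_end()==0x9000, so
  // blob_escapes_range(0x2000, 0x1000) is true because the blob extends both
  // before and beyond the queried logical range.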
  typedef boost::intrusive::set<Extent> extent_map_t;

  friend std::ostream& operator<<(std::ostream& out, const Extent& e);

  struct OldExtent {
    boost::intrusive::list_member_hook<> old_extent_item;
    Extent e;
    bool blob_empty; // flag to track the last removed extent that makes blob
                     // empty - required to update compression stat properly
    OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : e(lo, o, l, b), blob_empty(false) {
    }
    static OldExtent* create(CollectionRef c,
                             uint32_t lo,
                             uint32_t o,
                             uint32_t l,
                             BlobRef& b);
  };
  typedef boost::intrusive::list<
    OldExtent,
    boost::intrusive::member_hook<
      OldExtent,
      boost::intrusive::list_member_hook<>,
      &OldExtent::old_extent_item> > old_extent_map_t;
  /// a sharded extent map, mapping offsets to lextents to blobs
  struct ExtentMap {
    extent_map_t extent_map;       ///< map of Extents to Blobs
    blob_map_t spanning_blob_map;  ///< blobs that span shards
    typedef boost::intrusive_ptr<Onode> OnodeRef;

    struct Shard {
      bluestore_onode_t::shard_info *shard_info = nullptr;
      unsigned extents = 0;  ///< count extents in this shard
      bool loaded = false;   ///< true if shard is loaded
      bool dirty = false;    ///< true if shard is dirty and needs reencoding
    };
    mempool::bluestore_cache_meta::vector<Shard> shards;  ///< shards

    ceph::buffer::list inline_bl;  ///< cached encoded map, if unsharded; empty=>dirty

    uint32_t needs_reshard_begin = 0;
    uint32_t needs_reshard_end = 0;

    void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
             uint64_t&, uint64_t&, uint64_t&);

    bool needs_reshard() const {
      return needs_reshard_end > needs_reshard_begin;
    }
    void clear_needs_reshard() {
      needs_reshard_begin = needs_reshard_end = 0;
    }
    void request_reshard(uint32_t begin, uint32_t end) {
      if (begin < needs_reshard_begin) {
        needs_reshard_begin = begin;
      }
      if (end > needs_reshard_end) {
        needs_reshard_end = end;
      }
    }

    struct DeleteDisposer {
      void operator()(Extent *e) { delete e; }
    };

    ~ExtentMap() {
      extent_map.clear_and_dispose(DeleteDisposer());
    }

    void clear() {
      extent_map.clear_and_dispose(DeleteDisposer());
      clear_needs_reshard();
    }
    void dump(ceph::Formatter* f) const;

    bool encode_some(uint32_t offset, uint32_t length, ceph::buffer::list& bl,
                     unsigned *pn);
    unsigned decode_some(ceph::buffer::list& bl);

    void bound_encode_spanning_blobs(size_t& p);
    void encode_spanning_blobs(ceph::buffer::list::contiguous_appender& p);
    void decode_spanning_blobs(ceph::buffer::ptr::const_iterator& p);

    BlobRef get_spanning_blob(int id) {
      auto p = spanning_blob_map.find(id);
      ceph_assert(p != spanning_blob_map.end());
      return p->second;
    }

    void update(KeyValueDB::Transaction t, bool force);
    decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
    void reshard(
      KeyValueDB *db,
      KeyValueDB::Transaction t);
    /// initialize Shards from the onode
    void init_shards(bool loaded, bool dirty);

    /// return index of shard containing offset
    /// or -1 if not found
    int seek_shard(uint32_t offset) {
      size_t end = shards.size();
      size_t mid, left = 0;
      size_t right = end; // one past the right end

      while (left < right) {
        mid = left + (right - left) / 2;
        if (offset >= shards[mid].shard_info->offset) {
          size_t next = mid + 1;
          if (next >= end || offset < shards[next].shard_info->offset)
            return mid;  // found
          //continue to search forwards
          left = next;
        } else {
          //continue to search backwards
          right = mid;
        }
      }

      return -1; // not found
    }
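    // For example, with shards starting at offsets {0x0, 0x10000, 0x30000},
    // seek_shard(0x12345) returns 1 and seek_shard(0x40000) returns 2, since
    // each shard covers [its offset, the next shard's offset) and the last
    // shard extends to the end of the object.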
    /// check if a range spans a shard
    bool spans_shard(uint32_t offset, uint32_t length) {
      if (shards.empty()) {
        return false;
      }
      int s = seek_shard(offset);
      ceph_assert(s >= 0);
      if (s == (int)shards.size() - 1) {
        return false; // last shard
      }
      if (offset + length <= shards[s+1].shard_info->offset) {
        return false;
      }
      return true;
    }

    /// ensure that a range of the map is loaded
    void fault_range(KeyValueDB *db,
                     uint32_t offset, uint32_t length);

    /// ensure a range of the map is marked dirty
    void dirty_range(uint32_t offset, uint32_t length);

    /// for seek_lextent test
    extent_map_t::iterator find(uint64_t offset);

    /// seek to the first lextent including or after offset
    extent_map_t::iterator seek_lextent(uint64_t offset);
    extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
    void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
      extent_map.insert(*new Extent(lo, o, l, b));
    }

    /// remove (and delete) an Extent
    void rm(extent_map_t::iterator p) {
      extent_map.erase_and_dispose(p, DeleteDisposer());
    }

    bool has_any_lextents(uint64_t offset, uint64_t length);

    /// consolidate adjacent lextents in extent_map
    int compress_extent_map(uint64_t offset, uint64_t length);

    /// punch a logical hole.  add lextents to deref to target list.
    void punch_hole(CollectionRef &c,
                    uint64_t offset, uint64_t length,
                    old_extent_map_t *old_extents);

    /// put new lextent into lextent_map overwriting existing ones if
    /// any and update references accordingly
    Extent *set_lextent(CollectionRef &c,
                        uint64_t logical_offset,
                        uint64_t offset, uint64_t length,
                        BlobRef b,
                        old_extent_map_t *old_extents);

    /// split a blob (and referring extents)
    BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);

    void provide_shard_info_to_onode(bufferlist v, uint32_t shard_id);
  };
  /// Compressed Blob Garbage collector
  /*
  The primary idea of the collector is to estimate the difference between the
  allocation units (AU) currently occupied by compressed blobs and the new AUs
  required to store that data uncompressed.
  Estimation is performed for protrusive extents within a logical range
  determined by a concatenation of the old_extents collection and the specific
  (current) write request.
  The root cause for the old_extents use is the need to handle blob ref counts
  properly. Old extents still hold blob refs and hence we need to traverse
  the collection to determine whether a blob is to be released.
  Protrusive extents are extents that fit into the blob set in action
  (ones that are below the logical range from above) but are not removed totally
  due to the current write.
  E.g. for
  extent1 <loffs = 100, boffs = 100, len = 100> ->
    blob1<compressed, len_on_disk=4096, logical_len=8192>
  extent2 <loffs = 200, boffs = 200, len = 100> ->
    blob2<raw, len_on_disk=4096, llen=4096>
  extent3 <loffs = 300, boffs = 300, len = 100> ->
    blob1<compressed, len_on_disk=4096, llen=8192>
  extent4 <loffs = 4096, boffs = 0, len = 100> ->
    blob3<raw, len_on_disk=4096, llen=4096>
  and a write at 300~100,
  protrusive extents are within the following ranges <0~300, 400~8192-400>.
  In this case the existing AUs that might be removed due to GC are those of
  blob1, and the new AUs expected after GC = 0 since extent1 is to be merged
  into blob2. Hence we should do a collect.
  */
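  // (Spelling out the example above: after the 300~100 write only extent1 still
  // pins the compressed blob1, so GC would rewrite that 100-byte piece
  // uncompressed; per the note above it merges into blob2's existing
  // allocation, so no new AUs are required while blob1's on-disk AUs become
  // releasable, which is why the estimate favours collecting.)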
  class GarbageCollector
  {
  public:
    /// return amount of allocation units that might be saved due to GC
    int64_t estimate(
      uint64_t start_offset,
      uint64_t length,
      const ExtentMap& extent_map,
      const old_extent_map_t& old_extents,
      uint64_t min_alloc_size);
    /// return a collection of extents to perform GC on
    const interval_set<uint64_t>& get_extents_to_collect() const {
      return extents_to_collect;
    }
    GarbageCollector(CephContext* _cct) : cct(_cct) {}

  private:
    struct BlobInfo {
      uint64_t referenced_bytes = 0;     ///< amount of bytes referenced in blob
      int64_t expected_allocations = 0;  ///< new alloc units required
                                         ///< in case of gc fulfilled
      bool collect_candidate = false;    ///< indicate if blob has any extents
                                         ///< eligible for GC.
      extent_map_t::const_iterator first_lextent; ///< points to the first
                                                  ///< lextent referring to
                                                  ///< the blob if any;
                                                  ///< the collect_candidate flag
                                                  ///< determines its validity
      extent_map_t::const_iterator last_lextent;  ///< points to the last
                                                  ///< lextent referring to
                                                  ///< the blob if any.

      BlobInfo(uint64_t ref_bytes) :
        referenced_bytes(ref_bytes) {
      }
    };

    CephContext* cct;
    std::map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
                                              ///< copies that are affected by the
                                              ///< current write

    ///< protrusive extents that should be collected if GC takes place
    interval_set<uint64_t> extents_to_collect;

    boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
                                                ///< unit when traversing
                                                ///< protrusive extents.
                                                ///< Other extents mapped to
                                                ///< this AU to be ignored
                                                ///< (except the case where an
                                                ///< uncompressed extent follows
                                                ///< a compressed one - see below).
    BlobInfo* blob_info_counted = nullptr;  ///< set if the previous allocation unit
                                            ///< caused an expected_allocations
                                            ///< counter increment at this blob;
                                            ///< if an uncompressed extent follows,
                                            ///< a decrement of the
                                            ///< expected_allocations counter
                                            ///< is applied.

    int64_t expected_allocations = 0;  ///< new alloc units required in case
                                       ///< of gc fulfilled
    int64_t expected_for_release = 0;  ///< alloc units currently used by
                                       ///< compressed blobs that might
                                       ///< be released if GC takes place

    void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
                                    uint64_t start_offset,
                                    uint64_t end_offset,
                                    uint64_t start_touch_offset,
                                    uint64_t end_touch_offset,
                                    uint64_t min_alloc_size);
  };
  /// an in-memory object
  struct Onode {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref;  ///< reference count
    std::atomic_int put_nref = {0};

    /// key under PREFIX_OBJ where we are stored
    mempool::bluestore_cache_meta::string key;

    boost::intrusive::list_member_hook<> lru_item;

    bluestore_onode_t onode;  ///< metadata stored as value in kv store
    bool exists;              ///< true if object logically exists
    bool cached;              ///< Onode is logically in the cache
                              ///  (it can be pinned and hence physically out
                              ///  of it at the moment though)
    std::atomic_bool pinned;  ///< Onode is pinned
                              ///  (or should be pinned when cached)
    ExtentMap extent_map;

    // track txc's that have not been committed to kv store (and whose
    // effects cannot be read via the kvdb read methods)
    std::atomic<int> flushing_count = {0};
    std::atomic<int> waiting_count = {0};
    /// protect flush_txns
    ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
    ceph::condition_variable flush_cond;     ///< wait here for uncommitted txns
    std::shared_ptr<int64_t> cache_age_bin;  ///< cache age bin
    Onode(Collection *c, const ghobject_t& o,
          const mempool::bluestore_cache_meta::string& k);
    Onode(Collection* c, const ghobject_t& o,
          const std::string& k);
    Onode(Collection* c, const ghobject_t& o,
          const char* k);

    static Onode* decode(
      CollectionRef c,
      const ghobject_t& oid,
      const std::string& key,
      const ceph::buffer::list& v);

    void dump(ceph::Formatter* f) const;

    inline bool put_cache() {
      ceph_assert(!cached);
      cached = true;
      return !pinned;
    }
    inline bool pop_cache() {
      ceph_assert(cached);
      cached = false;
      return !pinned;
    }

    static const std::string& calc_omap_prefix(uint8_t flags);
    static void calc_omap_header(uint8_t flags, const Onode* o,
                                 std::string* out);
    static void calc_omap_key(uint8_t flags, const Onode* o,
                              const std::string& key, std::string* out);
    static void calc_omap_tail(uint8_t flags, const Onode* o,
                               std::string* out);

    const std::string& get_omap_prefix() {
      return calc_omap_prefix(onode.flags);
    }
    void get_omap_header(std::string* out) {
      calc_omap_header(onode.flags, this, out);
    }
    void get_omap_key(const std::string& key, std::string* out) {
      calc_omap_key(onode.flags, this, key, out);
    }
    void get_omap_tail(std::string* out) {
      calc_omap_tail(onode.flags, this, out);
    }

    void rewrite_omap_key(const std::string& old, std::string *out);
    void decode_omap_key(const std::string& key, std::string *user_key);
    // Return the offset of an object on disk.  This function is intended *only*
    // for use with zoned storage devices because in these devices, the objects
    // are laid out contiguously on disk, which is not the case in general.
    // Also, it should always be called after calling extent_map.fault_range(),
    // so that the extent map is loaded.
    int64_t zoned_get_ondisk_starting_offset() const {
      return extent_map.extent_map.begin()->blob->
        get_blob().calc_offset(0, nullptr);
    }
  };
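  // Illustrative call sequence (assumes a zoned/SMR deployment): first load the
  // relevant range with o->extent_map.fault_range(db, 0, o->onode.size), then
  // o->zoned_get_ondisk_starting_offset() yields the object's starting disk
  // offset, from which the owning zone is simply offset / zone_size.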
  typedef boost::intrusive_ptr<Onode> OnodeRef;

  /// A generic Cache Shard
  struct CacheShard {
    CephContext *cct;
    PerfCounters *logger;

    /// protect lru and other structures
    ceph::recursive_mutex lock = {
      ceph::make_recursive_mutex("BlueStore::CacheShard::lock") };

    std::atomic<uint64_t> max = {0};
    std::atomic<uint64_t> num = {0};
    boost::circular_buffer<std::shared_ptr<int64_t>> age_bins;

    CacheShard(CephContext* cct) : cct(cct), logger(nullptr), age_bins(1) {
      shift_bins();
    }
    virtual ~CacheShard() {}

    void set_max(uint64_t max_) {
      max = max_;
      _trim_to(max);
    }

    uint64_t _get_num() {
      return num;
    }

    virtual void _trim_to(uint64_t new_size) = 0;
    void _trim() {
      if (cct->_conf->objectstore_blackhole) {
        // do not trim if we are throwing away IOs a layer down
        return;
      }
      _trim_to(max);
    }

    void trim() {
      std::lock_guard l(lock);
      _trim();
    }

    void flush() {
      std::lock_guard l(lock);
      // we should not be shutting down after the blackhole is enabled
      ceph_assert(!cct->_conf->objectstore_blackhole);
      _trim_to(0);
    }

    virtual void shift_bins() {
      std::lock_guard l(lock);
      age_bins.push_front(std::make_shared<int64_t>(0));
    }
    virtual uint32_t get_bin_count() {
      std::lock_guard l(lock);
      return age_bins.capacity();
    }
    virtual void set_bin_count(uint32_t count) {
      std::lock_guard l(lock);
      age_bins.set_capacity(count);
    }
    virtual uint64_t sum_bins(uint32_t start, uint32_t end) {
      std::lock_guard l(lock);
      auto size = age_bins.size();
      uint64_t count = 0;
      end = (size < end) ? size : end;
      for (auto i = start; i < end; i++) {
        count += *(age_bins[i]);
      }
      return count;
    }

#ifdef DEBUG_CACHE
    virtual void _audit(const char *s) = 0;
#else
    void _audit(const char *s) { /* no-op */ }
#endif
  };
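  // Rough sketch of how the age bins above are meant to work: shift_bins()
  // pushes a fresh zero-valued bin to the front of the circular buffer, so the
  // front bin accumulates the most recently touched entries and older activity
  // migrates toward the back until it falls off the end; sum_bins(start, end)
  // then reports how much cached data lies in that age window, which
  // higher-level cache sizing logic can consume.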
  /// A Generic onode Cache Shard
  struct OnodeCacheShard : public CacheShard {
    std::atomic<uint64_t> num_pinned = {0};
    std::array<std::pair<ghobject_t, ceph::mono_clock::time_point>, 64> dumped_onodes;

    virtual void _pin(Onode* o) = 0;
    virtual void _unpin(Onode* o) = 0;

    OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
    static OnodeCacheShard *create(CephContext* cct, std::string type,
                                   PerfCounters *logger);
    virtual void _add(Onode* o, int level) = 0;
    virtual void _rm(Onode* o) = 0;
    virtual void _unpin_and_rm(Onode* o) = 0;

    virtual void move_pinned(OnodeCacheShard *to, Onode *o) = 0;
    virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0;
    bool empty() {
      return _get_num() == 0;
    }
  };
  /// A Generic buffer Cache Shard
  struct BufferCacheShard : public CacheShard {
    std::atomic<uint64_t> num_extents = {0};
    std::atomic<uint64_t> num_blobs = {0};
    uint64_t buffer_bytes = 0;

    BufferCacheShard(CephContext* cct) : CacheShard(cct) {}
    static BufferCacheShard *create(CephContext* cct, std::string type,
                                    PerfCounters *logger);
    virtual void _add(Buffer *b, int level, Buffer *near) = 0;
    virtual void _rm(Buffer *b) = 0;
    virtual void _move(BufferCacheShard *src, Buffer *b) = 0;
    virtual void _touch(Buffer *b) = 0;
    virtual void _adjust_size(Buffer *b, int64_t delta) = 0;

    uint64_t _get_bytes() {
      return buffer_bytes;
    }

    void add_extent() {
      ++num_extents;
    }
    void rm_extent() {
      --num_extents;
    }
    void add_blob() {
      ++num_blobs;
    }
    void rm_blob() {
      --num_blobs;
    }

    virtual void add_stats(uint64_t *extents,
                           uint64_t *blobs,
                           uint64_t *buffers,
                           uint64_t *bytes) = 0;

    bool empty() {
      std::lock_guard l(lock);
      return _get_bytes() == 0;
    }
  };
  struct OnodeSpace {
    OnodeCacheShard *cache;

  private:
    mempool::bluestore_cache_meta::unordered_map<ghobject_t,OnodeRef> onode_map;

    friend struct Collection; // for split_cache()
    friend struct Onode; // for put()
    friend struct LruOnodeCacheShard;
    void _remove(const ghobject_t& oid);

  public:
    OnodeSpace(OnodeCacheShard *c) : cache(c) {}

    OnodeRef add(const ghobject_t& oid, OnodeRef& o);
    OnodeRef lookup(const ghobject_t& o);
    void rename(OnodeRef& o, const ghobject_t& old_oid,
                const ghobject_t& new_oid,
                const mempool::bluestore_cache_meta::string& new_okey);

    template <int LogLevelV>
    void dump(CephContext *cct);

    /// return true if f true for any item
    bool map_any(std::function<bool(Onode*)> f);
  };
  using OpSequencerRef = ceph::ref_t<OpSequencer>;

  struct Collection : public CollectionImpl {
    BufferCacheShard *cache;  ///< our cache shard
    bluestore_cnode_t cnode;
    ceph::shared_mutex lock =
      ceph::make_shared_mutex("BlueStore::Collection::lock", true, false);

    SharedBlobSet shared_blob_set;  ///< open SharedBlobs

    // cache onodes on a per-collection basis to avoid lock
    // contention.
    OnodeSpace onode_map;

    pool_opts_t pool_opts;
    ContextQueue *commit_queue;

    OnodeCacheShard* get_onode_cache() const {
      return onode_map.cache;
    }
    OnodeRef get_onode(const ghobject_t& oid, bool create, bool is_createop=false);

    // the terminology is confusing here, sorry!
    //
    //  blob_t     shared_blob_t
    //  !shared    unused          -> open
    //  shared     !loaded         -> open + shared
    //  shared     loaded          -> open + shared + loaded
    //
    //  open = SharedBlob is instantiated
    //  shared = blob_t shared flag is set; SharedBlob is hashed.
    //  loaded = SharedBlob::shared_blob_t is loaded from kv store
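    // Informal restatement: every in-memory Blob gets a SharedBlob wrapper
    // ("open"); only once the on-disk blob_t carries the shared flag is that
    // SharedBlob registered in shared_blob_set ("shared"), and only once its
    // bluestore_shared_blob_t record has been read back from the kv store does
    // it count as "loaded" (see load_shared_blob() below).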
    void open_shared_blob(uint64_t sbid, BlobRef b);
    void load_shared_blob(SharedBlobRef sb);
    void make_blob_shared(uint64_t sbid, BlobRef b);
    uint64_t make_blob_unshared(SharedBlob *sb);

    BlobRef new_blob() {
      BlobRef b = new Blob();
      b->shared_blob = new SharedBlob(this);
      return b;
    }

    bool contains(const ghobject_t& oid) {
      if (cid.is_meta())
        return oid.hobj.pool == -1;
      spg_t spgid;
      if (cid.is_pg(&spgid))
        return
          spgid.pgid.contains(cnode.bits, oid) &&
          oid.shard_id == spgid.shard;
      return false;
    }

    int64_t pool() const {
      return cid.pool();
    }

    void split_cache(Collection *dest);

    bool flush_commit(Context *c) override;
    void flush() override;
    void flush_all_but_last();

    Collection(BlueStore *ns, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t c);
  };
  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
    CollectionRef c;
    OnodeRef o;
    KeyValueDB::Iterator it;
    std::string head, tail;

    std::string _stringify() const;

  public:
    OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
    int seek_to_first() override;
    int upper_bound(const std::string &after) override;
    int lower_bound(const std::string &to) override;
    bool valid() override;
    int next() override;
    std::string key() override;
    ceph::buffer::list value() override;
    std::string tail_key() override {
      return tail;
    }
    int status() override {
      return 0;
    }
  };
  struct volatile_statfs {
    enum {
      STATFS_ALLOCATED = 0,
      STATFS_STORED,
      STATFS_COMPRESSED_ORIGINAL,
      STATFS_COMPRESSED,
      STATFS_COMPRESSED_ALLOCATED,
      STATFS_LAST
    };
    int64_t values[STATFS_LAST];
    volatile_statfs() {
      memset(this, 0, sizeof(volatile_statfs));
    }
    void reset() {
      *this = volatile_statfs();
    }
    void publish(store_statfs_t* buf) const {
      buf->allocated = allocated();
      buf->data_stored = stored();
      buf->data_compressed = compressed();
      buf->data_compressed_original = compressed_original();
      buf->data_compressed_allocated = compressed_allocated();
    }

    volatile_statfs& operator+=(const volatile_statfs& other) {
      for (size_t i = 0; i < STATFS_LAST; ++i) {
        values[i] += other.values[i];
      }
      return *this;
    }
    int64_t& allocated() {
      return values[STATFS_ALLOCATED];
    }
    int64_t& stored() {
      return values[STATFS_STORED];
    }
    int64_t& compressed_original() {
      return values[STATFS_COMPRESSED_ORIGINAL];
    }
    int64_t& compressed() {
      return values[STATFS_COMPRESSED];
    }
    int64_t& compressed_allocated() {
      return values[STATFS_COMPRESSED_ALLOCATED];
    }
    int64_t allocated() const {
      return values[STATFS_ALLOCATED];
    }
    int64_t stored() const {
      return values[STATFS_STORED];
    }
    int64_t compressed_original() const {
      return values[STATFS_COMPRESSED_ORIGINAL];
    }
    int64_t compressed() const {
      return values[STATFS_COMPRESSED];
    }
    int64_t compressed_allocated() const {
      return values[STATFS_COMPRESSED_ALLOCATED];
    }
    volatile_statfs& operator=(const store_statfs_t& st) {
      values[STATFS_ALLOCATED] = st.allocated;
      values[STATFS_STORED] = st.data_stored;
      values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
      values[STATFS_COMPRESSED] = st.data_compressed;
      values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
      return *this;
    }
    bool is_empty() {
      return values[STATFS_ALLOCATED] == 0 &&
        values[STATFS_STORED] == 0 &&
        values[STATFS_COMPRESSED] == 0 &&
        values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
        values[STATFS_COMPRESSED_ALLOCATED] == 0;
    }
    void decode(ceph::buffer::list::const_iterator& it) {
      using ceph::decode;
      for (size_t i = 0; i < STATFS_LAST; i++) {
        decode(values[i], it);
      }
    }

    void encode(ceph::buffer::list& bl) {
      using ceph::encode;
      for (size_t i = 0; i < STATFS_LAST; i++) {
        encode(values[i], bl);
      }
    }
  };
: public AioContext
{
1625 MEMPOOL_CLASS_HELPERS();
1631 STATE_KV_QUEUED
, // queued for kv_sync_thread submission
1632 STATE_KV_SUBMITTED
, // submitted to kv; not yet synced
1634 STATE_DEFERRED_QUEUED
, // in deferred_queue (pending or running)
1635 STATE_DEFERRED_CLEANUP
, // remove deferred kv record
1636 STATE_DEFERRED_DONE
,
1641 const char *get_state_name() {
1643 case STATE_PREPARE
: return "prepare";
1644 case STATE_AIO_WAIT
: return "aio_wait";
1645 case STATE_IO_DONE
: return "io_done";
1646 case STATE_KV_QUEUED
: return "kv_queued";
1647 case STATE_KV_SUBMITTED
: return "kv_submitted";
1648 case STATE_KV_DONE
: return "kv_done";
1649 case STATE_DEFERRED_QUEUED
: return "deferred_queued";
1650 case STATE_DEFERRED_CLEANUP
: return "deferred_cleanup";
1651 case STATE_DEFERRED_DONE
: return "deferred_done";
1652 case STATE_FINISHING
: return "finishing";
1653 case STATE_DONE
: return "done";
#if defined(WITH_LTTNG)
    const char *get_state_latency_name(int state) {
      switch (state) {
      case l_bluestore_state_prepare_lat: return "prepare";
      case l_bluestore_state_aio_wait_lat: return "aio_wait";
      case l_bluestore_state_io_done_lat: return "io_done";
      case l_bluestore_state_kv_queued_lat: return "kv_queued";
      case l_bluestore_state_kv_committing_lat: return "kv_committing";
      case l_bluestore_state_kv_done_lat: return "kv_done";
      case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
      case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
      case l_bluestore_state_finishing_lat: return "finishing";
      case l_bluestore_state_done_lat: return "done";
      }
      return "???";
    }
#endif

    inline void set_state(state_t s) {
      state = s;
#if defined(WITH_LTTNG)
      if (trace) {
        trace.event(get_state_name());
      }
#endif
    }
    inline state_t get_state() {
      return state;
    }
    OpSequencerRef osr;  // this should be ch->osr
    boost::intrusive::list_member_hook<> sequencer_item;

    uint64_t bytes = 0, ios = 0, cost = 0;

    std::set<OnodeRef> onodes;            ///< these need to be updated/written
    std::set<OnodeRef> modified_objects;  ///< objects we modified (and need a ref)

    // zone refs to add/remove.  each zone ref is a (zone, offset) tuple.  The offset
    // is the first offset in the zone that the onode touched; subsequent writes
    // to that zone do not generate additional refs.  This is a bit imprecise but
    // is sufficient to generate reasonably sequential reads when doing zone
    // cleaning with less metadata than a ref for every extent.
    std::map<std::pair<OnodeRef, uint32_t>, uint64_t> new_zone_offset_refs;
    std::map<std::pair<OnodeRef, uint32_t>, uint64_t> old_zone_offset_refs;

    std::set<SharedBlobRef> shared_blobs;          ///< these need to be updated/written
    std::set<SharedBlobRef> shared_blobs_written;  ///< update these on io completion

    KeyValueDB::Transaction t;                      ///< then we will commit this
    std::list<Context*> oncommits;                  ///< more commit completions
    std::list<CollectionRef> removed_collections;   ///< colls we removed

    boost::intrusive::list_member_hook<> deferred_queue_item;
    bluestore_deferred_transaction_t *deferred_txn = nullptr;  ///< if any

    interval_set<uint64_t> allocated, released;
    volatile_statfs statfs_delta;          ///< overall store statistics delta
    uint64_t osd_pool_id = META_POOL_ID;   ///< osd pool id we're operating on

    bool had_ios = false;  ///< true if we submitted IOs before our kv txn

    uint64_t seq = 0;
    ceph::mono_clock::time_point start;
    ceph::mono_clock::time_point last_stamp;

    uint64_t last_nid = 0;     ///< if non-zero, highest new nid we allocated
    uint64_t last_blobid = 0;  ///< if non-zero, highest new blobid we allocated
#if defined(WITH_LTTNG)
    bool tracing = false;
#endif

    ZTracer::Trace trace;

    explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
                          std::list<Context*> *on_commits)
      : osr(o),
        start(ceph::mono_clock::now()) {
      if (on_commits) {
        oncommits.swap(*on_commits);
      }
    }
    ~TransContext() {
      if (trace) {
        trace.event("txc destruct");
      }
      delete deferred_txn;
    }

    void write_onode(OnodeRef &o) {
      onodes.insert(o);
    }
    void write_shared_blob(SharedBlobRef &sb) {
      shared_blobs.insert(sb);
    }
    void unshare_blob(SharedBlob *sb) {
      shared_blobs.erase(sb);
    }

    /// note we logically modified object (when onode itself is unmodified)
    void note_modified_object(OnodeRef &o) {
      // onode itself isn't written, though
      modified_objects.insert(o);
    }
    void note_removed_object(OnodeRef& o) {
      modified_objects.insert(o);
    }

    void note_write_zone_offset(OnodeRef& o, uint32_t zone, uint64_t offset) {
      o->onode.zone_offset_refs[zone] = offset;
      new_zone_offset_refs[std::make_pair(o, zone)] = offset;
    }
    void note_release_zone_offset(OnodeRef& o, uint32_t zone, uint64_t offset) {
      old_zone_offset_refs[std::make_pair(o, zone)] = offset;
      o->onode.zone_offset_refs.erase(zone);
    }

    void aio_finish(BlueStore *store) override {
      store->txc_aio_finish(this);
    }

  private:
    state_t state = STATE_PREPARE;
  };
  class BlueStoreThrottle {
#if defined(WITH_LTTNG)
    const std::chrono::time_point<ceph::mono_clock> time_base = ceph::mono_clock::now();

    // Time of last chosen io (microseconds)
    std::atomic<uint64_t> previous_emitted_tp_time_mono_mcs = {0};
    std::atomic<uint64_t> ios_started_since_last_traced = {0};
    std::atomic<uint64_t> ios_completed_since_last_traced = {0};

    std::atomic_uint pending_kv_ios = {0};
    std::atomic_uint pending_deferred_ios = {0};

    // Min period between trace points (microseconds)
    std::atomic<uint64_t> trace_period_mcs = {0};

    bool should_trace(
      uint64_t *started,
      uint64_t *completed) {
      uint64_t min_period_mcs = trace_period_mcs.load(
        std::memory_order_relaxed);

      if (min_period_mcs == 0) {
        *started = ios_started_since_last_traced.exchange(0);
        *completed = ios_completed_since_last_traced.exchange(0);
        return true;
      } else {
        ios_started_since_last_traced++;
        auto now_mcs = ceph::to_microseconds<uint64_t>(
          ceph::mono_clock::now() - time_base);
        uint64_t previous_mcs = previous_emitted_tp_time_mono_mcs;
        uint64_t period_mcs = now_mcs - previous_mcs;
        if (period_mcs > min_period_mcs) {
          if (previous_emitted_tp_time_mono_mcs.compare_exchange_strong(
                previous_mcs, now_mcs)) {
            // This would be racy at a sufficiently extreme trace rate, but isn't
            // worth the overhead of doing it more carefully.
            *started = ios_started_since_last_traced.exchange(0);
            *completed = ios_completed_since_last_traced.exchange(0);
            return true;
          }
        }
        return false;
      }
    }
#endif
#if defined(WITH_LTTNG)
    void emit_initial_tracepoint(
      KeyValueDB &db,
      TransContext &txc,
      ceph::mono_clock::time_point);
#else
    void emit_initial_tracepoint(
      KeyValueDB &db,
      TransContext &txc,
      ceph::mono_clock::time_point) {}
#endif

  public:
    Throttle throttle_bytes;           ///< submit to commit
    Throttle throttle_deferred_bytes;  ///< submit to deferred complete

    BlueStoreThrottle(CephContext *cct) :
      throttle_bytes(cct, "bluestore_throttle_bytes", 0),
      throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0)
    {
      reset_throttle(cct->_conf);
    }
#if defined(WITH_LTTNG)
    void complete_kv(TransContext &txc);
    void complete(TransContext &txc);
#else
    void complete_kv(TransContext &txc) {}
    void complete(TransContext &txc) {}
#endif

    ceph::mono_clock::duration log_state_latency(
      TransContext &txc, PerfCounters *logger, int state);
    bool try_start_transaction(
      KeyValueDB &db,
      TransContext &txc,
      ceph::mono_clock::time_point);
    void finish_start_transaction(
      KeyValueDB &db,
      TransContext &txc,
      ceph::mono_clock::time_point);
    void release_kv_throttle(uint64_t cost) {
      throttle_bytes.put(cost);
    }
    void release_deferred_throttle(uint64_t cost) {
      throttle_deferred_bytes.put(cost);
    }
    bool should_submit_deferred() {
      return throttle_deferred_bytes.past_midpoint();
    }
    void reset_throttle(const ConfigProxy &conf) {
      throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
      throttle_deferred_bytes.reset_max(
        conf->bluestore_throttle_bytes +
        conf->bluestore_throttle_deferred_bytes);
#if defined(WITH_LTTNG)
      double rate = conf.get_val<double>("bluestore_throttle_trace_rate");
      trace_period_mcs = rate > 0 ? floor((1/rate) * 1000000.0) : 0;
#endif
    }
  };
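  // Example for BlueStoreThrottle::reset_throttle() above: a
  // bluestore_throttle_trace_rate of 200 (trace points per second) yields
  // trace_period_mcs = floor((1/200) * 1e6) = 5000, i.e. at most one throttle
  // trace event every 5 ms; a rate of 0 leaves tracing disabled.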
  typedef boost::intrusive::list<
    TransContext,
    boost::intrusive::member_hook<
      TransContext,
      boost::intrusive::list_member_hook<>,
      &TransContext::deferred_queue_item> > deferred_queue_t;

  struct DeferredBatch final : public AioContext {
    OpSequencer *osr;
    struct deferred_io {
      ceph::buffer::list bl;  ///< data
      uint64_t seq;           ///< deferred transaction seq
    };
    std::map<uint64_t,deferred_io> iomap;  ///< map of ios in this batch
    deferred_queue_t txcs;                 ///< txcs in this batch
    IOContext ioc;                         ///< our aios
    /// bytes of pending io for each deferred seq (may be 0)
    std::map<uint64_t,int> seq_bytes;

    void _discard(CephContext *cct, uint64_t offset, uint64_t length);
    void _audit(CephContext *cct);

    DeferredBatch(CephContext *cct, OpSequencer *osr)
      : osr(osr), ioc(cct, this) {}

    /// prepare a write
    void prepare_write(CephContext *cct,
                       uint64_t seq, uint64_t offset, uint64_t length,
                       ceph::buffer::list::const_iterator& p);

    void aio_finish(BlueStore *store) override {
      store->_deferred_aio_finish(osr);
    }
  };
  class OpSequencer : public RefCountedObject {
  public:
    ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
    ceph::condition_variable qcond;
    typedef boost::intrusive::list<
      TransContext,
      boost::intrusive::member_hook<
        TransContext,
        boost::intrusive::list_member_hook<>,
        &TransContext::sequencer_item> > q_list_t;
    q_list_t q;  ///< transactions

    boost::intrusive::list_member_hook<> deferred_osr_queue_item;

    DeferredBatch *deferred_running = nullptr;
    DeferredBatch *deferred_pending = nullptr;

    ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::OpSequencer::deferred_lock");

    BlueStore *store;
    coll_t cid;

    uint64_t last_seq = 0;

    std::atomic_int txc_with_unstable_io = {0};  ///< num txcs with unstable io

    std::atomic_int kv_committing_serially = {0};

    std::atomic_int kv_submitted_waiters = {0};

    std::atomic_bool zombie = {false};  ///< in zombie_osr set (collection going away)

    const uint32_t sequencer_id;

    uint32_t get_sequencer_id() const {
      return sequencer_id;
    }

    void queue_new(TransContext *txc) {
      std::lock_guard l(qlock);
      txc->seq = ++last_seq;
      q.push_back(*txc);
    }

    void drain() {
      std::unique_lock l(qlock);
      while (!q.empty())
        qcond.wait(l);
    }

    void drain_preceding(TransContext *txc) {
      std::unique_lock l(qlock);
      while (&q.front() != txc)
        qcond.wait(l);
    }
    bool _is_all_kv_submitted() {
      // caller must hold qlock; q must not be empty
      ceph_assert(!q.empty());
      TransContext *txc = &q.back();
      if (txc->get_state() >= TransContext::STATE_KV_SUBMITTED) {
        return true;
      }
      return false;
    }

    void flush() {
      std::unique_lock l(qlock);
      while (true) {
        // set flag before the check because the condition
        // may become true outside qlock, and we need to make
        // sure those threads see waiters and signal qcond.
        ++kv_submitted_waiters;
        if (q.empty() || _is_all_kv_submitted()) {
          --kv_submitted_waiters;
          return;
        }
        qcond.wait(l);
        --kv_submitted_waiters;
      }
    }
    void flush_all_but_last() {
      std::unique_lock l(qlock);
      ceph_assert (q.size() >= 1);
      while (true) {
        // set flag before the check because the condition
        // may become true outside qlock, and we need to make
        // sure those threads see waiters and signal qcond.
        ++kv_submitted_waiters;
        if (q.size() <= 1) {
          --kv_submitted_waiters;
          return;
        } else {
          auto it = q.rbegin();
          it++;
          if (it->get_state() >= TransContext::STATE_KV_SUBMITTED) {
            --kv_submitted_waiters;
            return;
          }
        }
        qcond.wait(l);
        --kv_submitted_waiters;
      }
    }
    bool flush_commit(Context *c) {
      std::lock_guard l(qlock);
      if (q.empty()) {
        return true;
      }
      TransContext *txc = &q.back();
      if (txc->get_state() >= TransContext::STATE_KV_DONE) {
        return true;
      }
      txc->oncommits.push_back(c);
      return false;
    }

  private:
    FRIEND_MAKE_REF(OpSequencer);
    OpSequencer(BlueStore *store, uint32_t sequencer_id, const coll_t& c)
      : RefCountedObject(store->cct),
        store(store), cid(c), sequencer_id(sequencer_id) {
    }
    ~OpSequencer() {
      ceph_assert(q.empty());
    }
  };
  typedef boost::intrusive::list<
    OpSequencer,
    boost::intrusive::member_hook<
      OpSequencer,
      boost::intrusive::list_member_hook<>,
      &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;

  struct KVSyncThread : public Thread {
    BlueStore *store;
    explicit KVSyncThread(BlueStore *s) : store(s) {}
    void *entry() override {
      store->_kv_sync_thread();
      return nullptr;
    }
  };
  struct KVFinalizeThread : public Thread {
    BlueStore *store;
    explicit KVFinalizeThread(BlueStore *s) : store(s) {}
    void *entry() override {
      store->_kv_finalize_thread();
      return nullptr;
    }
  };
  struct ZonedCleanerThread : public Thread {
    BlueStore *store;
    explicit ZonedCleanerThread(BlueStore *s) : store(s) {}
    void *entry() override {
      store->_zoned_cleaner_thread();
      return nullptr;
    }
  };
  struct BigDeferredWriteContext {
    uint64_t off = 0;    // original logical offset
    uint32_t b_off = 0;  // blob relative offset
    uint32_t used = 0;
    uint64_t head_read = 0;
    uint64_t tail_read = 0;
    uint64_t blob_start = 0;
    PExtentVector res_extents;

    inline uint64_t blob_aligned_len() const {
      return used + head_read + tail_read;
    }

    bool can_defer(BlueStore::extent_map_t::iterator ep,
                   uint64_t prefer_deferred_size,
                   uint64_t block_size,
                   uint64_t offset,
                   uint64_t l);
  };
2127 // --------------------------------------------------------
2130 BlueFS
*bluefs
= nullptr;
2131 bluefs_layout_t bluefs_layout
;
2132 utime_t next_dump_on_bluefs_alloc_failure
;
2134 KeyValueDB
*db
= nullptr;
2135 BlockDevice
*bdev
= nullptr;
2136 std::string freelist_type
;
2137 FreelistManager
*fm
= nullptr;
2139 Allocator
*alloc
= nullptr; ///< allocator consumed by BlueStore
2140 bluefs_shared_alloc_context_t shared_alloc
; ///< consumed by BlueFS (may be == alloc)
2143 int path_fd
= -1; ///< open handle to $path
2144 int fsid_fd
= -1; ///< open handle (locked) to $path/fsid
2145 bool mounted
= false;
2147 // store open_db options:
2148 bool db_was_opened_read_only
= true;
2149 bool need_to_destage_allocation_file
= false;
2151 ///< rwlock to protect coll_map/new_coll_map
2152 ceph::shared_mutex coll_lock
= ceph::make_shared_mutex("BlueStore::coll_lock");
2153 mempool::bluestore_cache_other::unordered_map
<coll_t
, CollectionRef
> coll_map
;
2154 bool collections_had_errors
= false;
2155 std::map
<coll_t
,CollectionRef
> new_coll_map
;
2157 std::vector
<OnodeCacheShard
*> onode_cache_shards
;
2158 std::vector
<BufferCacheShard
*> buffer_cache_shards
;
  /// protect zombie_osr_set
  ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
  uint32_t next_sequencer_id = 0;
  std::map<coll_t, OpSequencerRef> zombie_osr_set; ///< std::set of OpSequencers for deleted collections

  std::atomic<uint64_t> nid_last = {0};
  std::atomic<uint64_t> nid_max = {0};
  std::atomic<uint64_t> blobid_last = {0};
  std::atomic<uint64_t> blobid_max = {0};

  ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
  ceph::mutex atomic_alloc_and_submit_lock =
    ceph::make_mutex("BlueStore::atomic_alloc_and_submit_lock");
  std::atomic<uint64_t> deferred_seq = {0};
  deferred_osr_queue_t deferred_queue;       ///< osr's with deferred io pending
  std::atomic_int deferred_queue_size = {0}; ///< num txc's queued across all osrs
  std::atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
  utime_t deferred_last_submitted = utime_t();
  KVSyncThread kv_sync_thread;
  ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
  ceph::condition_variable kv_cond;
  bool _kv_only = false;
  bool kv_sync_started = false;
  bool kv_stop = false;
  bool kv_finalize_started = false;
  bool kv_finalize_stop = false;
  std::deque<TransContext*> kv_queue;             ///< ready, already submitted
  std::deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
  std::deque<TransContext*> kv_committing;        ///< currently syncing
  std::deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
  bool kv_sync_in_progress = false;

  KVFinalizeThread kv_finalize_thread;
  ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
  ceph::condition_variable kv_finalize_cond;
  std::deque<TransContext*> kv_committing_to_finalize;    ///< pending finalization
  std::deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
  bool kv_finalize_in_progress = false;

  ZonedCleanerThread zoned_cleaner_thread;
  ceph::mutex zoned_cleaner_lock = ceph::make_mutex("BlueStore::zoned_cleaner_lock");
  ceph::condition_variable zoned_cleaner_cond;
  bool zoned_cleaner_started = false;
  bool zoned_cleaner_stop = false;
  std::deque<uint64_t> zoned_cleaner_queue;
  PerfCounters *logger = nullptr;

  std::list<CollectionRef> removed_collections;

  ceph::shared_mutex debug_read_error_lock =
    ceph::make_shared_mutex("BlueStore::debug_read_error_lock");
  std::set<ghobject_t> debug_data_error_objects;
  std::set<ghobject_t> debug_mdata_error_objects;

  std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
  uint64_t block_size = 0;      ///< block size of block device (power of 2)
  uint64_t block_mask = 0;      ///< mask to get just the block offset
  size_t block_size_order = 0;  ///< bits to shift to get block size
  uint64_t optimal_io_size = 0; ///< best performance io size for block device

  uint64_t min_alloc_size;           ///< minimum allocation unit (power of 2)
  uint8_t min_alloc_size_order = 0;  ///< bits to shift to get min_alloc_size
  uint64_t min_alloc_size_mask;      ///< mask for fast checking of allocation alignment
  static_assert(std::numeric_limits<uint8_t>::max() >
                std::numeric_limits<decltype(min_alloc_size)>::digits,
                "not enough bits for min_alloc_size");
  uint64_t zone_size = 0;             ///< number of SMR zones
  uint64_t first_sequential_zone = 0; ///< first SMR zone that is sequential-only

  // Please preserve the order since it's DB persistent
  } per_pool_omap = OMAP_BULK;
  ///< maximum allocation unit (power of 2)
  std::atomic<uint64_t> max_alloc_size = {0};

  ///< number threshold for forced deferred writes
  std::atomic<int> deferred_batch_ops = {0};

  ///< size threshold for forced deferred writes
  std::atomic<uint64_t> prefer_deferred_size = {0};

  ///< approx cost per io, in bytes
  std::atomic<uint64_t> throttle_cost_per_io = {0};

  std::atomic<Compressor::CompressionMode> comp_mode =
    {Compressor::COMP_NONE}; ///< compression mode
  CompressorRef compressor;
  std::atomic<uint64_t> comp_min_blob_size = {0};
  std::atomic<uint64_t> comp_max_blob_size = {0};

  std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size

  uint64_t kv_ios = 0;
  uint64_t kv_throttle_costs = 0;
  // cache trim control
  uint64_t cache_size = 0;         ///< total cache size
  double cache_meta_ratio = 0;     ///< cache ratio dedicated to metadata
  double cache_kv_ratio = 0;       ///< cache ratio dedicated to kv (e.g., rocksdb)
  double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF)
  double cache_data_ratio = 0;     ///< cache ratio dedicated to object data
  bool cache_autotune = false;     ///< cache autotune setting
  double cache_age_bin_interval = 0;  ///< time to wait between cache age bin rotations
  double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
  std::vector<uint64_t> kv_bins;       ///< kv autotune bins
  std::vector<uint64_t> kv_onode_bins; ///< kv onode autotune bins
  std::vector<uint64_t> meta_bins;     ///< meta autotune bins
  std::vector<uint64_t> data_bins;     ///< data autotune bins
  uint64_t osd_memory_target = 0;   ///< OSD memory target when autotuning cache
  uint64_t osd_memory_base = 0;     ///< OSD base memory when autotuning cache
  double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
  uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
  double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
  double max_defer_interval = 0; ///< Time to wait between last deferred submit
  std::atomic<uint32_t> config_changed = {0}; ///< Counter to determine if there is a configuration change.
  typedef std::map<uint64_t, volatile_statfs> osd_pools_map;

  ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
  volatile_statfs vstatfs;
  osd_pools_map osd_pools; // protected by vstatfs_lock as well

  bool per_pool_stat_collection = true;
  struct MempoolThread : public Thread {
    ceph::condition_variable cond;
    ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");

    std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
    std::shared_ptr<PriorityCache::PriCache> binned_kv_onode_cache = nullptr;
    std::shared_ptr<PriorityCache::Manager> pcm = nullptr;

    struct MempoolCache : public PriorityCache::PriCache {
      BlueStore *store;
      uint64_t bins[PriorityCache::Priority::LAST+1] = {0};
      int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
      int64_t committed_bytes = 0;
      double cache_ratio = 0;

      MempoolCache(BlueStore *s) : store(s) {};
      virtual uint64_t _get_used_bytes() const = 0;
      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const = 0;

      virtual int64_t request_cache_bytes(
          PriorityCache::Priority pri, uint64_t total_cache) const {
        int64_t assigned = get_cache_bytes(pri);

        switch (pri) {
        case PriorityCache::Priority::PRI0:
          // BlueStore caches currently don't put anything in PRI0
          return 0;
        case PriorityCache::Priority::LAST:
          {
            uint32_t max = get_bin_count();
            int64_t request = _get_used_bytes() - _sum_bins(0, max);
            return (request > assigned) ? request - assigned : 0;
          }
        default:
          {
            ceph_assert(pri > 0 && pri < PriorityCache::Priority::LAST);
            auto prev_pri = static_cast<PriorityCache::Priority>(pri - 1);
            uint64_t start = get_bins(prev_pri);
            uint64_t end = get_bins(pri);
            int64_t request = _sum_bins(start, end);
            return (request > assigned) ? request - assigned : 0;
          }
        }
      }
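      // NOTE (explanatory comment added during cleanup, not in the original
      // header): request_cache_bytes() asks for the gap between what the
      // age-binned usage suggests this priority should hold and what it
      // already holds.  Illustrative numbers only:
      //   _sum_bins(start, end) = 96 MiB, get_cache_bytes(pri) = 64 MiB
      //   -> request = 96 - 64 = 32 MiB additional bytes.
      // For Priority::LAST the request is whatever of the total used bytes
      // is not already covered by the age bins.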
      virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
        return cache_bytes[pri];
      }
      virtual int64_t get_cache_bytes() const {
        int64_t total = 0;
        for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
          PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
          total += get_cache_bytes(pri);
        }
        return total;
      }
      virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
        cache_bytes[pri] = bytes;
      }
      virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
        cache_bytes[pri] += bytes;
      }
      virtual int64_t commit_cache_size(uint64_t total_cache) {
        committed_bytes = PriorityCache::get_chunk(
          get_cache_bytes(), total_cache);
        return committed_bytes;
      }
      virtual int64_t get_committed_size() const {
        return committed_bytes;
      }
      virtual uint64_t get_bins(PriorityCache::Priority pri) const {
        if (pri > PriorityCache::Priority::PRI0 &&
            pri < PriorityCache::Priority::LAST) {
          return bins[pri];
        }
        return 0;
      }
      virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
        if (pri <= PriorityCache::Priority::PRI0 ||
            pri >= PriorityCache::Priority::LAST) {
          return;
        }
        bins[pri] = end_bin;
        uint64_t max = 0;
        for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) {
          if (bins[pri] > max) {
            max = bins[pri];
          }
        }
        set_bin_count(max);
      }
      virtual void import_bins(const std::vector<uint64_t> &bins_v) {
        uint64_t max = 0;
        for (int pri = 1; pri < PriorityCache::Priority::LAST; pri++) {
          unsigned i = (unsigned) pri - 1;
          if (i < bins_v.size()) {
            bins[pri] = bins_v[i];
            if (bins[pri] > max) {
              max = bins[pri];
            }
          }
        }
        set_bin_count(max);
      }
      virtual double get_cache_ratio() const {
        return cache_ratio;
      }
      virtual void set_cache_ratio(double ratio) {
        cache_ratio = ratio;
      }
      virtual std::string get_cache_name() const = 0;
      virtual uint32_t get_bin_count() const = 0;
      virtual void set_bin_count(uint32_t count) = 0;
    };
    struct MetaCache : public MempoolCache {
      MetaCache(BlueStore *s) : MempoolCache(s) {};

      virtual uint32_t get_bin_count() const {
        return store->onode_cache_shards[0]->get_bin_count();
      }
      virtual void set_bin_count(uint32_t count) {
        for (auto i : store->onode_cache_shards) {
          i->set_bin_count(count);
        }
      }
      virtual uint64_t _get_used_bytes() const {
        return mempool::bluestore_Buffer::allocated_bytes() +
          mempool::bluestore_Blob::allocated_bytes() +
          mempool::bluestore_Extent::allocated_bytes() +
          mempool::bluestore_cache_meta::allocated_bytes() +
          mempool::bluestore_cache_other::allocated_bytes() +
          mempool::bluestore_cache_onode::allocated_bytes() +
          mempool::bluestore_SharedBlob::allocated_bytes() +
          mempool::bluestore_inline_bl::allocated_bytes();
      }
      virtual void shift_bins() {
        for (auto i : store->onode_cache_shards) {
          i->shift_bins();
        }
      }
      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const {
        uint64_t onodes = 0;
        for (auto i : store->onode_cache_shards) {
          onodes += i->sum_bins(start, end);
        }
        return onodes * get_bytes_per_onode();
      }
      virtual std::string get_cache_name() const {
        return "BlueStore Meta Cache";
      }
      uint64_t _get_num_onodes() const {
        uint64_t onode_num =
          mempool::bluestore_cache_onode::allocated_items();
        return (2 > onode_num) ? 2 : onode_num;
      }
      double get_bytes_per_onode() const {
        return (double)_get_used_bytes() / (double)_get_num_onodes();
      }
    };
    std::shared_ptr<MetaCache> meta_cache;
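    // NOTE (explanatory comment added during cleanup, not in the original
    // header): MetaCache accounts in onodes while the PriorityCache manager
    // hands out byte budgets; get_bytes_per_onode() converts between the two.
    // Illustrative arithmetic only:
    //   _get_used_bytes() = 600 MiB, _get_num_onodes() = 1,000,000
    //   -> ~629 bytes per onode, so a 64 MiB assignment corresponds to
    //      roughly 100k cached onodes.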
    struct DataCache : public MempoolCache {
      DataCache(BlueStore *s) : MempoolCache(s) {};

      virtual uint32_t get_bin_count() const {
        return store->buffer_cache_shards[0]->get_bin_count();
      }
      virtual void set_bin_count(uint32_t count) {
        for (auto i : store->buffer_cache_shards) {
          i->set_bin_count(count);
        }
      }
      virtual uint64_t _get_used_bytes() const {
        uint64_t bytes = 0;
        for (auto i : store->buffer_cache_shards) {
          bytes += i->_get_bytes();
        }
        return bytes;
      }
      virtual void shift_bins() {
        for (auto i : store->buffer_cache_shards) {
          i->shift_bins();
        }
      }
      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const {
        uint64_t bytes = 0;
        for (auto i : store->buffer_cache_shards) {
          bytes += i->sum_bins(start, end);
        }
        return bytes;
      }
      virtual std::string get_cache_name() const {
        return "BlueStore Data Cache";
      }
    };
    std::shared_ptr<DataCache> data_cache;
    explicit MempoolThread(BlueStore *s)
      : meta_cache(new MetaCache(s)),
        data_cache(new DataCache(s)) {}

    void *entry() override;
    void init() {
      ceph_assert(stop == false);
      create("bstore_mempool");
    }

    void _update_cache_settings();
    void _resize_shards(bool interval_stats);
  } mempool_thread;

  ZTracer::Endpoint trace_endpoint{"0.0.0.0", 0, "BlueStore"};
  // --------------------------------------------------------
  void _init_logger();
  void _shutdown_logger();
  int _reload_logger();

  int _open_fsid(bool create);
  int _read_fsid(uuid_d *f);

  void _set_alloc_sizes();
  void _set_blob_size();
  void _set_finisher_num();
  void _set_per_pool_omap();
  void _update_osd_memory_options();

  int _open_bdev(bool create);
  // Verifies if disk space is enough for reserved + min bluefs
  // and alters the latter if needed.
  // Depends on min_alloc_size hence should be called after
  // its initialization (and outside of _open_bdev)
  void _validate_bdev();

  int _minimal_open_bluefs(bool create);
  void _minimal_close_bluefs();
  int _open_bluefs(bool create, bool read_only);
  void _close_bluefs();

  int _is_bluefs(bool create, bool* ret);
  /*
   * opens both DB and dependent super_meta, FreelistManager and allocator
   * in the proper order
   */
  int _open_db_and_around(bool read_only, bool to_repair = false);
  void _close_db_and_around();

  int _prepare_db_environment(bool create, bool read_only,
                              std::string* kv_dir, std::string* kv_backend);

  /*
   * @warning to_repair_db means that we open this db to repair it, will not
   * hold the rocksdb's file lock.
   */
  int _open_db(bool create,
               bool to_repair_db = false,
               bool read_only = false);
  void _close_db_leave_bluefs();
  int _open_fm(KeyValueDB::Transaction t, bool read_only, bool fm_restore = false);
  int _write_out_fm_meta(uint64_t target_size);
  int _create_alloc();
  int _init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments);
  void _post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments);
  void _close_alloc();
  int _open_collections();
  void _fsck_collections(int64_t* errors);
  void _close_collections();

  int _setup_block_symlink_or_file(std::string name, std::string path, uint64_t size,
  utime_t get_deferred_last_submitted() {
    std::lock_guard l(deferred_lock);
    return deferred_last_submitted;
  }

  static int _write_bdev_label(CephContext* cct,
                               const std::string &path, bluestore_bdev_label_t label);
  static int _read_bdev_label(CephContext* cct, const std::string &path,
                              bluestore_bdev_label_t *label);
  int _check_or_set_bdev_label(std::string path, uint64_t size, std::string desc,
  int _set_bdev_label_size(const std::string& path, uint64_t size);
  int _open_super_meta();

  void _open_statfs();
  void _get_statfs_overall(struct store_statfs_t *buf);

  void _dump_alloc_on_failure();

  CollectionRef _get_collection(const coll_t& cid);
  CollectionRef _get_collection_by_oid(const ghobject_t& oid);
  void _queue_reap_collection(CollectionRef& c);
  void _reap_collections();
  void _update_cache_logger();

  void _assign_nid(TransContext *txc, OnodeRef o);
  uint64_t _assign_blobid(TransContext *txc);

  template <int LogLevelV>
  friend void _dump_onode(CephContext *cct, const Onode& o);
  template <int LogLevelV>
  friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
  template <int LogLevelV>
  friend void _dump_transaction(CephContext *cct, Transaction *t);
  TransContext *_txc_create(Collection *c, OpSequencer *osr,
                            std::list<Context*> *on_commits,
                            TrackedOpRef osd_op = TrackedOpRef());
  void _txc_update_store_statfs(TransContext *txc);
  void _txc_add_transaction(TransContext *txc, Transaction *t);
  void _txc_calc_cost(TransContext *txc);
  void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
  void _txc_state_proc(TransContext *txc);
  void _txc_aio_submit(TransContext *txc);
  void txc_aio_finish(void *p) {
    _txc_state_proc(static_cast<TransContext*>(p));
  }
  void _txc_finish_io(TransContext *txc);
  void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
  void _txc_apply_kv(TransContext *txc, bool sync_submit_transaction);
  void _txc_committed_kv(TransContext *txc);
  void _txc_finish(TransContext *txc);
  void _txc_release_alloc(TransContext *txc);

  void _osr_attach(Collection *c);
  void _osr_register_zombie(OpSequencer *osr);
  void _osr_drain(OpSequencer *osr);
  void _osr_drain_preceding(TransContext *txc);
  void _osr_drain_all();

  void _kv_sync_thread();
  void _kv_finalize_thread();

  void _zoned_cleaner_start();
  void _zoned_cleaner_stop();
  void _zoned_cleaner_thread();
  void _zoned_clean_zone(uint64_t zone_num,
                         class ZonedAllocator *a,
                         class ZonedFreelistManager *f);
  void _clean_some(ghobject_t oid, uint32_t zone_num);

  bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, uint64_t len);
  void _deferred_queue(TransContext *txc);
  void deferred_try_submit();
  void _deferred_submit_unlock(OpSequencer *osr);
  void _deferred_aio_finish(OpSequencer *osr);
  int _deferred_replay();
  bool _eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
                                    interval_set<uint64_t>& bluefs_extents);
  using mempool_dynamic_bitset =
    boost::dynamic_bitset<uint64_t,
                          mempool::bluestore_fsck::pool_allocator<uint64_t>>;
  using per_pool_statfs =
    mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;

  enum {
    MAX_FSCK_ERROR_LINES = 100,
  };
  int _fsck_check_extents(
    std::string_view ctx_descr,
    const PExtentVector& extents,
    mempool_dynamic_bitset &used_blocks,
    uint64_t granularity,
    BlueStoreRepairer* repairer,
    store_statfs_t& expected_statfs,

  void _fsck_check_pool_statfs(
    per_pool_statfs& expected_pool_statfs,
    BlueStoreRepairer* repairer);

  void _fsck_repair_shared_blobs(
    BlueStoreRepairer& repairer,
    shared_blob_2hash_tracker_t& sb_ref_counts,
    sb_info_space_efficient_map_t& sb_info);

  int _fsck(FSCKDepth depth, bool repair);
  int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
  void _buffer_cache_write(
    ceph::buffer::list& bl,
    b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
    txc->shared_blobs_written.insert(b->shared_blob);
  int _collection_list(
    Collection *c, const ghobject_t& start, const ghobject_t& end,
    int max, bool legacy, std::vector<ghobject_t> *ls, ghobject_t *next);

  template <typename T, typename F>
  T select_option(const std::string& opt_name, T val1, F f) {
    //NB: opt_name reserved for future use
    boost::optional<T> val2 = f();
    if (val2) {
      return *val2;
    }
    return val1;
  }
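  // NOTE (explanatory comment added during cleanup, not in the original
  // header): select_option() returns the lazily computed override when the
  // functor yields a value, otherwise the supplied default.  A hypothetical
  // usage sketch (pool-level override of a global setting):
  //
  //   uint64_t max_blob = select_option(
  //     "compression_max_blob_size", comp_max_blob_size.load(),
  //     [&]() -> boost::optional<uint64_t> {
  //       // return a value only when the pool overrides it
  //       return boost::optional<uint64_t>();
  //     });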
  void _apply_padding(uint64_t head_pad,
                      ceph::buffer::list& padded);
  void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);

  // -- ondisk version ---
  const int32_t latest_ondisk_format = 4;        ///< our version
  const int32_t min_readable_ondisk_format = 1;  ///< what we can read
  const int32_t min_compat_ondisk_format = 3;    ///< who can read us

  int32_t ondisk_format = 0;  ///< value detected on mount
  bool m_fast_shutdown = false;
  int _upgrade_super();  ///< upgrade (called during open_super)
  uint64_t _get_ondisk_reserved() const;
  void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
public:
  // --- public interface ---

  BlueStore(CephContext *cct, const std::string& path);
  BlueStore(CephContext *cct, const std::string& path, uint64_t min_alloc_size); // Ctor for UT only
  ~BlueStore() override;

  std::string get_type() override {
    return "bluestore";
  }

  bool needs_journal() override { return false; };
  bool wants_journal() override { return false; };
  bool allows_journal() override { return false; };

  void prepare_for_fast_shutdown() override;
  virtual bool has_null_manager();

  uint64_t get_min_alloc_size() const override {
    return min_alloc_size;
  }
  int get_devices(std::set<std::string> *ls) override;

  bool is_rotational() override;
  bool is_journal_rotational() override;
  bool is_db_rotational();

  std::string get_default_device_class() override {
    std::string device_class;
    std::map<std::string, std::string> metadata;
    collect_metadata(&metadata);
    auto it = metadata.find("bluestore_bdev_type");
    if (it != metadata.end()) {
      device_class = it->second;
    }
    return device_class;
  }
    std::set<int> *nodes,
    std::set<std::string> *failed) override;

  static int get_block_device_fsid(CephContext* cct, const std::string& path,

  bool test_mount_in_use() override;
  int mount() override {
  int umount() override;
  int open_db_environment(KeyValueDB **pdb, bool to_repair);
  int close_db_environment();
  BlueFS* get_bluefs();

  int write_meta(const std::string& key, const std::string& value) override;
  int read_meta(const std::string& key, std::string *value) override;

  // open in read-only and limited mode
  int fsck(bool deep) override {
    return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
  }
  int repair(bool deep) override {
    return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
  }
  int quick_fix() override {
    return _fsck(FSCK_SHALLOW, true);
  }
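  // NOTE (explanatory comment added during cleanup, not in the original
  // header): the three entry points above differ only in the FSCKDepth they
  // pass to _fsck() and whether repair is requested:
  //   fsck(false)  -> FSCK_REGULAR, read-only
  //   fsck(true)   -> FSCK_DEEP,    read-only (also validates object data)
  //   repair(deep) -> same depth selection, but with repair enabled
  //   quick_fix()  -> FSCK_SHALLOW with repair enabled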
  void set_cache_shards(unsigned num) override;
  void dump_cache_stats(ceph::Formatter *f) override {
    int onode_count = 0, buffers_bytes = 0;
    for (auto i : onode_cache_shards) {
      onode_count += i->_get_num();
    }
    for (auto i : buffer_cache_shards) {
      buffers_bytes += i->_get_bytes();
    }
    f->dump_int("bluestore_onode", onode_count);
    f->dump_int("bluestore_buffers", buffers_bytes);
  }
  void dump_cache_stats(std::ostream& ss) override {
    int onode_count = 0, buffers_bytes = 0;
    for (auto i : onode_cache_shards) {
      onode_count += i->_get_num();
    }
    for (auto i : buffer_cache_shards) {
      buffers_bytes += i->_get_bytes();
    }
    ss << "bluestore_onode: " << onode_count;
    ss << "bluestore_buffers: " << buffers_bytes;
  }
  int validate_hobject_key(const hobject_t &obj) const override {
    return 0;
  }
  unsigned get_max_attr_name_length() override {
    return 256; // arbitrary; there is no real limit internally
  }

  int mkfs() override;
  int mkjournal() override {
    return 0;
  }

  void get_db_statistics(ceph::Formatter *f) override;
  void generate_db_histogram(ceph::Formatter *f) override;
  void _shutdown_cache();
  int flush_cache(std::ostream *os = NULL) override;
  void dump_perf_counters(ceph::Formatter *f) override {
    f->open_object_section("perf_counters");
    logger->dump_formatted(f, false);
    f->close_section();
  }

  int add_new_bluefs_device(int id, const std::string& path);
  int migrate_to_existing_bluefs_device(const std::set<int>& devs_source,
  int migrate_to_new_bluefs_device(const std::set<int>& devs_source,
                                   const std::string& path);
  int expand_devices(std::ostream& out);
  std::string get_device_path(unsigned id);

  int dump_bluefs_sizes(std::ostream& out);
  int statfs(struct store_statfs_t *buf,
             osd_alert_list_t* alerts = nullptr) override;
  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
                  bool *per_pool_omap) override;

  void collect_metadata(std::map<std::string,std::string> *pm) override;

  bool exists(CollectionHandle &c, const ghobject_t& oid) override;
  int set_collection_opts(
    CollectionHandle& c,
    const pool_opts_t& opts) override;
  int stat(
    CollectionHandle &c,
    const ghobject_t& oid,
    bool allow_eio = false) override;
  int read(
    CollectionHandle &c,
    const ghobject_t& oid,
    ceph::buffer::list& bl,
    uint32_t op_flags = 0) override;
  // --------------------------------------------------------
  // intermediate data structures used while reading
  struct region_t {
    uint64_t logical_offset;
    uint64_t blob_xoffset;   //region offset within the blob
    uint64_t length;

    // used later in read process
    uint64_t r_off = 0;

    region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
      : logical_offset(offset),
        blob_xoffset(b_offs),
        length(len) {}
    region_t(const region_t& from)
      : logical_offset(from.logical_offset),
        blob_xoffset(from.blob_xoffset),
        length(from.length) {}

    friend std::ostream& operator<<(std::ostream& out, const region_t& r) {
      return out << "0x" << std::hex << r.logical_offset << ":"
                 << r.blob_xoffset << "~" << r.length << std::dec;
    }
  };

  // merged blob read request
  struct read_req_t {
    uint64_t r_off = 0;
    uint64_t r_len = 0;
    ceph::buffer::list bl;
    std::list<region_t> regs; // original read regions

    read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}

    friend std::ostream& operator<<(std::ostream& out, const read_req_t& r) {
      out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
      for (const auto& reg : r.regs)
        out << reg;
      return out << "]}" << std::dec;
    }
  };
  typedef std::list<read_req_t> regions2read_t;
  typedef std::map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
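  // NOTE (explanatory comment added during cleanup, not in the original
  // header): the read path first maps each requested logical range onto
  // blobs, merging adjacent regions of the same blob into one read_req_t.
  // Illustrative shape only (offsets are made up):
  //   blobs2read_t = {
  //     blobA -> [ {<0x0, 0x3000> : [0x0:0x0~0x1000, 0x1000:0x1000~0x2000]} ],
  //     blobB -> [ {<0x8000, 0x1000> : [0x3000:0x0~0x1000]} ]
  //   }
  // i.e. one merged on-disk request per blob, each remembering the original
  // logical regions it will satisfy.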
    int read_cache_policy,
    ready_regions_t& ready_regions,
    blobs2read_t& blobs2read);

  int _prepare_read_ioc(
    blobs2read_t& blobs2read,
    std::vector<ceph::buffer::list>* compressed_blob_bls,

  int _generate_read_result_bl(
    ready_regions_t& ready_regions,
    std::vector<ceph::buffer::list>& compressed_blob_bls,
    blobs2read_t& blobs2read,
    ceph::buffer::list& bl);

    ceph::buffer::list& bl,
    uint32_t op_flags = 0,
    uint64_t retry_count = 0);
    const interval_set<uint64_t>& m,
    ceph::buffer::list& bl,
    uint32_t op_flags = 0,
    uint64_t retry_count = 0);

  int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
              uint64_t offset, size_t len, interval_set<uint64_t>& destset);

  int fiemap(CollectionHandle &c, const ghobject_t& oid,
             uint64_t offset, size_t len, ceph::buffer::list& bl) override;
  int fiemap(CollectionHandle &c, const ghobject_t& oid,
             uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;
    CollectionHandle &c_,
    const ghobject_t& oid,
    interval_set<uint64_t>& m,
    ceph::buffer::list& bl,
    uint32_t op_flags) override;

  int dump_onode(CollectionHandle &c, const ghobject_t& oid,
                 const std::string& section_name, ceph::Formatter *f) override;
  int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
              ceph::buffer::ptr& value) override;

  int getattrs(CollectionHandle &c, const ghobject_t& oid,
               std::map<std::string, ceph::buffer::ptr, std::less<>>& aset) override;

  int list_collections(std::vector<coll_t>& ls) override;

  CollectionHandle open_collection(const coll_t &c) override;
  CollectionHandle create_new_collection(const coll_t& cid) override;
  void set_collection_commit_queue(const coll_t& cid,
                                   ContextQueue *commit_queue) override;

  bool collection_exists(const coll_t& c) override;
  int collection_empty(CollectionHandle& c, bool *empty) override;
  int collection_bits(CollectionHandle& c) override;

  int collection_list(CollectionHandle &c,
                      const ghobject_t& start,
                      const ghobject_t& end,
                      std::vector<ghobject_t> *ls, ghobject_t *next) override;

  int collection_list_legacy(CollectionHandle &c,
                             const ghobject_t& start,
                             const ghobject_t& end,
                             std::vector<ghobject_t> *ls,
                             ghobject_t *next) override;
  int omap_get(
    CollectionHandle &c,        ///< [in] Collection containing oid
    const ghobject_t &oid,      ///< [in] Object containing omap
    ceph::buffer::list *header, ///< [out] omap header
    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
    ) override;
  int _omap_get(
    Collection *c,              ///< [in] Collection containing oid
    const ghobject_t &oid,      ///< [in] Object containing omap
    ceph::buffer::list *header, ///< [out] omap header
    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
    );
  int _onode_omap_get(
    const OnodeRef &o,          ///< [in] Object containing omap
    ceph::buffer::list *header, ///< [out] omap header
    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
    );

  int omap_get_header(
    CollectionHandle &c,        ///< [in] Collection containing oid
    const ghobject_t &oid,      ///< [in] Object containing omap
    ceph::buffer::list *header, ///< [out] omap header
    bool allow_eio = false      ///< [in] don't assert on eio
    ) override;

  /// Get keys defined on oid
  int omap_get_keys(
    CollectionHandle &c,        ///< [in] Collection containing oid
    const ghobject_t &oid,      ///< [in] Object containing omap
    std::set<std::string> *keys ///< [out] Keys defined on oid
    ) override;
  int omap_get_values(
    CollectionHandle &c,               ///< [in] Collection containing oid
    const ghobject_t &oid,             ///< [in] Object containing omap
    const std::set<std::string> &keys, ///< [in] Keys to get
    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
    ) override;

  int omap_get_values(
    CollectionHandle &c,               ///< [in] Collection containing oid
    const ghobject_t &oid,             ///< [in] Object containing omap
    const std::optional<std::string> &start_after, ///< [in] Keys to get
    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
    ) override;

  /// Filters keys into out which are defined on oid
  int omap_check_keys(
    CollectionHandle &c,               ///< [in] Collection containing oid
    const ghobject_t &oid,             ///< [in] Object containing omap
    const std::set<std::string> &keys, ///< [in] Keys to check
    std::set<std::string> *out         ///< [out] Subset of keys defined on oid
    ) override;

  ObjectMap::ObjectMapIterator get_omap_iterator(
    CollectionHandle &c,   ///< [in] collection
    const ghobject_t &oid  ///< [in] object
    ) override;
  void set_fsid(uuid_d u) override {
    fsid = u;
  }
  uuid_d get_fsid() override {
    return fsid;
  }

  uint64_t estimate_objects_overhead(uint64_t num_objects) override {
    return num_objects * 300; //assuming per-object overhead is 300 bytes
  }
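  // NOTE (explanatory comment added during cleanup, not in the original
  // header): the 300-byte figure above is a rough per-object metadata
  // estimate, so e.g. estimate_objects_overhead(1000000) ~= 300 MB.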
  struct BSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
      ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
      return ret;
    }

    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;

  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }
  const PerfCounters* get_perf_counters() const override {
    return logger;
  }
  const PerfCounters* get_bluefs_perf_counters() const {
    return bluefs->get_perf_counters();
  }
  KeyValueDB* get_kv() {
    return db;
  }
  int queue_transactions(
    CollectionHandle& ch,
    std::vector<Transaction>& tls,
    TrackedOpRef op = TrackedOpRef(),
    ThreadPool::TPHandle *handle = NULL) override;

  void inject_data_error(const ghobject_t& o) override {
    std::unique_lock l(debug_read_error_lock);
    debug_data_error_objects.insert(o);
  }
  void inject_mdata_error(const ghobject_t& o) override {
    std::unique_lock l(debug_read_error_lock);
    debug_mdata_error_objects.insert(o);
  }
  /// methods to inject various errors fsck can repair
  void inject_broken_shared_blob_key(const std::string& key,
                                     const ceph::buffer::list& bl);
  void inject_no_shared_blob_key();
  void inject_stray_shared_blob_key(uint64_t sbid);
  void inject_leaked(uint64_t len);
  void inject_false_free(coll_t cid, ghobject_t oid);
  void inject_statfs(const std::string& key, const store_statfs_t& new_statfs);
  void inject_global_statfs(const store_statfs_t& new_statfs);
  void inject_misreference(coll_t cid1, ghobject_t oid1,
                           coll_t cid2, ghobject_t oid2,
  void inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, int16_t blob_id);
  // resets global per_pool_omap in DB
  void inject_legacy_omap();
  // resets per_pool_omap | pgmeta_omap for onode
  void inject_legacy_omap(coll_t cid, ghobject_t oid);
  void inject_stray_omap(uint64_t head, const std::string& name);

  void inject_bluefs_file(std::string_view dir,
                          std::string_view name,
  void compact() override {
  bool has_builtin_csum() const override {

  inline void log_latency(const char* name,
                          const ceph::timespan& lat,
                          double lat_threshold,
                          const char* info = "") const;

  inline void log_latency_fn(const char* name,
                             const ceph::timespan& lat,
                             double lat_threshold,
                             std::function<std::string (const ceph::timespan& lat)> fn) const;
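  // NOTE (explanatory comment added during cleanup, not in the original
  // header): log_latency_fn() behaves like log_latency() but lets the caller
  // attach extra context when the threshold is exceeded.  A hypothetical call
  // site (names are illustrative, not part of this interface):
  //
  //   log_latency_fn("commit_lat", lat, cct->_conf->bluestore_log_op_age,
  //                  [&](const ceph::timespan& t) {
  //                    return std::string(", slow transaction context");
  //                  });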
  bool _debug_data_eio(const ghobject_t& o) {
    if (!cct->_conf->bluestore_debug_inject_read_err) {
      return false;
    }
    std::shared_lock l(debug_read_error_lock);
    return debug_data_error_objects.count(o);
  }
  bool _debug_mdata_eio(const ghobject_t& o) {
    if (!cct->_conf->bluestore_debug_inject_read_err) {
      return false;
    }
    std::shared_lock l(debug_read_error_lock);
    return debug_mdata_error_objects.count(o);
  }
  void _debug_obj_on_delete(const ghobject_t& o) {
    if (cct->_conf->bluestore_debug_inject_read_err) {
      std::unique_lock l(debug_read_error_lock);
      debug_data_error_objects.erase(o);
      debug_mdata_error_objects.erase(o);
    }
  }
  ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
  std::string failed_cmode;
  std::set<std::string> failed_compressors;
  std::string spillover_alert;
  std::string legacy_statfs_alert;
  std::string no_per_pool_omap_alert;
  std::string no_per_pg_omap_alert;
  std::string disk_size_mismatch_alert;
  std::string spurious_read_errors_alert;

  void _log_alerts(osd_alert_list_t& alerts);
  bool _set_compression_alert(bool cmode, const char* s) {
    std::lock_guard l(qlock);
    bool ret = failed_cmode.empty();
    return failed_compressors.emplace(s).second;
  }
  void _clear_compression_alert() {
    std::lock_guard l(qlock);
    failed_compressors.clear();
    failed_cmode.clear();
  }

  void _set_spillover_alert(const std::string& s) {
    std::lock_guard l(qlock);
    spillover_alert = s;
  }
  void _clear_spillover_alert() {
    std::lock_guard l(qlock);
    spillover_alert.clear();
  }

  void _check_legacy_statfs_alert();
  void _check_no_per_pg_or_pool_omap_alert();
  void _set_disk_size_mismatch_alert(const std::string& s) {
    std::lock_guard l(qlock);
    disk_size_mismatch_alert = s;
  }
  void _set_spurious_read_errors_alert(const std::string& s) {
    std::lock_guard l(qlock);
    spurious_read_errors_alert = s;
  }
  // --------------------------------------------------------
  // read processing internal methods
    const bluestore_blob_t* blob,
    uint64_t blob_xoffset,
    const ceph::buffer::list& bl,
    uint64_t logical_offset) const;
  int _decompress(ceph::buffer::list& source, ceph::buffer::list* result);
  // --------------------------------------------------------
  struct WriteContext {
    bool buffered = false;          ///< buffered write
    bool compress = false;          ///< compressed write
    uint64_t target_blob_size = 0;  ///< target (max) blob size
    unsigned csum_order = 0;        ///< target checksum chunk order

    old_extent_map_t old_extents;   ///< must deref these blobs
    interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection

    struct write_item {
      uint64_t logical_offset;      ///< write logical offset
      uint64_t blob_length;
      ceph::buffer::list bl;
      uint64_t b_off0;              ///< original offset in a blob prior to padding
      uint64_t length0;             ///< original data length prior to padding

      bool new_blob;                ///< whether new blob was created

      bool compressed = false;
      ceph::buffer::list compressed_bl;
      size_t compressed_len = 0;

      write_item(
        uint64_t logical_offs,
        ceph::buffer::list& bl,
       :
        logical_offset(logical_offs),
        blob_length(blob_len),
        mark_unused(_mark_unused),
        new_blob(_new_blob) {}
    };
    std::vector<write_item> writes; ///< blobs we're writing

    /// partial clone of the context
    void fork(const WriteContext& other) {
      buffered = other.buffered;
      compress = other.compress;
      target_blob_size = other.target_blob_size;
      csum_order = other.csum_order;
    }
      ceph::buffer::list& bl,
      writes.emplace_back(loffs,
    /// Checks for writes to the same pextent within a blob
      uint64_t min_alloc_size);
  };

  void _do_write_small(
    uint64_t offset, uint64_t length,
    ceph::buffer::list::iterator& blp,
    WriteContext *wctx);
  void _do_write_big_apply_deferred(
    BigDeferredWriteContext& dctx,
    bufferlist::iterator& blp,
    WriteContext* wctx);
    uint64_t offset, uint64_t length,
    ceph::buffer::list::iterator& blp,
    WriteContext *wctx);
  int _do_alloc_write(
    WriteContext *wctx);
    std::set<SharedBlob*> *maybe_unshared_blobs = 0);

  int _write(TransContext *txc,
             uint64_t offset, size_t len,
             ceph::buffer::list& bl,
             uint32_t fadvise_flags);
  void _pad_zeros(ceph::buffer::list *bl, uint64_t *offset,
                  uint64_t chunk_size);

  void _choose_write_options(CollectionRef& c,
                             uint32_t fadvise_flags,
                             WriteContext *wctx);

  int _do_gc(TransContext *txc,
             const WriteContext& wctx,
             uint64_t *dirty_start,
             uint64_t *dirty_end);

  int _do_write(TransContext *txc,
                uint64_t offset, uint64_t length,
                ceph::buffer::list& bl,
                uint32_t fadvise_flags);
  void _do_write_data(TransContext *txc,
                      ceph::buffer::list& bl,
                      WriteContext *wctx);
  int _touch(TransContext *txc,
  int _do_zero(TransContext *txc,
               uint64_t offset, size_t len);
  int _zero(TransContext *txc,
            uint64_t offset, size_t len);
  void _do_truncate(TransContext *txc,
                    std::set<SharedBlob*> *maybe_unshared_blobs = 0);
  int _truncate(TransContext *txc,
  int _remove(TransContext *txc,
  int _do_remove(TransContext *txc,
  int _setattr(TransContext *txc,
               const std::string& name,
               ceph::buffer::ptr& val);
  int _setattrs(TransContext *txc,
                const std::map<std::string, ceph::buffer::ptr>& aset);
  int _rmattr(TransContext *txc,
              const std::string& name);
  int _rmattrs(TransContext *txc,
  void _do_omap_clear(TransContext *txc, OnodeRef &o);
  int _omap_clear(TransContext *txc,
  int _omap_setkeys(TransContext *txc,
                    ceph::buffer::list& bl);
  int _omap_setheader(TransContext *txc,
                      ceph::buffer::list& header);
  int _omap_rmkeys(TransContext *txc,
                   ceph::buffer::list& bl);
  int _omap_rmkey_range(TransContext *txc,
                        const std::string& first, const std::string& last);
  int _set_alloc_hint(
    uint64_t expected_object_size,
    uint64_t expected_write_size,
  int _do_clone_range(TransContext *txc,
                      uint64_t srcoff, uint64_t length, uint64_t dstoff);
  int _clone(TransContext *txc,
  int _clone_range(TransContext *txc,
                   uint64_t srcoff, uint64_t length, uint64_t dstoff);
  int _rename(TransContext *txc,
              const ghobject_t& new_oid);
  int _create_collection(TransContext *txc, const coll_t &cid,
                         unsigned bits, CollectionRef *c);
  int _remove_collection(TransContext *txc, const coll_t &cid,
  void _do_remove_collection(TransContext *txc, CollectionRef *c);
  int _split_collection(TransContext *txc,
                        unsigned bits, int rem);
  int _merge_collection(TransContext *txc,

  void _collect_allocation_stats(uint64_t need, uint32_t alloc_size,
                                 const PExtentVector&);
  void _record_allocation_stats();
  uint64_t probe_count = 0;
  std::atomic<uint64_t> alloc_stats_count = {0};
  std::atomic<uint64_t> alloc_stats_fragments = { 0 };
  std::atomic<uint64_t> alloc_stats_size = { 0 };

  std::array<std::tuple<uint64_t, uint64_t, uint64_t>, 5> alloc_stats_history =
    { std::make_tuple(0ul, 0ul, 0ul) };

  inline bool _use_rotational_settings();

  typedef btree::btree_set<
    uint64_t, std::less<uint64_t>,
    mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
  struct FSCK_ObjectCtx {
    uint64_t& num_objects;
    uint64_t& num_extents;
    uint64_t& num_blobs;
    uint64_t& num_sharded_objects;
    uint64_t& num_spanning_blobs;

    mempool_dynamic_bitset* used_blocks;
    uint64_t_btree_t* used_omap_head;
    std::vector<std::unordered_map<ghobject_t, uint64_t>> *zone_refs;

    ceph::mutex* sb_info_lock;
    sb_info_space_efficient_map_t& sb_info;
    // approximate amount of references per <shared blob, chunk>
    shared_blob_2hash_tracker_t& sb_ref_counts;

    store_statfs_t& expected_store_statfs;
    per_pool_statfs& expected_pool_statfs;
    BlueStoreRepairer* repairer;

    FSCK_ObjectCtx(int64_t& e,
                   uint64_t& _num_objects,
                   uint64_t& _num_extents,
                   uint64_t& _num_blobs,
                   uint64_t& _num_sharded_objects,
                   uint64_t& _num_spanning_blobs,
                   mempool_dynamic_bitset* _ub,
                   uint64_t_btree_t* _used_omap_head,
                   std::vector<std::unordered_map<ghobject_t, uint64_t>> *_zone_refs,
                   ceph::mutex* _sb_info_lock,
                   sb_info_space_efficient_map_t& _sb_info,
                   shared_blob_2hash_tracker_t& _sb_ref_counts,
                   store_statfs_t& _store_statfs,
                   per_pool_statfs& _pool_statfs,
                   BlueStoreRepairer* _repairer) :
      num_objects(_num_objects),
      num_extents(_num_extents),
      num_blobs(_num_blobs),
      num_sharded_objects(_num_sharded_objects),
      num_spanning_blobs(_num_spanning_blobs),
      used_omap_head(_used_omap_head),
      zone_refs(_zone_refs),
      sb_info_lock(_sb_info_lock),
      sb_ref_counts(_sb_ref_counts),
      expected_store_statfs(_store_statfs),
      expected_pool_statfs(_pool_statfs),
      repairer(_repairer) {
  OnodeRef fsck_check_objects_shallow(
    const ghobject_t& oid,
    const std::string& key,
    const ceph::buffer::list& value,
    mempool::bluestore_fsck::list<std::string>* expecting_shards,
    std::map<BlobRef, bluestore_blob_t::unused_t>* referenced,
    const BlueStore::FSCK_ObjectCtx& ctx);

#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
  int push_allocation_to_rocksdb();
  int read_allocation_from_drive_for_bluestore_tool();
#endif

#define MAX_BLOBS_IN_ONODE 128
  struct read_alloc_stats_t {
    //read_alloc_stats_t() { memset(&this, 0, sizeof(read_alloc_stats_t)); }
    uint32_t onode_count = 0;
    uint32_t shard_count = 0;

    uint32_t skipped_repeated_extent = 0;
    uint32_t skipped_illegal_extent = 0;

    uint32_t collection_search = 0;
    uint32_t pad_limit_count = 0;

    uint64_t shared_blobs_count = 0;
    uint64_t compressed_blob_count = 0;
    uint64_t spanning_blob_count = 0;
    uint64_t insert_count = 0;
    uint64_t extent_count = 0;

    uint64_t saved_inplace_count = 0;
    uint32_t merge_insert_count = 0;
    uint32_t merge_inplace_count = 0;

    std::array<uint32_t, MAX_BLOBS_IN_ONODE + 1> blobs_in_onode = {};
    //uint32_t blobs_in_onode[MAX_BLOBS_IN_ONODE+1];
    friend std::ostream& operator<<(std::ostream& out, const read_alloc_stats_t& stats) {
      out << "==========================================================" << std::endl;
      out << "NCB::onode_count = " ;out.width(10);out << stats.onode_count << std::endl
          << "NCB::shard_count = " ;out.width(10);out << stats.shard_count << std::endl
          << "NCB::shared_blobs_count = " ;out.width(10);out << stats.shared_blobs_count << std::endl
          << "NCB::compressed_blob_count = " ;out.width(10);out << stats.compressed_blob_count << std::endl
          << "NCB::spanning_blob_count = " ;out.width(10);out << stats.spanning_blob_count << std::endl
          << "NCB::collection search = " ;out.width(10);out << stats.collection_search << std::endl
          << "NCB::skipped_repeated_extent = " ;out.width(10);out << stats.skipped_repeated_extent << std::endl
          << "NCB::skipped_illegal_extent = " ;out.width(10);out << stats.skipped_illegal_extent << std::endl
          << "NCB::extent_count = " ;out.width(10);out << stats.extent_count << std::endl
          << "NCB::insert_count = " ;out.width(10);out << stats.insert_count << std::endl;

      if (stats.merge_insert_count) {
        out << "NCB::merge_insert_count = " ;out.width(10);out << stats.merge_insert_count << std::endl;
      }
      if (stats.merge_inplace_count) {
        out << "NCB::merge_inplace_count = " ;out.width(10);out << stats.merge_inplace_count << std::endl;
        out << "NCB::saved_inplace_count = " ;out.width(10);out << stats.saved_inplace_count << std::endl;
        out << "NCB::saved inplace per call = " ;out.width(10);out << stats.saved_inplace_count/stats.merge_inplace_count << std::endl;
      }
      out << "==========================================================" << std::endl;

      for (unsigned i = 0; i < MAX_BLOBS_IN_ONODE; i++ ) {
        if (stats.blobs_in_onode[i]) {
          out << "NCB::We had " ;out.width(9); out << stats.blobs_in_onode[i]
              << " ONodes with "; out.width(3); out << i << " blobs" << std::endl;
        }
      }

      if (stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]) {
        out << "NCB::We had " ;out.width(9);out << stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]
            << " ONodes with more than " << MAX_BLOBS_IN_ONODE << " blobs" << std::endl;
      }
  int compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target);
  Allocator* create_bitmap_allocator(uint64_t bdev_size);
  int add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t& stats);
  int allocator_add_restored_entries(Allocator *allocator, const void *buff, unsigned extent_count, uint64_t *p_read_alloc_size,
                                     uint64_t *p_extent_count, const void *v_header, BlueFS::FileReader *p_handle, uint64_t offset);

  int copy_allocator(Allocator* src_alloc, Allocator *dest_alloc, uint64_t* p_num_entries);
  int store_allocator(Allocator* allocator);
  int invalidate_allocation_file_on_bluefs();
  int __restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes);
  int restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes);
  int read_allocation_from_drive_on_startup();
  int reconstruct_allocations(SimpleBitmap *smbmp, read_alloc_stats_t &stats);
  int read_allocation_from_onodes(SimpleBitmap *smbmp, read_alloc_stats_t& stats);
  void read_allocation_from_single_onode(SimpleBitmap *smbmp, BlueStore::OnodeRef& onode_ref, read_alloc_stats_t& stats);
  void set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length);
  int commit_to_null_manager();
  int commit_to_real_manager();
  int db_cleanup(int ret);
  int reset_fm_for_restore();
  int verify_rocksdb_allocations(Allocator *allocator);
  Allocator* clone_allocator_without_bluefs(Allocator *src_allocator);
  Allocator* initialize_allocator_from_freelist(FreelistManager *real_fm);
  void copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm);

  void _fsck_check_object_omap(FSCKDepth depth,
                               const BlueStore::FSCK_ObjectCtx& ctx);

  void _fsck_check_objects(FSCKDepth depth,
                           FSCK_ObjectCtx& ctx);
};
inline std::ostream& operator<<(std::ostream& out, const BlueStore::volatile_statfs& s) {
  return out
    << " allocated:"
    << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
    << " stored:"
    << s.values[BlueStore::volatile_statfs::STATFS_STORED]
    << " compressed:"
    << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
    << " compressed_orig:"
    << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
    << " compressed_alloc:"
    << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
}
static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
  o->get();
}
static inline void intrusive_ptr_release(BlueStore::Onode *o) {
  o->put();
}
static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
  o->get();
}
static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
  o->put();
}
class BlueStoreRepairer
{
  ceph::mutex lock = ceph::make_mutex("BlueStore::BlueStoreRepairer::lock");

public:
  // to simplify future potential migration to mempools
  using fsck_interval = interval_set<uint64_t>;
  // Structure to track what pextents are used for specific cid/oid.
  // Similar to a Bloom filter, positive and false-positive matches are
  // possible, but false negatives are not.
  // Maintains two lists of bloom filters for both cids and oids
  // where each list entry is a BF for specific disk pextent
  // The length of the extent per filter is measured on init.
  // Allows to filter out 'uninteresting' pextents to speed up subsequent
  // 'is_used' access.
  struct StoreSpaceTracker {
    const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
    const uint64_t BLOOM_FILTER_TABLE_SIZE = 32;     // bytes per single filter
    const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrary selected
    static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;

    typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
    bloom_vector collections_bfs;
    bloom_vector objects_bfs;

    bool was_filtered_out = false;
    uint64_t granularity = 0; // extent length for a single filter

    StoreSpaceTracker() {
    }
    StoreSpaceTracker(const StoreSpaceTracker& from) :
      collections_bfs(from.collections_bfs),
      objects_bfs(from.objects_bfs),
      granularity(from.granularity) {
    void init(uint64_t total,
              uint64_t min_alloc_size,
              uint64_t mem_cap = DEF_MEM_CAP) {
      ceph_assert(!granularity); // not initialized yet
      ceph_assert(min_alloc_size && isp2(min_alloc_size));
      ceph_assert(mem_cap);

      total = round_up_to(total, min_alloc_size);
      granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;

      if (!granularity) {
        granularity = min_alloc_size;
      } else {
        granularity = round_up_to(granularity, min_alloc_size);
      }

      uint64_t entries = round_up_to(total, granularity) / granularity;
      collections_bfs.resize(entries,
                             bloom_filter(BLOOM_FILTER_SALT_COUNT,
                                          BLOOM_FILTER_TABLE_SIZE,
                                          BLOOM_FILTER_EXPECTED_COUNT));
      objects_bfs.resize(entries,
                         bloom_filter(BLOOM_FILTER_SALT_COUNT,
                                      BLOOM_FILTER_TABLE_SIZE,
                                      BLOOM_FILTER_EXPECTED_COUNT));
    }
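    // NOTE (explanatory comment added during cleanup, not in the original
    // header): granularity is sized so the two bloom-filter vectors stay
    // within mem_cap.  Illustrative arithmetic only:
    //   total = 1 TiB, mem_cap = 128 MiB, BLOOM_FILTER_TABLE_SIZE = 32
    //   granularity = 2^40 * 32 * 2 / 2^27 = 512 KiB of disk per filter,
    //   entries     = 2^40 / 2^19          = 2^21 filter pairs.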
    inline uint32_t get_hash(const coll_t& cid) const {
      return cid.hash_to_shard(1);
    }
    inline void set_used(uint64_t offset, uint64_t len,
                         const coll_t& cid, const ghobject_t& oid) {
      ceph_assert(granularity); // initialized

      // can't call this func after filter_out has been applied
      ceph_assert(!was_filtered_out);

      auto pos = offset / granularity;
      auto end_pos = (offset + len - 1) / granularity;
      while (pos <= end_pos) {
        collections_bfs[pos].insert(get_hash(cid));
        objects_bfs[pos].insert(oid.hobj.get_hash());
        ++pos;
      }
    }
    // filter-out entries unrelated to the specified(broken) extents.
    // 'is_used' calls are permitted after that only
    size_t filter_out(const fsck_interval& extents);

    // determines if collection's present after filtering-out
    inline bool is_used(const coll_t& cid) const {
      ceph_assert(was_filtered_out);
      for (auto& bf : collections_bfs) {
        if (bf.contains(get_hash(cid))) {
          return true;
        }
      }
      return false;
    }
    // determines if object's present after filtering-out
    inline bool is_used(const ghobject_t& oid) const {
      ceph_assert(was_filtered_out);
      for (auto& bf : objects_bfs) {
        if (bf.contains(oid.hobj.get_hash())) {
          return true;
        }
      }
      return false;
    }
    // determines if collection's present before filtering-out
    inline bool is_used(const coll_t& cid, uint64_t offs) const {
      ceph_assert(granularity); // initialized
      ceph_assert(!was_filtered_out);
      auto &bf = collections_bfs[offs / granularity];
      if (bf.contains(get_hash(cid))) {
        return true;
      }
      return false;
    }
    // determines if object's present before filtering-out
    inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
      ceph_assert(granularity); // initialized
      ceph_assert(!was_filtered_out);
      auto &bf = objects_bfs[offs / granularity];
      if (bf.contains(oid.hobj.get_hash())) {
        return true;
      }
      return false;
    }
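    // NOTE (explanatory comment added during cleanup, not in the original
    // header): because these lookups go through bloom filters, is_used() can
    // report false positives (an unused extent may look used) but never false
    // negatives.  That is the safe direction for a repair tool: it may retain
    // extra data, it will not discard referenced data.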
  void fix_per_pool_omap(KeyValueDB *db, int);
  bool remove_key(KeyValueDB *db, const std::string& prefix, const std::string& key);
  bool fix_shared_blob(KeyValueDB::Transaction txn,
                       bluestore_extent_ref_map_t* ref_map,
                       size_t repaired = 1);
  bool fix_statfs(KeyValueDB *db, const std::string& key,
                  const store_statfs_t& new_statfs);

  bool fix_leaked(KeyValueDB *db,
                  FreelistManager* fm,
                  uint64_t offset, uint64_t len);
  bool fix_false_free(KeyValueDB *db,
                      FreelistManager* fm,
                      uint64_t offset, uint64_t len);
  bool fix_spanning_blobs(
    std::function<void(KeyValueDB::Transaction)> f);

  bool preprocess_misreference(KeyValueDB *db);

  unsigned apply(KeyValueDB* db);
  void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
    std::lock_guard l(lock);
    misreferenced_extents.union_insert(offs, len);
  }
  //////////////////////
  //In fact two methods below are the only ones in this class which are thread-safe!!
  void inc_repaired(size_t n = 1) {
    to_repair_cnt += n;
  }
  void request_compaction() {
    need_compact = true;
  }
  //////////////////////

  void init_space_usage_tracker(
    uint64_t total_space, uint64_t lres_tracking_unit_size)
  {
    //NB: not for use in multithreading mode!!!
    space_usage_tracker.init(total_space, lres_tracking_unit_size);
  }
  void set_space_used(uint64_t offset, uint64_t len,
                      const coll_t& cid, const ghobject_t& oid) {
    std::lock_guard l(lock);
    space_usage_tracker.set_used(offset, len, cid, oid);
  }
  inline bool is_used(const coll_t& cid) const {
    //NB: not for use in multithreading mode!!!
    return space_usage_tracker.is_used(cid);
  }
  inline bool is_used(const ghobject_t& oid) const {
    //NB: not for use in multithreading mode!!!
    return space_usage_tracker.is_used(oid);
  }

  const fsck_interval& get_misreferences() const {
    //NB: not for use in multithreading mode!!!
    return misreferenced_extents;
  }
  KeyValueDB::Transaction get_fix_misreferences_txn() {
    //NB: not for use in multithreading mode!!!
    return fix_misreferences_txn;
  }
  std::atomic<unsigned> to_repair_cnt = { 0 };
  std::atomic<bool> need_compact = { false };
  KeyValueDB::Transaction fix_per_pool_omap_txn;
  KeyValueDB::Transaction fix_fm_leaked_txn;
  KeyValueDB::Transaction fix_fm_false_free_txn;
  KeyValueDB::Transaction remove_key_txn;
  KeyValueDB::Transaction fix_statfs_txn;
  KeyValueDB::Transaction fix_shared_blob_txn;

  KeyValueDB::Transaction fix_misreferences_txn;
  KeyValueDB::Transaction fix_onode_txn;

  StoreSpaceTracker space_usage_tracker;

  // non-shared extents with multiple references
  fsck_interval misreferenced_extents;
};
class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
{
  template <class T, size_t MaxX, size_t MaxY>
  class matrix_2d {
    T values[MaxX][MaxY];
  public:
    T& at(size_t x, size_t y) {
      ceph_assert(x < MaxX);
      ceph_assert(y < MaxY);
      return values[x][y];
    }
    size_t get_max_x() const {
      return MaxX;
    }
    size_t get_max_y() const {
      return MaxY;
    }
    void clear() {
      memset(values, 0, sizeof(values));
    }
  };
  // use 0/nullptr as unset indication
  enum {
    LEVEL_FIRST = 1,
    LEVEL_LOG = LEVEL_FIRST, // BlueFS log
    LEVEL_WAL,
    LEVEL_DB,
    LEVEL_SLOW,
    LEVEL_MAX
  };

  // add +1 row for corresponding per-device totals
  // add +1 column for per-level actual (taken from file size) total
  typedef matrix_2d<std::atomic<uint64_t>, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;

  per_level_per_dev_usage_t per_level_per_dev_usage;
  // file count per level, add +1 to keep total file count
  std::atomic<uint64_t> per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };

  // Note: maximum per-device totals below might be smaller than corresponding
  // perf counters by up to a single alloc unit (1M) due to superblock extent.
  // The latter is not accounted here.
  per_level_per_dev_usage_t per_level_per_dev_max;

  uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
  uint64_t db_avail4slow = 0;
public:
  RocksDBBlueFSVolumeSelector(
    uint64_t _wal_total,
    uint64_t _db_total,
    uint64_t _slow_total,
    uint64_t _level0_size,
    uint64_t _level_base,
    uint64_t _level_multiplier,
    double reserved_factor,
    uint64_t reserved)
  {
    l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
    l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
    l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
    l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
    // Calculate how much extra space is available on the DB volume.
    // Depending on whether an explicit reserved size is specified, it is either
    // * DB volume size - reserved
    // or
    // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
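    // Hypothetical worked example (comment only, sizes are made up):
    // _level0_size = 1 GB, _level_base = 1 GB, _level_multiplier = 10,
    // reserved_factor = 2, _db_total = 60 GB, reserved = 0.
    //   pass 1: next_level = 10, next_threshold = 1 + 1 + 10*2 = 22 GB
    //           60 GB > 22 GB  -> advance (cur_threshold becomes 22 GB)
    //   pass 2: next_level = 100, next_threshold = 2 + 10 + 100*2 = 212 GB
    //           60 GB <= 212 GB -> db_avail4slow = 60 - 22 = 38 GB
    // i.e. whatever the DB volume holds beyond the space L0..L2 (plus headroom
    // for the next level) may ever need can be lent to data that would
    // otherwise spill to the slow device.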
    if (!reserved) {
      uint64_t prev_levels = _level0_size;
      uint64_t cur_level = _level_base;
      uint64_t cur_threshold = 0;
      do {
        uint64_t next_level = cur_level * _level_multiplier;
        uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
        if (_db_total <= next_threshold) {
          db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
          break;
        } else {
          prev_levels += cur_level;
          cur_level = next_level;
          cur_threshold = next_threshold;
        }
      } while (true);
    } else {
      db_avail4slow = _db_total - reserved;
    }
  }
  void* get_hint_for_log() const override {
    return reinterpret_cast<void*>(LEVEL_LOG);
  }
  void* get_hint_by_dir(std::string_view dirname) const override;
  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
    if (hint == nullptr)
      return;
    size_t pos = (size_t)hint - LEVEL_FIRST;
    for (auto& p : fnode.extents) {
      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
      auto& max = per_level_per_dev_max.at(p.bdev, pos);
      uint64_t v = cur.fetch_add(p.length) + p.length;
      while (v > max) {
        max.exchange(v);
      }
      {
        //update per-device totals
        auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
        auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
        uint64_t v = cur.fetch_add(p.length) + p.length;
        while (v > max) {
          max.exchange(v);
        }
      }
    }
    {
      //update per-level actual totals
      auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
      auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
      uint64_t v = cur.fetch_add(fnode.size) + fnode.size;
      while (v > max) {
        max.exchange(v);
      }
    }
    ++per_level_files[pos];
    ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
  }
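  // Usage sketch (comment only; vselector/file are hypothetical names):
  // BlueFS obtains a hint for a file via get_hint_by_dir()/get_hint_for_log()
  // and passes it back here, so a hint is simply a level id cast to void*:
  //
  //   void* hint = vselector->get_hint_by_dir("db.slow");  // e.g. LEVEL_SLOW
  //   vselector->add_usage(hint, file->fnode);  // per-extent + per-level accounting
  //   ...
  //   vselector->sub_usage(hint, file->fnode);  // must mirror add_usage exactly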
  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
    if (hint == nullptr)
      return;
    size_t pos = (size_t)hint - LEVEL_FIRST;
    for (auto& p : fnode.extents) {
      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
      ceph_assert(cur >= p.length);
      cur -= p.length;

      //update per-device totals
      auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
      ceph_assert(cur2 >= p.length);
      cur2 -= p.length;
    }
    //update per-level actual totals
    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
    ceph_assert(cur >= fnode.size);
    cur -= fnode.size;
    ceph_assert(per_level_files[pos] > 0);
    --per_level_files[pos];
    ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
    --per_level_files[LEVEL_MAX - LEVEL_FIRST];
  }
  void add_usage(void* hint, uint64_t size_more) override {
    if (hint == nullptr)
      return;
    size_t pos = (size_t)hint - LEVEL_FIRST;
    //update per-level actual totals
    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
    auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
    uint64_t v = cur.fetch_add(size_more) + size_more;
    while (v > max) {
      max.exchange(v);
    }
  }
  void sub_usage(void* hint, uint64_t size_less) override {
    if (hint == nullptr)
      return;
    size_t pos = (size_t)hint - LEVEL_FIRST;
    //update per-level actual totals
    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
    ceph_assert(cur >= size_less);
    cur -= size_less;
  }
  uint8_t select_prefer_bdev(void* h) override;
  void get_paths(
    const std::string& base,
    BlueFSVolumeSelector::paths& res) const override;
  void dump(std::ostream& sout) override;
  BlueFSVolumeSelector* clone_empty() const override;
  bool compare(BlueFSVolumeSelector* other) override;