1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
15 #ifndef CEPH_OSD_BLUESTORE_H
16 #define CEPH_OSD_BLUESTORE_H
24 #include <condition_variable>
26 #include <boost/intrusive/list.hpp>
27 #include <boost/intrusive/unordered_set.hpp>
28 #include <boost/intrusive/set.hpp>
29 #include <boost/functional/hash.hpp>
30 #include <boost/dynamic_bitset.hpp>
32 #include "include/assert.h"
33 #include "include/unordered_map.h"
34 #include "include/memory.h"
35 #include "include/mempool.h"
36 #include "common/Finisher.h"
37 #include "common/perf_counters.h"
38 #include "compressor/Compressor.h"
39 #include "os/ObjectStore.h"
41 #include "bluestore_types.h"
42 #include "BlockDevice.h"
43 #include "common/EventTrace.h"
// Forward declaration: the freelist manager is only referenced by
// pointer from this header; its definition lives elsewhere.
class FreelistManager;
50 //#define DEBUG_DEFERRED
54 // constants for Buffer::optimize()
55 #define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N
// Performance-counter indices for BlueStore, registered with the
// PerfCounters framework.  The base value (732430) is an arbitrary
// unique offset so these ids do not collide with other subsystems.
//
// NOTE(review): this enum was reconstructed from a line-mangled
// extraction that dropped some interior lines; the upstream header may
// define additional counters in the gaps (e.g. immediately after
// l_bluestore_kv_commit_lat).  Only l_bluestore_first and
// l_bluestore_kv_flush_lat have extraction-verified numeric values;
// confirm the rest of the list against upstream before relying on
// specific ordinals.
enum {
  l_bluestore_first = 732430,
  // key/value commit pipeline latencies
  l_bluestore_kv_flush_lat,
  l_bluestore_kv_commit_lat,
  // per-state latencies of the transaction state machine
  l_bluestore_state_prepare_lat,
  l_bluestore_state_aio_wait_lat,
  l_bluestore_state_io_done_lat,
  l_bluestore_state_kv_queued_lat,
  l_bluestore_state_kv_committing_lat,
  l_bluestore_state_kv_done_lat,
  l_bluestore_state_deferred_queued_lat,
  l_bluestore_state_deferred_aio_wait_lat,
  l_bluestore_state_deferred_cleanup_lat,
  l_bluestore_state_finishing_lat,
  l_bluestore_state_done_lat,
  l_bluestore_throttle_lat,
  l_bluestore_submit_lat,
  l_bluestore_commit_lat,
  // read path latencies
  l_bluestore_read_onode_meta_lat,
  l_bluestore_read_wait_aio_lat,
  l_bluestore_compress_lat,
  l_bluestore_decompress_lat,
  // compression outcome counters
  l_bluestore_compress_success_count,
  l_bluestore_compress_rejected_count,
  l_bluestore_write_pad_bytes,
  l_bluestore_deferred_write_ops,
  l_bluestore_deferred_write_bytes,
  l_bluestore_write_penalty_read_ops,
  // space accounting
  l_bluestore_allocated,
  l_bluestore_compressed,
  l_bluestore_compressed_allocated,
  l_bluestore_compressed_original,
  // onode cache hit/miss counters
  l_bluestore_onode_hits,
  l_bluestore_onode_misses,
  l_bluestore_onode_shard_hits,
  l_bluestore_onode_shard_misses,
  // buffer cache accounting
  l_bluestore_buffer_bytes,
  l_bluestore_buffer_hit_bytes,
  l_bluestore_buffer_miss_bytes,
  // write-path classification counters
  l_bluestore_write_big,
  l_bluestore_write_big_bytes,
  l_bluestore_write_big_blobs,
  l_bluestore_write_small,
  l_bluestore_write_small_bytes,
  l_bluestore_write_small_unused,
  l_bluestore_write_small_deferred,
  l_bluestore_write_small_pre_read,
  l_bluestore_write_small_new,
  // internal maintenance counters
  l_bluestore_onode_reshard,
  l_bluestore_blob_split,
  l_bluestore_extent_compress,
  l_bluestore_gc_merged,
  l_bluestore_last
};
122 class BlueStore
: public ObjectStore
,
123 public md_config_obs_t
{
124 // -----------------------------------------------------
128 const char** get_tracked_conf_keys() const override
;
129 void handle_conf_change(const struct md_config_t
*conf
,
130 const std::set
<std::string
> &changed
) override
;
133 void _set_compression();
134 void _set_throttle_params();
135 int _set_cache_sizes();
139 typedef map
<uint64_t, bufferlist
> ready_regions_t
;
143 typedef boost::intrusive_ptr
<Collection
> CollectionRef
;
146 virtual void aio_finish(BlueStore
*store
) = 0;
147 virtual ~AioContext() {}
152 MEMPOOL_CLASS_HELPERS();
155 STATE_EMPTY
, ///< empty buffer -- used for cache history
156 STATE_CLEAN
, ///< clean data that is up to date
157 STATE_WRITING
, ///< data that is being written (io not yet complete)
159 static const char *get_state_name(int s
) {
161 case STATE_EMPTY
: return "empty";
162 case STATE_CLEAN
: return "clean";
163 case STATE_WRITING
: return "writing";
164 default: return "???";
168 FLAG_NOCACHE
= 1, ///< trim when done WRITING (do not become CLEAN)
169 // NOTE: fix operator<< when you define a second flag
171 static const char *get_flag_name(int s
) {
173 case FLAG_NOCACHE
: return "nocache";
174 default: return "???";
179 uint16_t state
; ///< STATE_*
180 uint16_t cache_private
= 0; ///< opaque (to us) value used by Cache impl
181 uint32_t flags
; ///< FLAG_*
183 uint32_t offset
, length
;
186 boost::intrusive::list_member_hook
<> lru_item
;
187 boost::intrusive::list_member_hook
<> state_item
;
189 Buffer(BufferSpace
*space
, unsigned s
, uint64_t q
, uint32_t o
, uint32_t l
,
191 : space(space
), state(s
), flags(f
), seq(q
), offset(o
), length(l
) {}
192 Buffer(BufferSpace
*space
, unsigned s
, uint64_t q
, uint32_t o
, bufferlist
& b
,
194 : space(space
), state(s
), flags(f
), seq(q
), offset(o
),
195 length(b
.length()), data(b
) {}
197 bool is_empty() const {
198 return state
== STATE_EMPTY
;
200 bool is_clean() const {
201 return state
== STATE_CLEAN
;
203 bool is_writing() const {
204 return state
== STATE_WRITING
;
207 uint32_t end() const {
208 return offset
+ length
;
211 void truncate(uint32_t newlen
) {
212 assert(newlen
< length
);
215 t
.substr_of(data
, 0, newlen
);
220 void maybe_rebuild() {
222 (data
.get_num_buffers() > 1 ||
223 data
.front().wasted() > data
.length() / MAX_BUFFER_SLOP_RATIO_DEN
)) {
228 void dump(Formatter
*f
) const {
229 f
->dump_string("state", get_state_name(state
));
230 f
->dump_unsigned("seq", seq
);
231 f
->dump_unsigned("offset", offset
);
232 f
->dump_unsigned("length", length
);
233 f
->dump_unsigned("data_length", data
.length());
239 /// map logical extent range (object) onto buffers
241 typedef boost::intrusive::list
<
243 boost::intrusive::member_hook
<
245 boost::intrusive::list_member_hook
<>,
246 &Buffer::state_item
> > state_list_t
;
248 mempool::bluestore_cache_other::map
<uint32_t, std::unique_ptr
<Buffer
>>
251 // we use a bare intrusive list here instead of std::map because
252 // it uses less memory and we expect this to be very small (very
253 // few IOs in flight to the same Blob at the same time).
254 state_list_t writing
; ///< writing buffers, sorted by seq, ascending
257 assert(buffer_map
.empty());
258 assert(writing
.empty());
261 void _add_buffer(Cache
* cache
, Buffer
*b
, int level
, Buffer
*near
) {
262 cache
->_audit("_add_buffer start");
263 buffer_map
[b
->offset
].reset(b
);
264 if (b
->is_writing()) {
265 b
->data
.reassign_to_mempool(mempool::mempool_bluestore_writing
);
266 if (writing
.empty() || writing
.rbegin()->seq
<= b
->seq
) {
267 writing
.push_back(*b
);
269 auto it
= writing
.begin();
270 while (it
->seq
< b
->seq
) {
274 assert(it
->seq
>= b
->seq
);
275 // note that this will insert b before it
276 // hence the order is maintained
277 writing
.insert(it
, *b
);
280 b
->data
.reassign_to_mempool(mempool::mempool_bluestore_cache_data
);
281 cache
->_add_buffer(b
, level
, near
);
283 cache
->_audit("_add_buffer end");
285 void _rm_buffer(Cache
* cache
, Buffer
*b
) {
286 _rm_buffer(cache
, buffer_map
.find(b
->offset
));
288 void _rm_buffer(Cache
* cache
,
289 map
<uint32_t, std::unique_ptr
<Buffer
>>::iterator p
) {
290 assert(p
!= buffer_map
.end());
291 cache
->_audit("_rm_buffer start");
292 if (p
->second
->is_writing()) {
293 writing
.erase(writing
.iterator_to(*p
->second
));
295 cache
->_rm_buffer(p
->second
.get());
298 cache
->_audit("_rm_buffer end");
301 map
<uint32_t,std::unique_ptr
<Buffer
>>::iterator
_data_lower_bound(
303 auto i
= buffer_map
.lower_bound(offset
);
304 if (i
!= buffer_map
.begin()) {
306 if (i
->first
+ i
->second
->length
<= offset
)
312 // must be called under protection of the Cache lock
313 void _clear(Cache
* cache
);
315 // return value is the highest cache_private of a trimmed buffer, or 0.
316 int discard(Cache
* cache
, uint32_t offset
, uint32_t length
) {
317 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
318 return _discard(cache
, offset
, length
);
320 int _discard(Cache
* cache
, uint32_t offset
, uint32_t length
);
322 void write(Cache
* cache
, uint64_t seq
, uint32_t offset
, bufferlist
& bl
,
324 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
325 Buffer
*b
= new Buffer(this, Buffer::STATE_WRITING
, seq
, offset
, bl
,
327 b
->cache_private
= _discard(cache
, offset
, bl
.length());
328 _add_buffer(cache
, b
, (flags
& Buffer::FLAG_NOCACHE
) ? 0 : 1, nullptr);
330 void finish_write(Cache
* cache
, uint64_t seq
);
331 void did_read(Cache
* cache
, uint32_t offset
, bufferlist
& bl
) {
332 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
333 Buffer
*b
= new Buffer(this, Buffer::STATE_CLEAN
, 0, offset
, bl
);
334 b
->cache_private
= _discard(cache
, offset
, bl
.length());
335 _add_buffer(cache
, b
, 1, nullptr);
338 void read(Cache
* cache
, uint32_t offset
, uint32_t length
,
339 BlueStore::ready_regions_t
& res
,
340 interval_set
<uint32_t>& res_intervals
);
342 void truncate(Cache
* cache
, uint32_t offset
) {
343 discard(cache
, offset
, (uint32_t)-1 - offset
);
346 void split(Cache
* cache
, size_t pos
, BufferSpace
&r
);
348 void dump(Cache
* cache
, Formatter
*f
) const {
349 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
350 f
->open_array_section("buffers");
351 for (auto& i
: buffer_map
) {
352 f
->open_object_section("buffer");
353 assert(i
.first
== i
.second
->offset
);
361 struct SharedBlobSet
;
363 /// in-memory shared blob state (incl cached buffers)
365 MEMPOOL_CLASS_HELPERS();
367 std::atomic_int nref
= {0}; ///< reference count
372 uint64_t sbid_unloaded
; ///< sbid if persistent isn't loaded
373 bluestore_shared_blob_t
*persistent
; ///< persistent part of the shared blob if any
375 BufferSpace bc
; ///< buffer cache
377 SharedBlob(Collection
*_coll
) : coll(_coll
), sbid_unloaded(0) {
379 get_cache()->add_blob();
382 SharedBlob(uint64_t i
, Collection
*_coll
);
385 uint64_t get_sbid() const {
386 return loaded
? persistent
->sbid
: sbid_unloaded
;
389 friend void intrusive_ptr_add_ref(SharedBlob
*b
) { b
->get(); }
390 friend void intrusive_ptr_release(SharedBlob
*b
) { b
->put(); }
392 friend ostream
& operator<<(ostream
& out
, const SharedBlob
& sb
);
399 /// get logical references
400 void get_ref(uint64_t offset
, uint32_t length
);
402 /// put logical references, and get back any released extents
403 void put_ref(uint64_t offset
, uint32_t length
,
404 PExtentVector
*r
, set
<SharedBlob
*> *maybe_unshared_blobs
);
406 friend bool operator==(const SharedBlob
&l
, const SharedBlob
&r
) {
407 return l
.get_sbid() == r
.get_sbid();
409 inline Cache
* get_cache() {
410 return coll
? coll
->cache
: nullptr;
412 inline SharedBlobSet
* get_parent() {
413 return coll
? &(coll
->shared_blob_set
) : nullptr;
415 inline bool is_loaded() const {
420 typedef boost::intrusive_ptr
<SharedBlob
> SharedBlobRef
;
422 /// a lookup table of SharedBlobs
423 struct SharedBlobSet
{
424 std::mutex lock
; ///< protect lookup, insertion, removal
426 // we use a bare pointer because we don't want to affect the ref
428 mempool::bluestore_cache_other::unordered_map
<uint64_t,SharedBlob
*> sb_map
;
430 SharedBlobRef
lookup(uint64_t sbid
) {
431 std::lock_guard
<std::mutex
> l(lock
);
432 auto p
= sb_map
.find(sbid
);
433 if (p
== sb_map
.end()) {
439 void add(Collection
* coll
, SharedBlob
*sb
) {
440 std::lock_guard
<std::mutex
> l(lock
);
441 sb_map
[sb
->get_sbid()] = sb
;
445 bool remove(SharedBlob
*sb
) {
446 std::lock_guard
<std::mutex
> l(lock
);
448 assert(sb
->get_parent() == this);
449 sb_map
.erase(sb
->get_sbid());
456 std::lock_guard
<std::mutex
> l(lock
);
457 return sb_map
.empty();
461 //#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
463 /// in-memory blob metadata and associated cached buffers (if any)
465 MEMPOOL_CLASS_HELPERS();
467 std::atomic_int nref
= {0}; ///< reference count
468 int16_t id
= -1; ///< id, for spanning blobs only, >= 0
469 int16_t last_encoded_id
= -1; ///< (ephemeral) used during encoding only
470 SharedBlobRef shared_blob
; ///< shared blob state (if any)
473 mutable bluestore_blob_t blob
; ///< decoded blob metadata
475 mutable bufferlist blob_bl
; ///< cached encoded blob, blob is dirty if empty
477 /// refs from this shard. ephemeral if id<0, persisted if spanning.
478 bluestore_blob_use_tracker_t used_in_blob
;
482 friend void intrusive_ptr_add_ref(Blob
*b
) { b
->get(); }
483 friend void intrusive_ptr_release(Blob
*b
) { b
->put(); }
485 friend ostream
& operator<<(ostream
& out
, const Blob
&b
);
487 const bluestore_blob_use_tracker_t
& get_blob_use_tracker() const {
490 bool is_referenced() const {
491 return used_in_blob
.is_not_empty();
493 uint32_t get_referenced_bytes() const {
494 return used_in_blob
.get_referenced_bytes();
497 bool is_spanning() const {
501 bool can_split() const {
502 std::lock_guard
<std::recursive_mutex
> l(shared_blob
->get_cache()->lock
);
503 // splitting a BufferSpace writing list is too hard; don't try.
504 return shared_blob
->bc
.writing
.empty() &&
505 used_in_blob
.can_split() &&
506 get_blob().can_split();
509 bool can_split_at(uint32_t blob_offset
) const {
510 return used_in_blob
.can_split_at(blob_offset
) &&
511 get_blob().can_split_at(blob_offset
);
514 bool can_reuse_blob(uint32_t min_alloc_size
,
515 uint32_t target_blob_size
,
520 o
.shared_blob
= shared_blob
;
527 inline const bluestore_blob_t
& get_blob() const {
530 inline bluestore_blob_t
& dirty_blob() {
537 /// discard buffers for unallocated regions
538 void discard_unallocated(Collection
*coll
);
540 /// get logical references
541 void get_ref(Collection
*coll
, uint32_t offset
, uint32_t length
);
542 /// put logical references, and get back any released extents
543 bool put_ref(Collection
*coll
, uint32_t offset
, uint32_t length
,
547 void split(Collection
*coll
, uint32_t blob_offset
, Blob
*o
);
559 void _encode() const {
560 if (blob_bl
.length() == 0 ) {
561 ::encode(blob
, blob_bl
);
563 assert(blob_bl
.length());
568 bool include_ref_map
) const {
570 p
+= blob_bl
.length();
571 if (include_ref_map
) {
572 used_in_blob
.bound_encode(p
);
576 bufferlist::contiguous_appender
& p
,
577 bool include_ref_map
) const {
580 if (include_ref_map
) {
581 used_in_blob
.encode(p
);
585 Collection */
*coll*/
,
586 bufferptr::iterator
& p
,
587 bool include_ref_map
) {
588 const char *start
= p
.get_pos();
590 const char *end
= p
.get_pos();
592 blob_bl
.append(start
, end
- start
);
593 if (include_ref_map
) {
594 used_in_blob
.decode(p
);
602 bool include_ref_map
) const {
603 denc(blob
, p
, struct_v
);
604 if (blob
.is_shared()) {
607 if (include_ref_map
) {
608 used_in_blob
.bound_encode(p
);
612 bufferlist::contiguous_appender
& p
,
615 bool include_ref_map
) const {
616 denc(blob
, p
, struct_v
);
617 if (blob
.is_shared()) {
620 if (include_ref_map
) {
621 used_in_blob
.encode(p
);
626 bufferptr::iterator
& p
,
629 bool include_ref_map
);
632 typedef boost::intrusive_ptr
<Blob
> BlobRef
;
633 typedef mempool::bluestore_cache_other::map
<int,BlobRef
> blob_map_t
;
635 /// a logical extent, pointing to (some portion of) a blob
636 typedef boost::intrusive::set_base_hook
<boost::intrusive::optimize_size
<true> > ExtentBase
; //making an alias to avoid build warnings
637 struct Extent
: public ExtentBase
{
638 MEMPOOL_CLASS_HELPERS();
640 uint32_t logical_offset
= 0; ///< logical offset
641 uint32_t blob_offset
= 0; ///< blob offset
642 uint32_t length
= 0; ///< length
643 BlobRef blob
; ///< the blob with our data
645 /// ctor for lookup only
646 explicit Extent(uint32_t lo
) : ExtentBase(), logical_offset(lo
) { }
647 /// ctor for delayed initialization (see decode_some())
648 explicit Extent() : ExtentBase() {
650 /// ctor for general usage
651 Extent(uint32_t lo
, uint32_t o
, uint32_t l
, BlobRef
& b
)
653 logical_offset(lo
), blob_offset(o
), length(l
) {
658 blob
->shared_blob
->get_cache()->rm_extent();
662 void assign_blob(const BlobRef
& b
) {
665 blob
->shared_blob
->get_cache()->add_extent();
668 // comparators for intrusive_set
669 friend bool operator<(const Extent
&a
, const Extent
&b
) {
670 return a
.logical_offset
< b
.logical_offset
;
672 friend bool operator>(const Extent
&a
, const Extent
&b
) {
673 return a
.logical_offset
> b
.logical_offset
;
675 friend bool operator==(const Extent
&a
, const Extent
&b
) {
676 return a
.logical_offset
== b
.logical_offset
;
679 uint32_t blob_start() const {
680 return logical_offset
- blob_offset
;
683 uint32_t blob_end() const {
684 return blob_start() + blob
->get_blob().get_logical_length();
687 uint32_t logical_end() const {
688 return logical_offset
+ length
;
691 // return true if any piece of the blob is out of
692 // the given range [o, o + l].
693 bool blob_escapes_range(uint32_t o
, uint32_t l
) const {
694 return blob_start() < o
|| blob_end() > o
+ l
;
697 typedef boost::intrusive::set
<Extent
> extent_map_t
;
700 friend ostream
& operator<<(ostream
& out
, const Extent
& e
);
703 boost::intrusive::list_member_hook
<> old_extent_item
;
706 bool blob_empty
; // flag to track the last removed extent that makes blob
707 // empty - required to update compression stat properly
708 OldExtent(uint32_t lo
, uint32_t o
, uint32_t l
, BlobRef
& b
)
709 : e(lo
, o
, l
, b
), blob_empty(false) {
711 static OldExtent
* create(CollectionRef c
,
717 typedef boost::intrusive::list
<
719 boost::intrusive::member_hook
<
721 boost::intrusive::list_member_hook
<>,
722 &OldExtent::old_extent_item
> > old_extent_map_t
;
726 /// a sharded extent map, mapping offsets to lextents to blobs
729 extent_map_t extent_map
; ///< map of Extents to Blobs
730 blob_map_t spanning_blob_map
; ///< blobs that span shards
733 bluestore_onode_t::shard_info
*shard_info
= nullptr;
734 unsigned extents
= 0; ///< count extents in this shard
735 bool loaded
= false; ///< true if shard is loaded
736 bool dirty
= false; ///< true if shard is dirty and needs reencoding
738 mempool::bluestore_cache_other::vector
<Shard
> shards
; ///< shards
740 bufferlist inline_bl
; ///< cached encoded map, if unsharded; empty=>dirty
742 uint32_t needs_reshard_begin
= 0;
743 uint32_t needs_reshard_end
= 0;
745 bool needs_reshard() const {
746 return needs_reshard_end
> needs_reshard_begin
;
748 void clear_needs_reshard() {
749 needs_reshard_begin
= needs_reshard_end
= 0;
751 void request_reshard(uint32_t begin
, uint32_t end
) {
752 if (begin
< needs_reshard_begin
) {
753 needs_reshard_begin
= begin
;
755 if (end
> needs_reshard_end
) {
756 needs_reshard_end
= end
;
760 struct DeleteDisposer
{
761 void operator()(Extent
*e
) { delete e
; }
766 extent_map
.clear_and_dispose(DeleteDisposer());
770 extent_map
.clear_and_dispose(DeleteDisposer());
773 clear_needs_reshard();
776 bool encode_some(uint32_t offset
, uint32_t length
, bufferlist
& bl
,
778 unsigned decode_some(bufferlist
& bl
);
780 void bound_encode_spanning_blobs(size_t& p
);
781 void encode_spanning_blobs(bufferlist::contiguous_appender
& p
);
782 void decode_spanning_blobs(bufferptr::iterator
& p
);
784 BlobRef
get_spanning_blob(int id
) {
785 auto p
= spanning_blob_map
.find(id
);
786 assert(p
!= spanning_blob_map
.end());
790 void update(KeyValueDB::Transaction t
, bool force
);
791 decltype(BlueStore::Blob::id
) allocate_spanning_blob_id();
794 KeyValueDB::Transaction t
);
796 /// initialize Shards from the onode
797 void init_shards(bool loaded
, bool dirty
);
799 /// return index of shard containing offset
800 /// or -1 if not found
801 int seek_shard(uint32_t offset
) {
802 size_t end
= shards
.size();
803 size_t mid
, left
= 0;
804 size_t right
= end
; // one passed the right end
806 while (left
< right
) {
807 mid
= left
+ (right
- left
) / 2;
808 if (offset
>= shards
[mid
].shard_info
->offset
) {
809 size_t next
= mid
+ 1;
810 if (next
>= end
|| offset
< shards
[next
].shard_info
->offset
)
812 //continue to search forwards
815 //continue to search backwards
820 return -1; // not found
823 /// check if a range spans a shard
824 bool spans_shard(uint32_t offset
, uint32_t length
) {
825 if (shards
.empty()) {
828 int s
= seek_shard(offset
);
830 if (s
== (int)shards
.size() - 1) {
831 return false; // last shard
833 if (offset
+ length
<= shards
[s
+1].shard_info
->offset
) {
839 /// ensure that a range of the map is loaded
840 void fault_range(KeyValueDB
*db
,
841 uint32_t offset
, uint32_t length
);
843 /// ensure a range of the map is marked dirty
844 void dirty_range(uint32_t offset
, uint32_t length
);
846 /// for seek_lextent test
847 extent_map_t::iterator
find(uint64_t offset
);
849 /// seek to the first lextent including or after offset
850 extent_map_t::iterator
seek_lextent(uint64_t offset
);
851 extent_map_t::const_iterator
seek_lextent(uint64_t offset
) const;
854 void add(uint32_t lo
, uint32_t o
, uint32_t l
, BlobRef
& b
) {
855 extent_map
.insert(*new Extent(lo
, o
, l
, b
));
858 /// remove (and delete) an Extent
859 void rm(extent_map_t::iterator p
) {
860 extent_map
.erase_and_dispose(p
, DeleteDisposer());
863 bool has_any_lextents(uint64_t offset
, uint64_t length
);
865 /// consolidate adjacent lextents in extent_map
866 int compress_extent_map(uint64_t offset
, uint64_t length
);
868 /// punch a logical hole. add lextents to deref to target list.
869 void punch_hole(CollectionRef
&c
,
870 uint64_t offset
, uint64_t length
,
871 old_extent_map_t
*old_extents
);
873 /// put new lextent into lextent_map overwriting existing ones if
874 /// any and update references accordingly
875 Extent
*set_lextent(CollectionRef
&c
,
876 uint64_t logical_offset
,
877 uint64_t offset
, uint64_t length
,
879 old_extent_map_t
*old_extents
);
881 /// split a blob (and referring extents)
882 BlobRef
split_blob(BlobRef lb
, uint32_t blob_offset
, uint32_t pos
);
885 /// Compressed Blob Garbage collector
/*
  The primary idea of the collector is to estimate a difference between
  allocation units (AU) currently present for compressed blobs and new AUs
  required to store that data uncompressed.
  Estimation is performed for protrusive extents within a logical range
  determined by a concatenation of old_extents collection and specific
  (current) write request.
  The root cause for old_extents use is the need to handle blob ref counts
  properly. Old extents still hold blob refs and hence we need to traverse
  the collection to determine if a blob is to be released.
  Protrusive extents are extents that fit into the blob set in action
  (ones that are below the logical range from above) but not removed totally
  due to the current write.
  Sample (see upstream header for the full worked example):
  extent1 <loffs = 100, boffs = 100, len = 100> ->
    blob1<compressed, len_on_disk=4096, logical_len=8192>
  extent2 <loffs = 200, boffs = 200, len = 100> ->
    blob2<raw, len_on_disk=4096, llen=4096>
  extent3 <loffs = 300, boffs = 300, len = 100> ->
    blob1<compressed, len_on_disk=4096, llen=8192>
  extent4 <loffs = 4096, boffs = 0, len = 100> ->
    blob3<raw, len_on_disk=4096, llen=4096>
  Protrusive extents are within the following ranges <0~300, 400~8192-400>.
  In this case existing AUs that might be removed due to GC (i.e. blob1)
  can be freed, and new AUs expected after GC = 0 since extent1 is to be
  merged into blob2. Hence we should do a collect.
*/
915 class GarbageCollector
918 /// return amount of allocation units that might be saved due to GC
922 const ExtentMap
& extent_map
,
923 const old_extent_map_t
& old_extents
,
924 uint64_t min_alloc_size
);
926 /// return a collection of extents to perform GC on
927 const vector
<AllocExtent
>& get_extents_to_collect() const {
928 return extents_to_collect
;
930 GarbageCollector(CephContext
* _cct
) : cct(_cct
) {}
934 uint64_t referenced_bytes
= 0; ///< amount of bytes referenced in blob
935 int64_t expected_allocations
= 0; ///< new alloc units required
936 ///< in case of gc fulfilled
937 bool collect_candidate
= false; ///< indicate if blob has any extents
938 ///< eligible for GC.
939 extent_map_t::const_iterator first_lextent
; ///< points to the first
940 ///< lextent referring to
941 ///< the blob if any.
942 ///< collect_candidate flag
943 ///< determines the validity
944 extent_map_t::const_iterator last_lextent
; ///< points to the last
945 ///< lextent referring to
946 ///< the blob if any.
948 BlobInfo(uint64_t ref_bytes
) :
949 referenced_bytes(ref_bytes
) {
953 map
<Blob
*, BlobInfo
> affected_blobs
; ///< compressed blobs and their ref_map
954 ///< copies that are affected by the
957 vector
<AllocExtent
> extents_to_collect
; ///< protrusive extents that should
958 ///< be collected if GC takes place
960 boost::optional
<uint64_t > used_alloc_unit
; ///< last processed allocation
961 ///< unit when traversing
962 ///< protrusive extents.
963 ///< Other extents mapped to
964 ///< this AU to be ignored
965 ///< (except the case where
966 ///< uncompressed extent follows
967 ///< compressed one - see below).
968 BlobInfo
* blob_info_counted
= nullptr; ///< set if previous allocation unit
969 ///< caused expected_allocations
970 ///< counter increment at this blob.
971 ///< if uncompressed extent follows
972 ///< a decrement for the
973 ///< expected_allocations counter
975 int64_t expected_allocations
= 0; ///< new alloc units required in case
977 int64_t expected_for_release
= 0; ///< alloc units currently used by
978 ///< compressed blobs that might
980 uint64_t gc_start_offset
; ///starting offset for GC
981 uint64_t gc_end_offset
; ///ending offset for GC
984 void process_protrusive_extents(const BlueStore::ExtentMap
& extent_map
,
985 uint64_t start_offset
,
987 uint64_t start_touch_offset
,
988 uint64_t end_touch_offset
,
989 uint64_t min_alloc_size
);
994 /// an in-memory object
996 MEMPOOL_CLASS_HELPERS();
998 std::atomic_int nref
; ///< reference count
1003 /// key under PREFIX_OBJ where we are stored
1004 mempool::bluestore_cache_other::string key
;
1006 boost::intrusive::list_member_hook
<> lru_item
;
1008 bluestore_onode_t onode
; ///< metadata stored as value in kv store
1009 bool exists
; ///< true if object logically exists
1011 ExtentMap extent_map
;
1013 // track txc's that have not been committed to kv store (and whose
1014 // effects cannot be read via the kvdb read methods)
1015 std::atomic
<int> flushing_count
= {0};
1016 std::mutex flush_lock
; ///< protect flush_txns
1017 std::condition_variable flush_cond
; ///< wait here for uncommitted txns
1019 Onode(Collection
*c
, const ghobject_t
& o
,
1020 const mempool::bluestore_cache_other::string
& k
)
1038 typedef boost::intrusive_ptr
<Onode
> OnodeRef
;
1041 /// a cache (shard) of onodes and buffers
1044 PerfCounters
*logger
;
1045 std::recursive_mutex lock
; ///< protect lru and other structures
1047 std::atomic
<uint64_t> num_extents
= {0};
1048 std::atomic
<uint64_t> num_blobs
= {0};
1050 static Cache
*create(CephContext
* cct
, string type
, PerfCounters
*logger
);
1052 Cache(CephContext
* cct
) : cct(cct
), logger(nullptr) {}
1055 virtual void _add_onode(OnodeRef
& o
, int level
) = 0;
1056 virtual void _rm_onode(OnodeRef
& o
) = 0;
1057 virtual void _touch_onode(OnodeRef
& o
) = 0;
1059 virtual void _add_buffer(Buffer
*b
, int level
, Buffer
*near
) = 0;
1060 virtual void _rm_buffer(Buffer
*b
) = 0;
1061 virtual void _move_buffer(Cache
*src
, Buffer
*b
) = 0;
1062 virtual void _adjust_buffer_size(Buffer
*b
, int64_t delta
) = 0;
1063 virtual void _touch_buffer(Buffer
*b
) = 0;
1065 virtual uint64_t _get_num_onodes() = 0;
1066 virtual uint64_t _get_buffer_bytes() = 0;
1082 void trim(uint64_t target_bytes
,
1083 float target_meta_ratio
,
1084 float target_data_ratio
,
1085 float bytes_per_onode
);
1089 virtual void _trim(uint64_t onode_max
, uint64_t buffer_max
) = 0;
1091 virtual void add_stats(uint64_t *onodes
, uint64_t *extents
,
1094 uint64_t *bytes
) = 0;
1097 std::lock_guard
<std::recursive_mutex
> l(lock
);
1098 return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
1102 virtual void _audit(const char *s
) = 0;
1104 void _audit(const char *s
) { /* no-op */ }
1108 /// simple LRU cache for onodes and buffers
1109 struct LRUCache
: public Cache
{
1111 typedef boost::intrusive::list
<
1113 boost::intrusive::member_hook
<
1115 boost::intrusive::list_member_hook
<>,
1116 &Onode::lru_item
> > onode_lru_list_t
;
1117 typedef boost::intrusive::list
<
1119 boost::intrusive::member_hook
<
1121 boost::intrusive::list_member_hook
<>,
1122 &Buffer::lru_item
> > buffer_lru_list_t
;
1124 onode_lru_list_t onode_lru
;
1126 buffer_lru_list_t buffer_lru
;
1127 uint64_t buffer_size
= 0;
1130 LRUCache(CephContext
* cct
) : Cache(cct
) {}
1131 uint64_t _get_num_onodes() override
{
1132 return onode_lru
.size();
1134 void _add_onode(OnodeRef
& o
, int level
) override
{
1136 onode_lru
.push_front(*o
);
1138 onode_lru
.push_back(*o
);
1140 void _rm_onode(OnodeRef
& o
) override
{
1141 auto q
= onode_lru
.iterator_to(*o
);
1144 void _touch_onode(OnodeRef
& o
) override
;
1146 uint64_t _get_buffer_bytes() override
{
1149 void _add_buffer(Buffer
*b
, int level
, Buffer
*near
) override
{
1151 auto q
= buffer_lru
.iterator_to(*near
);
1152 buffer_lru
.insert(q
, *b
);
1153 } else if (level
> 0) {
1154 buffer_lru
.push_front(*b
);
1156 buffer_lru
.push_back(*b
);
1158 buffer_size
+= b
->length
;
1160 void _rm_buffer(Buffer
*b
) override
{
1161 assert(buffer_size
>= b
->length
);
1162 buffer_size
-= b
->length
;
1163 auto q
= buffer_lru
.iterator_to(*b
);
1164 buffer_lru
.erase(q
);
1166 void _move_buffer(Cache
*src
, Buffer
*b
) override
{
1168 _add_buffer(b
, 0, nullptr);
1170 void _adjust_buffer_size(Buffer
*b
, int64_t delta
) override
{
1171 assert((int64_t)buffer_size
+ delta
>= 0);
1172 buffer_size
+= delta
;
1174 void _touch_buffer(Buffer
*b
) override
{
1175 auto p
= buffer_lru
.iterator_to(*b
);
1176 buffer_lru
.erase(p
);
1177 buffer_lru
.push_front(*b
);
1178 _audit("_touch_buffer end");
1181 void _trim(uint64_t onode_max
, uint64_t buffer_max
) override
;
1183 void add_stats(uint64_t *onodes
, uint64_t *extents
,
1186 uint64_t *bytes
) override
{
1187 std::lock_guard
<std::recursive_mutex
> l(lock
);
1188 *onodes
+= onode_lru
.size();
1189 *extents
+= num_extents
;
1190 *blobs
+= num_blobs
;
1191 *buffers
+= buffer_lru
.size();
1192 *bytes
+= buffer_size
;
1196 void _audit(const char *s
) override
;
1200 // 2Q cache for buffers, LRU for onodes
1201 struct TwoQCache
: public Cache
{
1203 // stick with LRU for onodes for now (fixme?)
1204 typedef boost::intrusive::list
<
1206 boost::intrusive::member_hook
<
1208 boost::intrusive::list_member_hook
<>,
1209 &Onode::lru_item
> > onode_lru_list_t
;
1210 typedef boost::intrusive::list
<
1212 boost::intrusive::member_hook
<
1214 boost::intrusive::list_member_hook
<>,
1215 &Buffer::lru_item
> > buffer_list_t
;
1217 onode_lru_list_t onode_lru
;
1219 buffer_list_t buffer_hot
; ///< "Am" hot buffers
1220 buffer_list_t buffer_warm_in
; ///< "A1in" newly warm buffers
1221 buffer_list_t buffer_warm_out
; ///< "A1out" empty buffers we've evicted
1222 uint64_t buffer_bytes
= 0; ///< bytes
1226 BUFFER_WARM_IN
, ///< in buffer_warm_in
1227 BUFFER_WARM_OUT
, ///< in buffer_warm_out
1228 BUFFER_HOT
, ///< in buffer_hot
1232 uint64_t buffer_list_bytes
[BUFFER_TYPE_MAX
] = {0}; ///< bytes per type
1235 TwoQCache(CephContext
* cct
) : Cache(cct
) {}
1236 uint64_t _get_num_onodes() override
{
1237 return onode_lru
.size();
1239 void _add_onode(OnodeRef
& o
, int level
) override
{
1241 onode_lru
.push_front(*o
);
1243 onode_lru
.push_back(*o
);
1245 void _rm_onode(OnodeRef
& o
) override
{
1246 auto q
= onode_lru
.iterator_to(*o
);
1249 void _touch_onode(OnodeRef
& o
) override
;
1251 uint64_t _get_buffer_bytes() override
{
1252 return buffer_bytes
;
1254 void _add_buffer(Buffer
*b
, int level
, Buffer
*near
) override
;
1255 void _rm_buffer(Buffer
*b
) override
;
1256 void _move_buffer(Cache
*src
, Buffer
*b
) override
;
1257 void _adjust_buffer_size(Buffer
*b
, int64_t delta
) override
;
1258 void _touch_buffer(Buffer
*b
) override
{
1259 switch (b
->cache_private
) {
1260 case BUFFER_WARM_IN
:
1261 // do nothing (somewhat counter-intuitively!)
1263 case BUFFER_WARM_OUT
:
1264 // move from warm_out to hot LRU
1265 assert(0 == "this happens via discard hint");
1268 // move to front of hot LRU
1269 buffer_hot
.erase(buffer_hot
.iterator_to(*b
));
1270 buffer_hot
.push_front(*b
);
1273 _audit("_touch_buffer end");
1276 void _trim(uint64_t onode_max
, uint64_t buffer_max
) override
;
1278 void add_stats(uint64_t *onodes
, uint64_t *extents
,
1281 uint64_t *bytes
) override
{
1282 std::lock_guard
<std::recursive_mutex
> l(lock
);
1283 *onodes
+= onode_lru
.size();
1284 *extents
+= num_extents
;
1285 *blobs
+= num_blobs
;
1286 *buffers
+= buffer_hot
.size() + buffer_warm_in
.size();
1287 *bytes
+= buffer_bytes
;
1291 void _audit(const char *s
) override
;
1300 mempool::bluestore_cache_other::unordered_map
<ghobject_t
,OnodeRef
> onode_map
;
1302 friend class Collection
; // for split_cache()
1305 OnodeSpace(Cache
*c
) : cache(c
) {}
1310 OnodeRef
add(const ghobject_t
& oid
, OnodeRef o
);
1311 OnodeRef
lookup(const ghobject_t
& o
);
1312 void remove(const ghobject_t
& oid
) {
1313 onode_map
.erase(oid
);
1315 void rename(OnodeRef
& o
, const ghobject_t
& old_oid
,
1316 const ghobject_t
& new_oid
,
1317 const mempool::bluestore_cache_other::string
& new_okey
);
1321 /// return true if f true for any item
1322 bool map_any(std::function
<bool(OnodeRef
)> f
);
1325 struct Collection
: public CollectionImpl
{
1327 Cache
*cache
; ///< our cache shard
1329 bluestore_cnode_t cnode
;
1334 SharedBlobSet shared_blob_set
; ///< open SharedBlobs
1336 // cache onodes on a per-collection basis to avoid lock
1338 OnodeSpace onode_map
;
1341 pool_opts_t pool_opts
;
1343 OnodeRef
get_onode(const ghobject_t
& oid
, bool create
);
1345 // the terminology is confusing here, sorry!
1347 // blob_t shared_blob_t
1348 // !shared unused -> open
1349 // shared !loaded -> open + shared
1350 // shared loaded -> open + shared + loaded
1353 // open = SharedBlob is instantiated
1354 // shared = blob_t shared flag is set; SharedBlob is hashed.
1355 // loaded = SharedBlob::shared_blob_t is loaded from kv store
1356 void open_shared_blob(uint64_t sbid
, BlobRef b
);
1357 void load_shared_blob(SharedBlobRef sb
);
1358 void make_blob_shared(uint64_t sbid
, BlobRef b
);
1359 uint64_t make_blob_unshared(SharedBlob
*sb
);
1361 BlobRef
new_blob() {
1362 BlobRef b
= new Blob();
1363 b
->shared_blob
= new SharedBlob(this);
1367 const coll_t
&get_cid() override
{
1371 bool contains(const ghobject_t
& oid
) {
1373 return oid
.hobj
.pool
== -1;
1375 if (cid
.is_pg(&spgid
))
1377 spgid
.pgid
.contains(cnode
.bits
, oid
) &&
1378 oid
.shard_id
== spgid
.shard
;
1382 void split_cache(Collection
*dest
);
1384 Collection(BlueStore
*ns
, Cache
*ca
, coll_t c
);
1387 class OmapIteratorImpl
: public ObjectMap::ObjectMapIteratorImpl
{
1390 KeyValueDB::Iterator it
;
1393 OmapIteratorImpl(CollectionRef c
, OnodeRef o
, KeyValueDB::Iterator it
);
1394 int seek_to_first() override
;
1395 int upper_bound(const string
&after
) override
;
1396 int lower_bound(const string
&to
) override
;
1397 bool valid() override
;
1398 int next(bool validate
=true) override
;
1399 string
key() override
;
1400 bufferlist
value() override
;
1401 int status() override
{
1407 typedef boost::intrusive_ptr
<OpSequencer
> OpSequencerRef
;
1409 struct volatile_statfs
{
1411 STATFS_ALLOCATED
= 0,
1413 STATFS_COMPRESSED_ORIGINAL
,
1415 STATFS_COMPRESSED_ALLOCATED
,
1418 int64_t values
[STATFS_LAST
];
1420 memset(this, 0, sizeof(volatile_statfs
));
1423 *this = volatile_statfs();
1425 volatile_statfs
& operator+=(const volatile_statfs
& other
) {
1426 for (size_t i
= 0; i
< STATFS_LAST
; ++i
) {
1427 values
[i
] += other
.values
[i
];
1431 int64_t& allocated() {
1432 return values
[STATFS_ALLOCATED
];
1435 return values
[STATFS_STORED
];
1437 int64_t& compressed_original() {
1438 return values
[STATFS_COMPRESSED_ORIGINAL
];
1440 int64_t& compressed() {
1441 return values
[STATFS_COMPRESSED
];
1443 int64_t& compressed_allocated() {
1444 return values
[STATFS_COMPRESSED_ALLOCATED
];
1447 return values
[STATFS_ALLOCATED
] == 0 &&
1448 values
[STATFS_STORED
] == 0 &&
1449 values
[STATFS_COMPRESSED
] == 0 &&
1450 values
[STATFS_COMPRESSED_ORIGINAL
] == 0 &&
1451 values
[STATFS_COMPRESSED_ALLOCATED
] == 0;
1453 void decode(bufferlist::iterator
& it
) {
1454 for (size_t i
= 0; i
< STATFS_LAST
; i
++) {
1455 ::decode(values
[i
], it
);
1459 void encode(bufferlist
& bl
) {
1460 for (size_t i
= 0; i
< STATFS_LAST
; i
++) {
1461 ::encode(values
[i
], bl
);
1466 struct TransContext
: public AioContext
{
1467 MEMPOOL_CLASS_HELPERS();
1473 STATE_KV_QUEUED
, // queued for kv_sync_thread submission
1474 STATE_KV_SUBMITTED
, // submitted to kv; not yet synced
1476 STATE_DEFERRED_QUEUED
, // in deferred_queue (pending or running)
1477 STATE_DEFERRED_CLEANUP
, // remove deferred kv record
1478 STATE_DEFERRED_DONE
,
1483 state_t state
= STATE_PREPARE
;
1485 const char *get_state_name() {
1487 case STATE_PREPARE
: return "prepare";
1488 case STATE_AIO_WAIT
: return "aio_wait";
1489 case STATE_IO_DONE
: return "io_done";
1490 case STATE_KV_QUEUED
: return "kv_queued";
1491 case STATE_KV_SUBMITTED
: return "kv_submitted";
1492 case STATE_KV_DONE
: return "kv_done";
1493 case STATE_DEFERRED_QUEUED
: return "deferred_queued";
1494 case STATE_DEFERRED_CLEANUP
: return "deferred_cleanup";
1495 case STATE_DEFERRED_DONE
: return "deferred_done";
1496 case STATE_FINISHING
: return "finishing";
1497 case STATE_DONE
: return "done";
1502 #if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1503 const char *get_state_latency_name(int state
) {
1505 case l_bluestore_state_prepare_lat
: return "prepare";
1506 case l_bluestore_state_aio_wait_lat
: return "aio_wait";
1507 case l_bluestore_state_io_done_lat
: return "io_done";
1508 case l_bluestore_state_kv_queued_lat
: return "kv_queued";
1509 case l_bluestore_state_kv_committing_lat
: return "kv_committing";
1510 case l_bluestore_state_kv_done_lat
: return "kv_done";
1511 case l_bluestore_state_deferred_queued_lat
: return "deferred_queued";
1512 case l_bluestore_state_deferred_cleanup_lat
: return "deferred_cleanup";
1513 case l_bluestore_state_finishing_lat
: return "finishing";
1514 case l_bluestore_state_done_lat
: return "done";
1520 void log_state_latency(PerfCounters
*logger
, int state
) {
1521 utime_t lat
, now
= ceph_clock_now();
1522 lat
= now
- last_stamp
;
1523 logger
->tinc(state
, lat
);
1524 #if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1525 if (state
>= l_bluestore_state_prepare_lat
&& state
<= l_bluestore_state_done_lat
) {
1526 double usecs
= (now
.to_nsec()-last_stamp
.to_nsec())/1000;
1527 OID_ELAPSED("", usecs
, get_state_latency_name(state
));
1534 boost::intrusive::list_member_hook
<> sequencer_item
;
1536 uint64_t bytes
= 0, cost
= 0;
1538 set
<OnodeRef
> onodes
; ///< these need to be updated/written
1539 set
<OnodeRef
> modified_objects
; ///< objects we modified (and need a ref)
1540 set
<SharedBlobRef
> shared_blobs
; ///< these need to be updated/written
1541 set
<SharedBlobRef
> shared_blobs_written
; ///< update these on io completion
1543 KeyValueDB::Transaction t
; ///< then we will commit this
1544 Context
*oncommit
= nullptr; ///< signal on commit
1545 Context
*onreadable
= nullptr; ///< signal on readable
1546 Context
*onreadable_sync
= nullptr; ///< signal on readable
1547 list
<Context
*> oncommits
; ///< more commit completions
1548 list
<CollectionRef
> removed_collections
; ///< colls we removed
1550 boost::intrusive::list_member_hook
<> deferred_queue_item
;
1551 bluestore_deferred_transaction_t
*deferred_txn
= nullptr; ///< if any
1553 interval_set
<uint64_t> allocated
, released
;
1554 volatile_statfs statfs_delta
;
1557 bool had_ios
= false; ///< true if we submitted IOs before our kv txn
1559 CollectionRef first_collection
; ///< first referenced collection
1565 uint64_t last_nid
= 0; ///< if non-zero, highest new nid we allocated
1566 uint64_t last_blobid
= 0; ///< if non-zero, highest new blobid we allocated
1568 explicit TransContext(CephContext
* cct
, OpSequencer
*o
)
1571 start(ceph_clock_now()) {
1575 delete deferred_txn
;
1578 void write_onode(OnodeRef
&o
) {
1581 void write_shared_blob(SharedBlobRef
&sb
) {
1582 shared_blobs
.insert(sb
);
1584 void unshare_blob(SharedBlob
*sb
) {
1585 shared_blobs
.erase(sb
);
1588 /// note we logically modified object (when onode itself is unmodified)
1589 void note_modified_object(OnodeRef
&o
) {
1590 // onode itself isn't written, though
1591 modified_objects
.insert(o
);
1593 void removed(OnodeRef
& o
) {
1595 modified_objects
.erase(o
);
1598 void aio_finish(BlueStore
*store
) override
{
1599 store
->txc_aio_finish(this);
1603 typedef boost::intrusive::list
<
1605 boost::intrusive::member_hook
<
1607 boost::intrusive::list_member_hook
<>,
1608 &TransContext::deferred_queue_item
> > deferred_queue_t
;
1610 struct DeferredBatch
: public AioContext
{
1612 struct deferred_io
{
1613 bufferlist bl
; ///< data
1614 uint64_t seq
; ///< deferred transaction seq
1616 map
<uint64_t,deferred_io
> iomap
; ///< map of ios in this batch
1617 deferred_queue_t txcs
; ///< txcs in this batch
1618 IOContext ioc
; ///< our aios
1619 /// bytes of pending io for each deferred seq (may be 0)
1620 map
<uint64_t,int> seq_bytes
;
1622 void _discard(CephContext
*cct
, uint64_t offset
, uint64_t length
);
1623 void _audit(CephContext
*cct
);
1625 DeferredBatch(CephContext
*cct
, OpSequencer
*osr
)
1626 : osr(osr
), ioc(cct
, this) {}
1629 void prepare_write(CephContext
*cct
,
1630 uint64_t seq
, uint64_t offset
, uint64_t length
,
1631 bufferlist::const_iterator
& p
);
1633 void aio_finish(BlueStore
*store
) override
{
1634 store
->_deferred_aio_finish(osr
);
1638 class OpSequencer
: public Sequencer_impl
{
1641 std::condition_variable qcond
;
1642 typedef boost::intrusive::list
<
1644 boost::intrusive::member_hook
<
1646 boost::intrusive::list_member_hook
<>,
1647 &TransContext::sequencer_item
> > q_list_t
;
1648 q_list_t q
; ///< transactions
1650 boost::intrusive::list_member_hook
<> deferred_osr_queue_item
;
1652 DeferredBatch
*deferred_running
= nullptr;
1653 DeferredBatch
*deferred_pending
= nullptr;
1658 uint64_t last_seq
= 0;
1660 std::atomic_int txc_with_unstable_io
= {0}; ///< num txcs with unstable io
1662 std::atomic_int kv_committing_serially
= {0};
1664 std::atomic_int kv_submitted_waiters
= {0};
1666 std::atomic_bool registered
= {true}; ///< registered in BlueStore's osr_set
1667 std::atomic_bool zombie
= {false}; ///< owning Sequencer has gone away
1669 OpSequencer(CephContext
* cct
, BlueStore
*store
)
1670 : Sequencer_impl(cct
),
1671 parent(NULL
), store(store
) {
1672 store
->register_osr(this);
1674 ~OpSequencer() override
{
1679 void discard() override
{
1680 // Note that we may have txc's in flight when the parent Sequencer
1681 // goes away. Reflect this with zombie==registered==true and let
1682 // _osr_drain_all clean up later.
1688 std::lock_guard
<std::mutex
> l(qlock
);
1696 void _unregister() {
1698 store
->unregister_osr(this);
1703 void queue_new(TransContext
*txc
) {
1704 std::lock_guard
<std::mutex
> l(qlock
);
1705 txc
->seq
= ++last_seq
;
1710 std::unique_lock
<std::mutex
> l(qlock
);
1715 void drain_preceding(TransContext
*txc
) {
1716 std::unique_lock
<std::mutex
> l(qlock
);
1717 while (!q
.empty() && &q
.front() != txc
)
1721 bool _is_all_kv_submitted() {
1722 // caller must hold qlock
1726 TransContext
*txc
= &q
.back();
1727 if (txc
->state
>= TransContext::STATE_KV_SUBMITTED
) {
1733 void flush() override
{
1734 std::unique_lock
<std::mutex
> l(qlock
);
1736 // set flag before the check because the condition
1737 // may become true outside qlock, and we need to make
1738 // sure those threads see waiters and signal qcond.
1739 ++kv_submitted_waiters
;
1740 if (_is_all_kv_submitted()) {
1744 --kv_submitted_waiters
;
1748 bool flush_commit(Context
*c
) override
{
1749 std::lock_guard
<std::mutex
> l(qlock
);
1753 TransContext
*txc
= &q
.back();
1754 if (txc
->state
>= TransContext::STATE_KV_DONE
) {
1757 txc
->oncommits
.push_back(c
);
1762 typedef boost::intrusive::list
<
1764 boost::intrusive::member_hook
<
1766 boost::intrusive::list_member_hook
<>,
1767 &OpSequencer::deferred_osr_queue_item
> > deferred_osr_queue_t
;
1769 struct KVSyncThread
: public Thread
{
1771 explicit KVSyncThread(BlueStore
*s
) : store(s
) {}
1772 void *entry() override
{
1773 store
->_kv_sync_thread();
1777 struct KVFinalizeThread
: public Thread
{
1779 explicit KVFinalizeThread(BlueStore
*s
) : store(s
) {}
1781 store
->_kv_finalize_thread();
1786 struct DBHistogram
{
1795 map
<int, struct value_dist
> val_map
; ///< slab id to count, max length of value and key
1798 map
<string
, map
<int, struct key_dist
> > key_hist
;
1799 map
<int, uint64_t> value_hist
;
1800 int get_key_slab(size_t sz
);
1801 string
get_key_slab_to_range(int slab
);
1802 int get_value_slab(size_t sz
);
1803 string
get_value_slab_to_range(int slab
);
1804 void update_hist_entry(map
<string
, map
<int, struct key_dist
> > &key_hist
,
1805 const string
&prefix
, size_t key_size
, size_t value_size
);
1806 void dump(Formatter
*f
);
1809 // --------------------------------------------------------
1812 BlueFS
*bluefs
= nullptr;
1813 unsigned bluefs_shared_bdev
= 0; ///< which bluefs bdev we are sharing
1814 bool bluefs_single_shared_device
= true;
1815 utime_t bluefs_last_balance
;
1817 KeyValueDB
*db
= nullptr;
1818 BlockDevice
*bdev
= nullptr;
1819 std::string freelist_type
;
1820 FreelistManager
*fm
= nullptr;
1821 Allocator
*alloc
= nullptr;
1823 int path_fd
= -1; ///< open handle to $path
1824 int fsid_fd
= -1; ///< open handle (locked) to $path/fsid
1825 bool mounted
= false;
1827 RWLock coll_lock
= {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map
1828 mempool::bluestore_cache_other::unordered_map
<coll_t
, CollectionRef
> coll_map
;
1830 vector
<Cache
*> cache_shards
;
1832 std::mutex osr_lock
; ///< protect osd_set
1833 std::set
<OpSequencerRef
> osr_set
; ///< set of all OpSequencers
1835 std::atomic
<uint64_t> nid_last
= {0};
1836 std::atomic
<uint64_t> nid_max
= {0};
1837 std::atomic
<uint64_t> blobid_last
= {0};
1838 std::atomic
<uint64_t> blobid_max
= {0};
1840 Throttle throttle_bytes
; ///< submit to commit
1841 Throttle throttle_deferred_bytes
; ///< submit to deferred complete
1843 interval_set
<uint64_t> bluefs_extents
; ///< block extents owned by bluefs
1844 interval_set
<uint64_t> bluefs_extents_reclaiming
; ///< currently reclaiming
1846 std::mutex deferred_lock
, deferred_submit_lock
;
1847 std::atomic
<uint64_t> deferred_seq
= {0};
1848 deferred_osr_queue_t deferred_queue
; ///< osr's with deferred io pending
1849 int deferred_queue_size
= 0; ///< num txc's queued across all osrs
1850 atomic_int deferred_aggressive
= {0}; ///< aggressive wakeup of kv thread
1852 int m_finisher_num
= 1;
1853 vector
<Finisher
*> finishers
;
1855 KVSyncThread kv_sync_thread
;
1857 std::condition_variable kv_cond
;
1858 bool kv_sync_started
= false;
1859 bool kv_stop
= false;
1860 bool kv_finalize_started
= false;
1861 bool kv_finalize_stop
= false;
1862 deque
<TransContext
*> kv_queue
; ///< ready, already submitted
1863 deque
<TransContext
*> kv_queue_unsubmitted
; ///< ready, need submit by kv thread
1864 deque
<TransContext
*> kv_committing
; ///< currently syncing
1865 deque
<DeferredBatch
*> deferred_done_queue
; ///< deferred ios done
1866 deque
<DeferredBatch
*> deferred_stable_queue
; ///< deferred ios done + stable
1868 KVFinalizeThread kv_finalize_thread
;
1869 std::mutex kv_finalize_lock
;
1870 std::condition_variable kv_finalize_cond
;
1871 deque
<TransContext
*> kv_committing_to_finalize
; ///< pending finalization
1872 deque
<DeferredBatch
*> deferred_stable_to_finalize
; ///< pending finalization
1874 PerfCounters
*logger
= nullptr;
1876 std::mutex reap_lock
;
1877 list
<CollectionRef
> removed_collections
;
1879 RWLock debug_read_error_lock
= {"BlueStore::debug_read_error_lock"};
1880 set
<ghobject_t
> debug_data_error_objects
;
1881 set
<ghobject_t
> debug_mdata_error_objects
;
1883 std::atomic
<int> csum_type
= {Checksummer::CSUM_CRC32C
};
1885 uint64_t block_size
= 0; ///< block size of block device (power of 2)
1886 uint64_t block_mask
= 0; ///< mask to get just the block offset
1887 size_t block_size_order
= 0; ///< bits to shift to get block size
1889 uint64_t min_alloc_size
= 0; ///< minimum allocation unit (power of 2)
1890 ///< bits for min_alloc_size
1891 uint8_t min_alloc_size_order
= 0;
1892 static_assert(std::numeric_limits
<uint8_t>::max() >
1893 std::numeric_limits
<decltype(min_alloc_size
)>::digits
,
1894 "not enough bits for min_alloc_size");
1896 ///< maximum allocation unit (power of 2)
1897 std::atomic
<uint64_t> max_alloc_size
= {0};
1899 ///< number threshold for forced deferred writes
1900 std::atomic
<int> deferred_batch_ops
= {0};
1902 ///< size threshold for forced deferred writes
1903 std::atomic
<uint64_t> prefer_deferred_size
= {0};
1905 ///< approx cost per io, in bytes
1906 std::atomic
<uint64_t> throttle_cost_per_io
= {0};
1908 std::atomic
<Compressor::CompressionMode
> comp_mode
=
1909 {Compressor::COMP_NONE
}; ///< compression mode
1910 CompressorRef compressor
;
1911 std::atomic
<uint64_t> comp_min_blob_size
= {0};
1912 std::atomic
<uint64_t> comp_max_blob_size
= {0};
1914 std::atomic
<uint64_t> max_blob_size
= {0}; ///< maximum blob size
1916 uint64_t kv_ios
= 0;
1917 uint64_t kv_throttle_costs
= 0;
1919 // cache trim control
1920 uint64_t cache_size
= 0; ///< total cache size
1921 float cache_meta_ratio
= 0; ///< cache ratio dedicated to metadata
1922 float cache_kv_ratio
= 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
1923 float cache_data_ratio
= 0; ///< cache ratio dedicated to object data
1925 std::mutex vstatfs_lock
;
1926 volatile_statfs vstatfs
;
1928 struct MempoolThread
: public Thread
{
1934 explicit MempoolThread(BlueStore
*s
)
1936 lock("BlueStore::MempoolThread::lock") {}
1937 void *entry() override
;
1939 assert(stop
== false);
1940 create("bstore_mempool");
1951 // --------------------------------------------------------
1954 void _init_logger();
1955 void _shutdown_logger();
1956 int _reload_logger();
1960 int _open_fsid(bool create
);
1962 int _read_fsid(uuid_d
*f
);
1965 void _set_alloc_sizes();
1966 void _set_blob_size();
1968 int _open_bdev(bool create
);
1970 int _open_db(bool create
);
1972 int _open_fm(bool create
);
1975 void _close_alloc();
1976 int _open_collections(int *errors
=0);
1977 void _close_collections();
1979 int _setup_block_symlink_or_file(string name
, string path
, uint64_t size
,
1982 int _write_bdev_label(string path
, bluestore_bdev_label_t label
);
1984 static int _read_bdev_label(CephContext
* cct
, string path
,
1985 bluestore_bdev_label_t
*label
);
1987 int _check_or_set_bdev_label(string path
, uint64_t size
, string desc
,
1990 int _open_super_meta();
1992 void _open_statfs();
1994 int _reconcile_bluefs_freespace();
1995 int _balance_bluefs_freespace(PExtentVector
*extents
);
1996 void _commit_bluefs_freespace(const PExtentVector
& extents
);
1998 CollectionRef
_get_collection(const coll_t
& cid
);
1999 void _queue_reap_collection(CollectionRef
& c
);
2000 void _reap_collections();
2001 void _update_cache_logger();
2003 void _assign_nid(TransContext
*txc
, OnodeRef o
);
2004 uint64_t _assign_blobid(TransContext
*txc
);
2006 void _dump_onode(OnodeRef o
, int log_level
=30);
2007 void _dump_extent_map(ExtentMap
& em
, int log_level
=30);
2008 void _dump_transaction(Transaction
*t
, int log_level
= 30);
2010 TransContext
*_txc_create(OpSequencer
*osr
);
2011 void _txc_update_store_statfs(TransContext
*txc
);
2012 void _txc_add_transaction(TransContext
*txc
, Transaction
*t
);
2013 void _txc_calc_cost(TransContext
*txc
);
2014 void _txc_write_nodes(TransContext
*txc
, KeyValueDB::Transaction t
);
2015 void _txc_state_proc(TransContext
*txc
);
2016 void _txc_aio_submit(TransContext
*txc
);
2018 void txc_aio_finish(void *p
) {
2019 _txc_state_proc(static_cast<TransContext
*>(p
));
2022 void _txc_finish_io(TransContext
*txc
);
2023 void _txc_finalize_kv(TransContext
*txc
, KeyValueDB::Transaction t
);
2024 void _txc_applied_kv(TransContext
*txc
);
2025 void _txc_committed_kv(TransContext
*txc
);
2026 void _txc_finish(TransContext
*txc
);
2027 void _txc_release_alloc(TransContext
*txc
);
2029 void _osr_drain_preceding(TransContext
*txc
);
2030 void _osr_drain_all();
2031 void _osr_unregister_all();
2035 void _kv_sync_thread();
2036 void _kv_finalize_thread();
2038 bluestore_deferred_op_t
*_get_deferred_op(TransContext
*txc
, OnodeRef o
);
2039 void _deferred_queue(TransContext
*txc
);
2040 void deferred_try_submit();
2041 void _deferred_submit_unlock(OpSequencer
*osr
);
2042 void _deferred_aio_finish(OpSequencer
*osr
);
2043 int _deferred_replay();
2046 using mempool_dynamic_bitset
=
2047 boost::dynamic_bitset
<uint64_t,
2048 mempool::bluestore_fsck::pool_allocator
<uint64_t>>;
2051 int _fsck_check_extents(
2052 const ghobject_t
& oid
,
2053 const PExtentVector
& extents
,
2055 mempool_dynamic_bitset
&used_blocks
,
2056 store_statfs_t
& expected_statfs
);
2058 void _buffer_cache_write(
2064 b
->shared_blob
->bc
.write(b
->shared_blob
->get_cache(), txc
->seq
, offset
, bl
,
2066 txc
->shared_blobs_written
.insert(b
->shared_blob
);
2069 int _collection_list(
2070 Collection
*c
, const ghobject_t
& start
, const ghobject_t
& end
,
2071 int max
, vector
<ghobject_t
> *ls
, ghobject_t
*next
);
2073 template <typename T
, typename F
>
2074 T
select_option(const std::string
& opt_name
, T val1
, F f
) {
2075 //NB: opt_name reserved for future use
2076 boost::optional
<T
> val2
= f();
2083 void _apply_padding(uint64_t head_pad
,
2085 bufferlist
& padded
);
2087 // -- ondisk version ---
2089 const int32_t latest_ondisk_format
= 2; ///< our version
2090 const int32_t min_readable_ondisk_format
= 1; ///< what we can read
2091 const int32_t min_compat_ondisk_format
= 2; ///< who can read us
2094 int32_t ondisk_format
= 0; ///< value detected on mount
2096 int _upgrade_super(); ///< upgrade (called during open_super)
2097 void _prepare_ondisk_format_super(KeyValueDB::Transaction
& t
);
2099 // --- public interface ---
2101 BlueStore(CephContext
*cct
, const string
& path
);
2102 BlueStore(CephContext
*cct
, const string
& path
, uint64_t min_alloc_size
); // Ctor for UT only
2103 ~BlueStore() override
;
2105 string
get_type() override
{
2109 bool needs_journal() override
{ return false; };
2110 bool wants_journal() override
{ return false; };
2111 bool allows_journal() override
{ return false; };
2113 bool is_rotational() override
;
2115 string
get_default_device_class() override
{
2116 string device_class
;
2117 map
<string
, string
> metadata
;
2118 collect_metadata(&metadata
);
2119 auto it
= metadata
.find("bluestore_bdev_type");
2120 if (it
!= metadata
.end()) {
2121 device_class
= it
->second
;
2123 return device_class
;
2126 static int get_block_device_fsid(CephContext
* cct
, const string
& path
,
2129 bool test_mount_in_use() override
;
2132 int _mount(bool kv_only
);
2134 int mount() override
{
2135 return _mount(false);
2137 int umount() override
;
2139 int start_kv_only(KeyValueDB
**pdb
) {
2140 int r
= _mount(true);
2147 int fsck(bool deep
) override
;
2149 void set_cache_shards(unsigned num
) override
;
2151 int validate_hobject_key(const hobject_t
&obj
) const override
{
2154 unsigned get_max_attr_name_length() override
{
2155 return 256; // arbitrary; there is no real limit internally
2158 int mkfs() override
;
2159 int mkjournal() override
{
2163 void get_db_statistics(Formatter
*f
) override
;
2164 void generate_db_histogram(Formatter
*f
) override
;
2165 void _flush_cache();
2166 void flush_cache() override
;
2167 void dump_perf_counters(Formatter
*f
) override
{
2168 f
->open_object_section("perf_counters");
2169 logger
->dump_formatted(f
, false);
2173 void register_osr(OpSequencer
*osr
) {
2174 std::lock_guard
<std::mutex
> l(osr_lock
);
2175 osr_set
.insert(osr
);
2177 void unregister_osr(OpSequencer
*osr
) {
2178 std::lock_guard
<std::mutex
> l(osr_lock
);
2183 int statfs(struct store_statfs_t
*buf
) override
;
2185 void collect_metadata(map
<string
,string
> *pm
) override
;
2187 bool exists(const coll_t
& cid
, const ghobject_t
& oid
) override
;
2188 bool exists(CollectionHandle
&c
, const ghobject_t
& oid
) override
;
2189 int set_collection_opts(
2191 const pool_opts_t
& opts
) override
;
2194 const ghobject_t
& oid
,
2196 bool allow_eio
= false) override
;
2198 CollectionHandle
&c
,
2199 const ghobject_t
& oid
,
2201 bool allow_eio
= false) override
;
2204 const ghobject_t
& oid
,
2208 uint32_t op_flags
= 0) override
;
2210 CollectionHandle
&c
,
2211 const ghobject_t
& oid
,
2215 uint32_t op_flags
= 0) override
;
2222 uint32_t op_flags
= 0);
2225 int _fiemap(CollectionHandle
&c_
, const ghobject_t
& oid
,
2226 uint64_t offset
, size_t len
, interval_set
<uint64_t>& destset
);
2228 int fiemap(const coll_t
& cid
, const ghobject_t
& oid
,
2229 uint64_t offset
, size_t len
, bufferlist
& bl
) override
;
2230 int fiemap(CollectionHandle
&c
, const ghobject_t
& oid
,
2231 uint64_t offset
, size_t len
, bufferlist
& bl
) override
;
2232 int fiemap(const coll_t
& cid
, const ghobject_t
& oid
,
2233 uint64_t offset
, size_t len
, map
<uint64_t, uint64_t>& destmap
) override
;
2234 int fiemap(CollectionHandle
&c
, const ghobject_t
& oid
,
2235 uint64_t offset
, size_t len
, map
<uint64_t, uint64_t>& destmap
) override
;
2238 int getattr(const coll_t
& cid
, const ghobject_t
& oid
, const char *name
,
2239 bufferptr
& value
) override
;
2240 int getattr(CollectionHandle
&c
, const ghobject_t
& oid
, const char *name
,
2241 bufferptr
& value
) override
;
2243 int getattrs(const coll_t
& cid
, const ghobject_t
& oid
,
2244 map
<string
,bufferptr
>& aset
) override
;
2245 int getattrs(CollectionHandle
&c
, const ghobject_t
& oid
,
2246 map
<string
,bufferptr
>& aset
) override
;
2248 int list_collections(vector
<coll_t
>& ls
) override
;
2250 CollectionHandle
open_collection(const coll_t
&c
) override
;
2252 bool collection_exists(const coll_t
& c
) override
;
2253 int collection_empty(const coll_t
& c
, bool *empty
) override
;
2254 int collection_bits(const coll_t
& c
) override
;
2256 int collection_list(const coll_t
& cid
,
2257 const ghobject_t
& start
,
2258 const ghobject_t
& end
,
2260 vector
<ghobject_t
> *ls
, ghobject_t
*next
) override
;
2261 int collection_list(CollectionHandle
&c
,
2262 const ghobject_t
& start
,
2263 const ghobject_t
& end
,
2265 vector
<ghobject_t
> *ls
, ghobject_t
*next
) override
;
2268 const coll_t
& cid
, ///< [in] Collection containing oid
2269 const ghobject_t
&oid
, ///< [in] Object containing omap
2270 bufferlist
*header
, ///< [out] omap header
2271 map
<string
, bufferlist
> *out
/// < [out] Key to value map
2274 CollectionHandle
&c
, ///< [in] Collection containing oid
2275 const ghobject_t
&oid
, ///< [in] Object containing omap
2276 bufferlist
*header
, ///< [out] omap header
2277 map
<string
, bufferlist
> *out
/// < [out] Key to value map
2281 int omap_get_header(
2282 const coll_t
& cid
, ///< [in] Collection containing oid
2283 const ghobject_t
&oid
, ///< [in] Object containing omap
2284 bufferlist
*header
, ///< [out] omap header
2285 bool allow_eio
= false ///< [in] don't assert on eio
2287 int omap_get_header(
2288 CollectionHandle
&c
, ///< [in] Collection containing oid
2289 const ghobject_t
&oid
, ///< [in] Object containing omap
2290 bufferlist
*header
, ///< [out] omap header
2291 bool allow_eio
= false ///< [in] don't assert on eio
2294 /// Get keys defined on oid
2296 const coll_t
& cid
, ///< [in] Collection containing oid
2297 const ghobject_t
&oid
, ///< [in] Object containing omap
2298 set
<string
> *keys
///< [out] Keys defined on oid
2301 CollectionHandle
&c
, ///< [in] Collection containing oid
2302 const ghobject_t
&oid
, ///< [in] Object containing omap
2303 set
<string
> *keys
///< [out] Keys defined on oid
2307 int omap_get_values(
2308 const coll_t
& cid
, ///< [in] Collection containing oid
2309 const ghobject_t
&oid
, ///< [in] Object containing omap
2310 const set
<string
> &keys
, ///< [in] Keys to get
2311 map
<string
, bufferlist
> *out
///< [out] Returned keys and values
2313 int omap_get_values(
2314 CollectionHandle
&c
, ///< [in] Collection containing oid
2315 const ghobject_t
&oid
, ///< [in] Object containing omap
2316 const set
<string
> &keys
, ///< [in] Keys to get
2317 map
<string
, bufferlist
> *out
///< [out] Returned keys and values
2320 /// Filters keys into out which are defined on oid
2321 int omap_check_keys(
2322 const coll_t
& cid
, ///< [in] Collection containing oid
2323 const ghobject_t
&oid
, ///< [in] Object containing omap
2324 const set
<string
> &keys
, ///< [in] Keys to check
2325 set
<string
> *out
///< [out] Subset of keys defined on oid
2327 int omap_check_keys(
2328 CollectionHandle
&c
, ///< [in] Collection containing oid
2329 const ghobject_t
&oid
, ///< [in] Object containing omap
2330 const set
<string
> &keys
, ///< [in] Keys to check
2331 set
<string
> *out
///< [out] Subset of keys defined on oid
2334 ObjectMap::ObjectMapIterator
get_omap_iterator(
2335 const coll_t
& cid
, ///< [in] collection
2336 const ghobject_t
&oid
///< [in] object
2338 ObjectMap::ObjectMapIterator
get_omap_iterator(
2339 CollectionHandle
&c
, ///< [in] collection
2340 const ghobject_t
&oid
///< [in] object
2343 void set_fsid(uuid_d u
) override
{
2346 uuid_d
get_fsid() override
{
2350 uint64_t estimate_objects_overhead(uint64_t num_objects
) override
{
2351 return num_objects
* 300; //assuming per-object overhead is 300 bytes
2354 struct BSPerfTracker
{
2355 PerfCounters::avg_tracker
<uint64_t> os_commit_latency
;
2356 PerfCounters::avg_tracker
<uint64_t> os_apply_latency
;
2358 objectstore_perf_stat_t
get_cur_stats() const {
2359 objectstore_perf_stat_t ret
;
2360 ret
.os_commit_latency
= os_commit_latency
.avg();
2361 ret
.os_apply_latency
= os_apply_latency
.avg();
2365 void update_from_perfcounters(PerfCounters
&logger
);
2368 objectstore_perf_stat_t
get_cur_stats() override
{
2369 perf_tracker
.update_from_perfcounters(*logger
);
2370 return perf_tracker
.get_cur_stats();
2372 const PerfCounters
* get_perf_counters() const override
{
2376 int queue_transactions(
2378 vector
<Transaction
>& tls
,
2379 TrackedOpRef op
= TrackedOpRef(),
2380 ThreadPool::TPHandle
*handle
= NULL
) override
;
2383 void inject_data_error(const ghobject_t
& o
) override
{
2384 RWLock::WLocker
l(debug_read_error_lock
);
2385 debug_data_error_objects
.insert(o
);
2387 void inject_mdata_error(const ghobject_t
& o
) override
{
2388 RWLock::WLocker
l(debug_read_error_lock
);
2389 debug_mdata_error_objects
.insert(o
);
2391 void compact() override
{
2397 bool _debug_data_eio(const ghobject_t
& o
) {
2398 if (!cct
->_conf
->bluestore_debug_inject_read_err
) {
2401 RWLock::RLocker
l(debug_read_error_lock
);
2402 return debug_data_error_objects
.count(o
);
2404 bool _debug_mdata_eio(const ghobject_t
& o
) {
2405 if (!cct
->_conf
->bluestore_debug_inject_read_err
) {
2408 RWLock::RLocker
l(debug_read_error_lock
);
2409 return debug_mdata_error_objects
.count(o
);
2411 void _debug_obj_on_delete(const ghobject_t
& o
) {
2412 if (cct
->_conf
->bluestore_debug_inject_read_err
) {
2413 RWLock::WLocker
l(debug_read_error_lock
);
2414 debug_data_error_objects
.erase(o
);
2415 debug_mdata_error_objects
.erase(o
);
2421 // --------------------------------------------------------
2422 // read processing internal methods
2425 const bluestore_blob_t
* blob
,
2426 uint64_t blob_xoffset
,
2427 const bufferlist
& bl
,
2428 uint64_t logical_offset
) const;
2429 int _decompress(bufferlist
& source
, bufferlist
* result
);
2432 // --------------------------------------------------------
2435 struct WriteContext
{
2436 bool buffered
= false; ///< buffered write
2437 bool compress
= false; ///< compressed write
2438 uint64_t target_blob_size
= 0; ///< target (max) blob size
2439 unsigned csum_order
= 0; ///< target checksum chunk order
2441 old_extent_map_t old_extents
; ///< must deref these blobs
2444 uint64_t logical_offset
; ///< write logical offset
2446 uint64_t blob_length
;
2449 uint64_t b_off0
; ///< original offset in a blob prior to padding
2450 uint64_t length0
; ///< original data length prior to padding
2453 bool new_blob
; ///< whether new blob was created
2456 uint64_t logical_offs
,
2466 logical_offset(logical_offs
),
2468 blob_length(blob_len
),
2473 mark_unused(_mark_unused
),
2474 new_blob(_new_blob
) {}
2476 vector
<write_item
> writes
; ///< blobs we're writing
2478 /// partial clone of the context
2479 void fork(const WriteContext
& other
) {
2480 buffered
= other
.buffered
;
2481 compress
= other
.compress
;
2482 target_blob_size
= other
.target_blob_size
;
2483 csum_order
= other
.csum_order
;
2495 writes
.emplace_back(loffs
,
2505 /// Checks for writes to the same pextent within a blob
2510 uint64_t min_alloc_size
);
2513 void _do_write_small(
2517 uint64_t offset
, uint64_t length
,
2518 bufferlist::iterator
& blp
,
2519 WriteContext
*wctx
);
2524 uint64_t offset
, uint64_t length
,
2525 bufferlist::iterator
& blp
,
2526 WriteContext
*wctx
);
2527 int _do_alloc_write(
2531 WriteContext
*wctx
);
2537 set
<SharedBlob
*> *maybe_unshared_blobs
=0);
2539 int _do_transaction(Transaction
*t
,
2541 ThreadPool::TPHandle
*handle
);
2543 int _write(TransContext
*txc
,
2546 uint64_t offset
, size_t len
,
2548 uint32_t fadvise_flags
);
2549 void _pad_zeros(bufferlist
*bl
, uint64_t *offset
,
2550 uint64_t chunk_size
);
2552 void _choose_write_options(CollectionRef
& c
,
2554 uint32_t fadvise_flags
,
2555 WriteContext
*wctx
);
2557 int _do_gc(TransContext
*txc
,
2560 const GarbageCollector
& gc
,
2561 const WriteContext
& wctx
,
2562 uint64_t *dirty_start
,
2563 uint64_t *dirty_end
);
2565 int _do_write(TransContext
*txc
,
2568 uint64_t offset
, uint64_t length
,
2570 uint32_t fadvise_flags
);
2571 void _do_write_data(TransContext
*txc
,
2577 WriteContext
*wctx
);
2579 int _touch(TransContext
*txc
,
2582 int _do_zero(TransContext
*txc
,
2585 uint64_t offset
, size_t len
);
2586 int _zero(TransContext
*txc
,
2589 uint64_t offset
, size_t len
);
2590 void _do_truncate(TransContext
*txc
,
2594 set
<SharedBlob
*> *maybe_unshared_blobs
=0);
2595 void _truncate(TransContext
*txc
,
2599 int _remove(TransContext
*txc
,
2602 int _do_remove(TransContext
*txc
,
2605 int _setattr(TransContext
*txc
,
2610 int _setattrs(TransContext
*txc
,
2613 const map
<string
,bufferptr
>& aset
);
2614 int _rmattr(TransContext
*txc
,
2617 const string
& name
);
2618 int _rmattrs(TransContext
*txc
,
2621 void _do_omap_clear(TransContext
*txc
, uint64_t id
);
2622 int _omap_clear(TransContext
*txc
,
2625 int _omap_setkeys(TransContext
*txc
,
2629 int _omap_setheader(TransContext
*txc
,
2632 bufferlist
& header
);
2633 int _omap_rmkeys(TransContext
*txc
,
2637 int _omap_rmkey_range(TransContext
*txc
,
2640 const string
& first
, const string
& last
);
2641 int _set_alloc_hint(
2645 uint64_t expected_object_size
,
2646 uint64_t expected_write_size
,
2648 int _do_clone_range(TransContext
*txc
,
2652 uint64_t srcoff
, uint64_t length
, uint64_t dstoff
);
2653 int _clone(TransContext
*txc
,
2657 int _clone_range(TransContext
*txc
,
2661 uint64_t srcoff
, uint64_t length
, uint64_t dstoff
);
2662 int _rename(TransContext
*txc
,
2666 const ghobject_t
& new_oid
);
2667 int _create_collection(TransContext
*txc
, const coll_t
&cid
,
2668 unsigned bits
, CollectionRef
*c
);
2669 int _remove_collection(TransContext
*txc
, const coll_t
&cid
,
2671 int _split_collection(TransContext
*txc
,
2674 unsigned bits
, int rem
);
2677 inline ostream
& operator<<(ostream
& out
, const BlueStore::OpSequencer
& s
) {
2678 return out
<< *s
.parent
;
2681 static inline void intrusive_ptr_add_ref(BlueStore::Onode
*o
) {
2684 static inline void intrusive_ptr_release(BlueStore::Onode
*o
) {
2688 static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer
*o
) {
2691 static inline void intrusive_ptr_release(BlueStore::OpSequencer
*o
) {