// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_OSD_BLUESTORE_H
#define CEPH_OSD_BLUESTORE_H
#include <condition_variable>

#include <boost/intrusive/list.hpp>
#include <boost/intrusive/unordered_set.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/functional/hash.hpp>
#include <boost/dynamic_bitset.hpp>

#include "include/assert.h"
#include "include/unordered_map.h"
#include "include/memory.h"
#include "include/mempool.h"
#include "common/Finisher.h"
#include "common/perf_counters.h"
#include "common/PriorityCache.h"
#include "compressor/Compressor.h"
#include "os/ObjectStore.h"

#include "bluestore_types.h"
#include "BlockDevice.h"
#include "common/EventTrace.h"
class FreelistManager;

//#define DEBUG_DEFERRED

// constants for Buffer::optimize()
#define MAX_BUFFER_SLOP_RATIO_DEN 8  // so actually 1/N
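// Illustration (not from the original source): with
// MAX_BUFFER_SLOP_RATIO_DEN == 8, Buffer::maybe_rebuild() rebuilds a cached
// buffer whose front segment wastes more than 1/8 of the cached data, e.g. a
// 64 KiB buffer is rebuilt once more than 8 KiB of its allocation is wasted.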
enum {
  l_bluestore_first = 732430,
  l_bluestore_kv_flush_lat,
  l_bluestore_kv_commit_lat,
  l_bluestore_state_prepare_lat,
  l_bluestore_state_aio_wait_lat,
  l_bluestore_state_io_done_lat,
  l_bluestore_state_kv_queued_lat,
  l_bluestore_state_kv_committing_lat,
  l_bluestore_state_kv_done_lat,
  l_bluestore_state_deferred_queued_lat,
  l_bluestore_state_deferred_aio_wait_lat,
  l_bluestore_state_deferred_cleanup_lat,
  l_bluestore_state_finishing_lat,
  l_bluestore_state_done_lat,
  l_bluestore_throttle_lat,
  l_bluestore_submit_lat,
  l_bluestore_commit_lat,
  l_bluestore_read_onode_meta_lat,
  l_bluestore_read_wait_aio_lat,
  l_bluestore_compress_lat,
  l_bluestore_decompress_lat,
  l_bluestore_compress_success_count,
  l_bluestore_compress_rejected_count,
  l_bluestore_write_pad_bytes,
  l_bluestore_deferred_write_ops,
  l_bluestore_deferred_write_bytes,
  l_bluestore_write_penalty_read_ops,
  l_bluestore_allocated,
  l_bluestore_compressed,
  l_bluestore_compressed_allocated,
  l_bluestore_compressed_original,
  l_bluestore_onode_hits,
  l_bluestore_onode_misses,
  l_bluestore_onode_shard_hits,
  l_bluestore_onode_shard_misses,
  l_bluestore_buffer_bytes,
  l_bluestore_buffer_hit_bytes,
  l_bluestore_buffer_miss_bytes,
  l_bluestore_write_big,
  l_bluestore_write_big_bytes,
  l_bluestore_write_big_blobs,
  l_bluestore_write_small,
  l_bluestore_write_small_bytes,
  l_bluestore_write_small_unused,
  l_bluestore_write_small_deferred,
  l_bluestore_write_small_pre_read,
  l_bluestore_write_small_new,
  l_bluestore_onode_reshard,
  l_bluestore_blob_split,
  l_bluestore_extent_compress,
  l_bluestore_gc_merged,
  l_bluestore_read_eio,
  l_bluestore_last
};
class BlueStore : public ObjectStore,
                  public md_config_obs_t {
  // -----------------------------------------------------
  // types
public:
  // config observer
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const struct md_config_t *conf,
                          const std::set<std::string> &changed) override;

  void _set_compression();
  void _set_throttle_params();
  int _set_cache_sizes();
  typedef map<uint64_t, bufferlist> ready_regions_t;

  struct BufferSpace;
  struct Collection;
  typedef boost::intrusive_ptr<Collection> CollectionRef;

  struct AioContext {
    virtual void aio_finish(BlueStore *store) = 0;
    virtual ~AioContext() {}
  };
  /// cached buffer
  struct Buffer {
    MEMPOOL_CLASS_HELPERS();

    enum {
      STATE_EMPTY,    ///< empty buffer -- used for cache history
      STATE_CLEAN,    ///< clean data that is up to date
      STATE_WRITING,  ///< data that is being written (io not yet complete)
    };
    static const char *get_state_name(int s) {
      switch (s) {
      case STATE_EMPTY: return "empty";
      case STATE_CLEAN: return "clean";
      case STATE_WRITING: return "writing";
      default: return "???";
      }
    }
    enum {
      FLAG_NOCACHE = 1,  ///< trim when done WRITING (do not become CLEAN)
      // NOTE: fix operator<< when you define a second flag
    };
    static const char *get_flag_name(int s) {
      switch (s) {
      case FLAG_NOCACHE: return "nocache";
      default: return "???";
      }
    }

    BufferSpace *space;
    uint16_t state;              ///< STATE_*
    uint16_t cache_private = 0;  ///< opaque (to us) value used by Cache impl
    uint32_t flags;              ///< FLAG_*
    uint64_t seq;
    uint32_t offset, length;
    bufferlist data;

    boost::intrusive::list_member_hook<> lru_item;
    boost::intrusive::list_member_hook<> state_item;

    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o),
        length(b.length()), data(b) {}

    bool is_empty() const {
      return state == STATE_EMPTY;
    }
    bool is_clean() const {
      return state == STATE_CLEAN;
    }
    bool is_writing() const {
      return state == STATE_WRITING;
    }

    uint32_t end() const {
      return offset + length;
    }

    void truncate(uint32_t newlen) {
      assert(newlen < length);
      if (data.length()) {
        bufferlist t;
        t.substr_of(data, 0, newlen);
        data.claim(t);
      }
      length = newlen;
    }
    void maybe_rebuild() {
      if (data.length() &&
          (data.get_num_buffers() > 1 ||
           data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
        data.rebuild();
      }
    }

    void dump(Formatter *f) const {
      f->dump_string("state", get_state_name(state));
      f->dump_unsigned("seq", seq);
      f->dump_unsigned("offset", offset);
      f->dump_unsigned("length", length);
      f->dump_unsigned("data_length", data.length());
    }
  };
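  // Lifecycle note (informal, not from the original header): BufferSpace::write()
  // creates Buffers in STATE_WRITING while a transaction's io is in flight;
  // BufferSpace::finish_write() later moves them to STATE_CLEAN, or trims them
  // immediately if FLAG_NOCACHE was set.  STATE_EMPTY buffers carry no data and
  // exist only so a cache implementation can remember history.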
  /// map logical extent range (object) onto buffers
  struct BufferSpace {
    enum {
      BYPASS_CLEAN_CACHE = 0x1,  // bypass clean cache
    };

    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
        Buffer,
        boost::intrusive::list_member_hook<>,
        &Buffer::state_item> > state_list_t;

    mempool::bluestore_cache_other::map<uint32_t, std::unique_ptr<Buffer>>
      buffer_map;

    // we use a bare intrusive list here instead of std::map because
    // it uses less memory and we expect this to be very small (very
    // few IOs in flight to the same Blob at the same time).
    state_list_t writing;  ///< writing buffers, sorted by seq, ascending

    ~BufferSpace() {
      assert(buffer_map.empty());
      assert(writing.empty());
    }

    void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
      cache->_audit("_add_buffer start");
      buffer_map[b->offset].reset(b);
      if (b->is_writing()) {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
        if (writing.empty() || writing.rbegin()->seq <= b->seq) {
          writing.push_back(*b);
        } else {
          auto it = writing.begin();
          while (it->seq < b->seq) {
            ++it;
          }

          assert(it->seq >= b->seq);
          // note that this will insert b before it
          // hence the order is maintained
          writing.insert(it, *b);
        }
      } else {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
        cache->_add_buffer(b, level, near);
      }
      cache->_audit("_add_buffer end");
    }
    void _rm_buffer(Cache* cache, Buffer *b) {
      _rm_buffer(cache, buffer_map.find(b->offset));
    }
    void _rm_buffer(Cache* cache,
                    map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
      assert(p != buffer_map.end());
      cache->_audit("_rm_buffer start");
      if (p->second->is_writing()) {
        writing.erase(writing.iterator_to(*p->second));
      } else {
        cache->_rm_buffer(p->second.get());
      }
      buffer_map.erase(p);
      cache->_audit("_rm_buffer end");
    }

    map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
      uint32_t offset) {
      auto i = buffer_map.lower_bound(offset);
      if (i != buffer_map.begin()) {
        --i;
        if (i->first + i->second->length <= offset)
          ++i;
      }
      return i;
    }

    // must be called under protection of the Cache lock
    void _clear(Cache* cache);

    // return value is the highest cache_private of a trimmed buffer, or 0.
    int discard(Cache* cache, uint32_t offset, uint32_t length) {
      std::lock_guard<std::recursive_mutex> l(cache->lock);
      return _discard(cache, offset, length);
    }
    int _discard(Cache* cache, uint32_t offset, uint32_t length);

    void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
               unsigned flags) {
      std::lock_guard<std::recursive_mutex> l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
                             flags);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
    }
    void finish_write(Cache* cache, uint64_t seq);
    void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
      std::lock_guard<std::recursive_mutex> l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, 1, nullptr);
    }

    void read(Cache* cache, uint32_t offset, uint32_t length,
              BlueStore::ready_regions_t& res,
              interval_set<uint32_t>& res_intervals,
              int flags = 0);

    void truncate(Cache* cache, uint32_t offset) {
      discard(cache, offset, (uint32_t)-1 - offset);
    }

    void split(Cache* cache, size_t pos, BufferSpace &r);

    void dump(Cache* cache, Formatter *f) const {
      std::lock_guard<std::recursive_mutex> l(cache->lock);
      f->open_array_section("buffers");
      for (auto& i : buffer_map) {
        f->open_object_section("buffer");
        assert(i.first == i.second->offset);
        i.second->dump(f);
        f->close_section();
      }
      f->close_section();
    }
  };
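  // Usage sketch (illustrative, not from the original header; the real call
  // sites live in BlueStore.cc):
  //
  //   bc.write(cache, txc->seq, offset, bl, 0);   // buffer enters STATE_WRITING
  //   ...blocks hit disk and the kv transaction commits...
  //   bc.finish_write(cache, txc->seq);           // WRITING -> CLEAN (or trimmed)
  //
  //   bc.did_read(cache, offset, bl);             // cache data returned by a read
  //   bc.read(cache, offset, length, res, res_intervals);  // query cached regions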
  struct SharedBlobSet;

  /// in-memory shared blob state (incl cached buffers)
  struct SharedBlob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};  ///< reference count
    bool loaded = false;

    Collection *coll = nullptr;
    union {
      uint64_t sbid_unloaded;               ///< sbid if persistent isn't loaded
      bluestore_shared_blob_t *persistent;  ///< persistent part of the shared blob if any
    };
    BufferSpace bc;  ///< buffer cache

    SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
      if (get_cache()) {
        get_cache()->add_blob();
      }
    }
    SharedBlob(uint64_t i, Collection *_coll);
    ~SharedBlob();

    uint64_t get_sbid() const {
      return loaded ? persistent->sbid : sbid_unloaded;
    }

    void get() {
      ++nref;
    }
    void put();

    friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
    friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }

    friend ostream& operator<<(ostream& out, const SharedBlob& sb);

    /// get logical references
    void get_ref(uint64_t offset, uint32_t length);

    /// put logical references, and get back any released extents
    void put_ref(uint64_t offset, uint32_t length,
                 PExtentVector *r, set<SharedBlob*> *maybe_unshared_blobs);

    friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
      return l.get_sbid() == r.get_sbid();
    }
    inline Cache* get_cache() {
      return coll ? coll->cache : nullptr;
    }
    inline SharedBlobSet* get_parent() {
      return coll ? &(coll->shared_blob_set) : nullptr;
    }
    inline bool is_loaded() const {
      return loaded;
    }
  };
  typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
  /// a lookup table of SharedBlobs
  struct SharedBlobSet {
    std::mutex lock;  ///< protect lookup, insertion, removal

    // we use a bare pointer because we don't want to affect the ref
    // count.
    mempool::bluestore_cache_other::unordered_map<uint64_t,SharedBlob*> sb_map;

    SharedBlobRef lookup(uint64_t sbid) {
      std::lock_guard<std::mutex> l(lock);
      auto p = sb_map.find(sbid);
      if (p == sb_map.end() ||
          p->second->nref == 0) {
        return nullptr;
      }
      return p->second;
    }

    void add(Collection* coll, SharedBlob *sb) {
      std::lock_guard<std::mutex> l(lock);
      sb_map[sb->get_sbid()] = sb;
      sb->coll = coll;
    }

    bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
      std::lock_guard<std::mutex> l(lock);
      assert(sb->get_parent() == this);
      if (verify_nref_is_zero && sb->nref != 0) {
        return false;
      }
      // only remove if it still points to us
      auto p = sb_map.find(sb->get_sbid());
      if (p != sb_map.end() &&
          p->second == sb) {
        sb_map.erase(p);
      }
      return true;
    }

    bool empty() {
      std::lock_guard<std::mutex> l(lock);
      return sb_map.empty();
    }

    void dump(CephContext *cct, int lvl);
  };

//#define CACHE_BLOB_BL  // not sure if this is a win yet or not... :/
  /// in-memory blob metadata and associated cached buffers (if any)
  struct Blob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};    ///< reference count
    int16_t id = -1;               ///< id, for spanning blobs only, >= 0
    int16_t last_encoded_id = -1;  ///< (ephemeral) used during encoding only
    SharedBlobRef shared_blob;     ///< shared blob state (if any)

  private:
    mutable bluestore_blob_t blob;  ///< decoded blob metadata
#ifdef CACHE_BLOB_BL
    mutable bufferlist blob_bl;     ///< cached encoded blob, blob is dirty if empty
#endif
    /// refs from this shard.  ephemeral if id<0, persisted if spanning.
    bluestore_blob_use_tracker_t used_in_blob;

  public:
    friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
    friend void intrusive_ptr_release(Blob *b) { b->put(); }

    friend ostream& operator<<(ostream& out, const Blob &b);

    const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
      return used_in_blob;
    }
    bool is_referenced() const {
      return used_in_blob.is_not_empty();
    }
    uint32_t get_referenced_bytes() const {
      return used_in_blob.get_referenced_bytes();
    }

    bool is_spanning() const {
      return id >= 0;
    }

    bool can_split() const {
      std::lock_guard<std::recursive_mutex> l(shared_blob->get_cache()->lock);
      // splitting a BufferSpace writing list is too hard; don't try.
      return shared_blob->bc.writing.empty() &&
             used_in_blob.can_split() &&
             get_blob().can_split();
    }

    bool can_split_at(uint32_t blob_offset) const {
      return used_in_blob.can_split_at(blob_offset) &&
             get_blob().can_split_at(blob_offset);
    }

    bool can_reuse_blob(uint32_t min_alloc_size,
                        uint32_t target_blob_size,
                        uint32_t b_offset,
                        uint32_t *length0);

    void dup(Blob& o) {
      o.shared_blob = shared_blob;
      o.blob = blob;
#ifdef CACHE_BLOB_BL
      o.blob_bl = blob_bl;
#endif
    }

    inline const bluestore_blob_t& get_blob() const {
      return blob;
    }
    inline bluestore_blob_t& dirty_blob() {
#ifdef CACHE_BLOB_BL
      blob_bl.clear();
#endif
      return blob;
    }

    /// discard buffers for unallocated regions
    void discard_unallocated(Collection *coll);

    /// get logical references
    void get_ref(Collection *coll, uint32_t offset, uint32_t length);
    /// put logical references, and get back any released extents
    bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
                 PExtentVector *r);

    /// split the blob
    void split(Collection *coll, uint32_t blob_offset, Blob *o);

    void get() {
      ++nref;
    }
    void put() {
      if (--nref == 0)
        delete this;
    }

#ifdef CACHE_BLOB_BL
    void _encode() const {
      if (blob_bl.length() == 0) {
        ::encode(blob, blob_bl);
      } else {
        assert(blob_bl.length());
      }
    }
    void bound_encode(
      size_t& p,
      bool include_ref_map) const {
      _encode();
      p += blob_bl.length();
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      bufferlist::contiguous_appender& p,
      bool include_ref_map) const {
      _encode();
      p.append(blob_bl);
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection */*coll*/,
      bufferptr::iterator& p,
      bool include_ref_map) {
      const char *start = p.get_pos();
      denc(blob, p);
      const char *end = p.get_pos();
      blob_bl.clear();
      blob_bl.append(start, end - start);
      if (include_ref_map) {
        used_in_blob.decode(p);
      }
    }
#else
    void bound_encode(
      size_t& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      bufferlist::contiguous_appender& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection *coll,
      bufferptr::iterator& p,
      uint64_t struct_v,
      uint64_t* sbid,
      bool include_ref_map);
#endif
  };
  typedef boost::intrusive_ptr<Blob> BlobRef;
  typedef mempool::bluestore_cache_other::map<int,BlobRef> blob_map_t;
  /// a logical extent, pointing to (some portion of) a blob
  typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> >
    ExtentBase;  // making an alias to avoid build warnings
  struct Extent : public ExtentBase {
    MEMPOOL_CLASS_HELPERS();

    uint32_t logical_offset = 0;  ///< logical offset
    uint32_t blob_offset = 0;     ///< blob offset
    uint32_t length = 0;          ///< length
    BlobRef blob;                 ///< the blob with our data

    /// ctor for lookup only
    explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
    /// ctor for delayed initialization (see decode_some())
    explicit Extent() : ExtentBase() {
    }
    /// ctor for general usage
    Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : ExtentBase(),
        logical_offset(lo), blob_offset(o), length(l) {
      assign_blob(b);
    }
    ~Extent() {
      if (blob) {
        blob->shared_blob->get_cache()->rm_extent();
      }
    }

    void assign_blob(const BlobRef& b) {
      assert(!blob);
      blob = b;
      blob->shared_blob->get_cache()->add_extent();
    }

    // comparators for intrusive_set
    friend bool operator<(const Extent &a, const Extent &b) {
      return a.logical_offset < b.logical_offset;
    }
    friend bool operator>(const Extent &a, const Extent &b) {
      return a.logical_offset > b.logical_offset;
    }
    friend bool operator==(const Extent &a, const Extent &b) {
      return a.logical_offset == b.logical_offset;
    }

    uint32_t blob_start() const {
      return logical_offset - blob_offset;
    }

    uint32_t blob_end() const {
      return blob_start() + blob->get_blob().get_logical_length();
    }

    uint32_t logical_end() const {
      return logical_offset + length;
    }

    // return true if any piece of the blob is out of
    // the given range [o, o + l].
    bool blob_escapes_range(uint32_t o, uint32_t l) const {
      return blob_start() < o || blob_end() > o + l;
    }
  };
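  // Worked example (illustrative, not from the original source): an extent
  // with logical_offset=0x2000 and blob_offset=0x1000 has blob_start()=0x1000;
  // if the blob's logical length is 0x4000, blob_end()=0x5000, so
  // blob_escapes_range(0x2000, 0x2000) is true because 0x1000 < 0x2000 (and
  // 0x5000 > 0x4000).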
  typedef boost::intrusive::set<Extent> extent_map_t;

  friend ostream& operator<<(ostream& out, const Extent& e);

  struct OldExtent {
    boost::intrusive::list_member_hook<> old_extent_item;
    Extent e;
    PExtentVector r;
    bool blob_empty;  // flag to track the last removed extent that makes blob
                      // empty - required to update compression stat properly
    OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : e(lo, o, l, b), blob_empty(false) {
    }
    static OldExtent* create(CollectionRef c,
                             uint32_t lo,
                             uint32_t o,
                             uint32_t l,
                             BlobRef& b);
  };
  typedef boost::intrusive::list<
    OldExtent,
    boost::intrusive::member_hook<
      OldExtent,
      boost::intrusive::list_member_hook<>,
      &OldExtent::old_extent_item> > old_extent_map_t;
  struct Onode;

  /// a sharded extent map, mapping offsets to lextents to blobs
  struct ExtentMap {
    Onode *onode;
    extent_map_t extent_map;       ///< map of Extents to Blobs
    blob_map_t spanning_blob_map;  ///< blobs that span shards

    struct Shard {
      bluestore_onode_t::shard_info *shard_info = nullptr;
      unsigned extents = 0;  ///< count extents in this shard
      bool loaded = false;   ///< true if shard is loaded
      bool dirty = false;    ///< true if shard is dirty and needs reencoding
    };
    mempool::bluestore_cache_other::vector<Shard> shards;  ///< shards

    bufferlist inline_bl;  ///< cached encoded map, if unsharded; empty=>dirty

    uint32_t needs_reshard_begin = 0;
    uint32_t needs_reshard_end = 0;

    bool needs_reshard() const {
      return needs_reshard_end > needs_reshard_begin;
    }
    void clear_needs_reshard() {
      needs_reshard_begin = needs_reshard_end = 0;
    }
    void request_reshard(uint32_t begin, uint32_t end) {
      if (begin < needs_reshard_begin) {
        needs_reshard_begin = begin;
      }
      if (end > needs_reshard_end) {
        needs_reshard_end = end;
      }
    }

    struct DeleteDisposer {
      void operator()(Extent *e) { delete e; }
    };

    ExtentMap(Onode *o);
    ~ExtentMap() {
      extent_map.clear_and_dispose(DeleteDisposer());
    }

    void clear() {
      extent_map.clear_and_dispose(DeleteDisposer());
      shards.clear();
      inline_bl.clear();
      clear_needs_reshard();
    }

    bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
                     unsigned *pn);
    unsigned decode_some(bufferlist& bl);

    void bound_encode_spanning_blobs(size_t& p);
    void encode_spanning_blobs(bufferlist::contiguous_appender& p);
    void decode_spanning_blobs(bufferptr::iterator& p);

    BlobRef get_spanning_blob(int id) {
      auto p = spanning_blob_map.find(id);
      assert(p != spanning_blob_map.end());
      return p->second;
    }

    void update(KeyValueDB::Transaction t, bool force);
    decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
    void reshard(
      KeyValueDB *db,
      KeyValueDB::Transaction t);

    /// initialize Shards from the onode
    void init_shards(bool loaded, bool dirty);

    /// return index of shard containing offset
    /// or -1 if not found
    int seek_shard(uint32_t offset) {
      size_t end = shards.size();
      size_t mid, left = 0;
      size_t right = end;  // one past the right end

      while (left < right) {
        mid = left + (right - left) / 2;
        if (offset >= shards[mid].shard_info->offset) {
          size_t next = mid + 1;
          if (next >= end || offset < shards[next].shard_info->offset)
            return mid;  // right shard
          // continue to search forwards
          left = next;
        } else {
          // continue to search backwards
          right = mid;
        }
      }

      return -1;  // not found
    }
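    // Worked example (illustrative, not from the original source): with three
    // shards whose shard_info->offset values are 0x0, 0x10000 and 0x20000,
    // seek_shard(0x18000) returns 1 and seek_shard(0x20000) returns 2; -1 is
    // returned only when there are no shards or offset precedes the first
    // shard's offset.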
    /// check if a range spans a shard
    bool spans_shard(uint32_t offset, uint32_t length) {
      if (shards.empty()) {
        return false;
      }
      int s = seek_shard(offset);
      assert(s >= 0);
      if (s == (int)shards.size() - 1) {
        return false; // last shard
      }
      if (offset + length <= shards[s+1].shard_info->offset) {
        return false;
      }
      return true;
    }

    /// ensure that a range of the map is loaded
    void fault_range(KeyValueDB *db,
                     uint32_t offset, uint32_t length);

    /// ensure a range of the map is marked dirty
    void dirty_range(uint32_t offset, uint32_t length);

    /// for seek_lextent test
    extent_map_t::iterator find(uint64_t offset);

    /// seek to the first lextent including or after offset
    extent_map_t::iterator seek_lextent(uint64_t offset);
    extent_map_t::const_iterator seek_lextent(uint64_t offset) const;

    /// add a new Extent
    void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
      extent_map.insert(*new Extent(lo, o, l, b));
    }

    /// remove (and delete) an Extent
    void rm(extent_map_t::iterator p) {
      extent_map.erase_and_dispose(p, DeleteDisposer());
    }

    bool has_any_lextents(uint64_t offset, uint64_t length);

    /// consolidate adjacent lextents in extent_map
    int compress_extent_map(uint64_t offset, uint64_t length);

    /// punch a logical hole.  add lextents to deref to target list.
    void punch_hole(CollectionRef &c,
                    uint64_t offset, uint64_t length,
                    old_extent_map_t *old_extents);

    /// put new lextent into lextent_map overwriting existing ones if
    /// any and update references accordingly
    Extent *set_lextent(CollectionRef &c,
                        uint64_t logical_offset,
                        uint64_t offset, uint64_t length,
                        BlobRef b,
                        old_extent_map_t *old_extents);

    /// split a blob (and referring extents)
    BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
  };
  /// Compressed Blob Garbage collector
  /*
  The primary idea of the collector is to estimate the difference between the
  allocation units (AUs) currently occupied by compressed blobs and the new
  AUs required to store that data uncompressed.
  The estimation is performed for protrusive extents within a logical range
  determined by the concatenation of the old_extents collection and the
  specific (current) write request.
  The root cause for using old_extents is the need to handle blob ref counts
  properly. Old extents still hold blob refs, hence we need to traverse the
  collection to determine whether a blob is to be released.
  Protrusive extents are extents that fit into the blob set in action
  (ones that are below the logical range from above) but are not removed
  totally due to the current write.
  E.g. for
  extent1 <loffs = 100, boffs = 100, len = 100> ->
    blob1<compressed, len_on_disk=4096, logical_len=8192>
  extent2 <loffs = 200, boffs = 200, len = 100> ->
    blob2<raw, len_on_disk=4096, llen=4096>
  extent3 <loffs = 300, boffs = 300, len = 100> ->
    blob1<compressed, len_on_disk=4096, llen=8192>
  extent4 <loffs = 4096, boffs = 0, len = 100> ->
    blob3<raw, len_on_disk=4096, llen=4096>
  for a write covering 300~100, the protrusive extents are within the ranges
  <0~300, 400~8192-400>.
  In this case the existing AUs that might be removed due to GC are the ones
  used by blob1, and new AUs expected after GC = 0 since extent1 is to be
  merged into blob2. Hence we should collect.
  */
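  /*
  Sketch of the resulting bookkeeping (illustrative, not from the original
  comment): for the example above, expected_for_release counts the AUs
  currently pinned by the affected compressed blob(s), expected_allocations
  counts the AUs a rewritten, uncompressed copy would need (0 here), and GC is
  worthwhile whenever expected_for_release - expected_allocations > 0.
  */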
  class GarbageCollector
  {
  public:
    /// return amount of allocation units that might be saved due to GC
    int64_t estimate(
      uint64_t offset,
      uint64_t length,
      const ExtentMap& extent_map,
      const old_extent_map_t& old_extents,
      uint64_t min_alloc_size);

    /// return a collection of extents to perform GC on
    const vector<AllocExtent>& get_extents_to_collect() const {
      return extents_to_collect;
    }
    GarbageCollector(CephContext* _cct) : cct(_cct) {}

  private:
    struct BlobInfo {
      uint64_t referenced_bytes = 0;     ///< amount of bytes referenced in blob
      int64_t expected_allocations = 0;  ///< new alloc units required
                                         ///< in case of gc fulfilled
      bool collect_candidate = false;    ///< indicate if blob has any extents
                                         ///< eligible for GC.
      extent_map_t::const_iterator first_lextent;  ///< points to the first
                                                   ///< lextent referring to
                                                   ///< the blob if any.
                                                   ///< collect_candidate flag
                                                   ///< determines the validity
      extent_map_t::const_iterator last_lextent;   ///< points to the last
                                                   ///< lextent referring to
                                                   ///< the blob if any.

      BlobInfo(uint64_t ref_bytes) :
        referenced_bytes(ref_bytes) {
      }
    };

    CephContext* cct;
    map<Blob*, BlobInfo> affected_blobs;  ///< compressed blobs and their ref_map
                                          ///< copies that are affected by the
                                          ///< specific transaction

    vector<AllocExtent> extents_to_collect;  ///< protrusive extents that should
                                             ///< be collected if GC takes place

    boost::optional<uint64_t> used_alloc_unit;  ///< last processed allocation
                                                ///< unit when traversing
                                                ///< protrusive extents.
                                                ///< Other extents mapped to
                                                ///< this AU to be ignored
                                                ///< (except the case where
                                                ///< uncompressed extent follows
                                                ///< compressed one - see below).
    BlobInfo* blob_info_counted = nullptr;  ///< set if previous allocation unit
                                            ///< caused expected_allocations
                                            ///< counter increment at this blob.
                                            ///< if uncompressed extent follows,
                                            ///< a decrement for the
                                            ///< expected_allocations counter
                                            ///< is performed at this blob.

    int64_t expected_allocations = 0;  ///< new alloc units required in case
                                       ///< of gc fulfilled
    int64_t expected_for_release = 0;  ///< alloc units currently used by
                                       ///< compressed blobs that might
                                       ///< be gone after GC
    uint64_t gc_start_offset;  ///< starting offset for GC
    uint64_t gc_end_offset;    ///< ending offset for GC

  protected:
    void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
                                    uint64_t start_offset,
                                    uint64_t end_offset,
                                    uint64_t start_touch_offset,
                                    uint64_t end_touch_offset,
                                    uint64_t min_alloc_size);
  };
1009 /// an in-memory object
1011 MEMPOOL_CLASS_HELPERS();
1013 std::atomic_int nref
; ///< reference count
1018 /// key under PREFIX_OBJ where we are stored
1019 mempool::bluestore_cache_other::string key
;
1021 boost::intrusive::list_member_hook
<> lru_item
;
1023 bluestore_onode_t onode
; ///< metadata stored as value in kv store
1024 bool exists
; ///< true if object logically exists
1026 ExtentMap extent_map
;
1028 // track txc's that have not been committed to kv store (and whose
1029 // effects cannot be read via the kvdb read methods)
1030 std::atomic
<int> flushing_count
= {0};
1031 std::mutex flush_lock
; ///< protect flush_txns
1032 std::condition_variable flush_cond
; ///< wait here for uncommitted txns
1034 Onode(Collection
*c
, const ghobject_t
& o
,
1035 const mempool::bluestore_cache_other::string
& k
)
1053 typedef boost::intrusive_ptr
<Onode
> OnodeRef
;
1056 /// a cache (shard) of onodes and buffers
1059 PerfCounters
*logger
;
1060 std::recursive_mutex lock
; ///< protect lru and other structures
1062 std::atomic
<uint64_t> num_extents
= {0};
1063 std::atomic
<uint64_t> num_blobs
= {0};
1065 static Cache
*create(CephContext
* cct
, string type
, PerfCounters
*logger
);
1067 Cache(CephContext
* cct
) : cct(cct
), logger(nullptr) {}
1070 virtual void _add_onode(OnodeRef
& o
, int level
) = 0;
1071 virtual void _rm_onode(OnodeRef
& o
) = 0;
1072 virtual void _touch_onode(OnodeRef
& o
) = 0;
1074 virtual void _add_buffer(Buffer
*b
, int level
, Buffer
*near
) = 0;
1075 virtual void _rm_buffer(Buffer
*b
) = 0;
1076 virtual void _move_buffer(Cache
*src
, Buffer
*b
) = 0;
1077 virtual void _adjust_buffer_size(Buffer
*b
, int64_t delta
) = 0;
1078 virtual void _touch_buffer(Buffer
*b
) = 0;
1080 virtual uint64_t _get_num_onodes() = 0;
1081 virtual uint64_t _get_buffer_bytes() = 0;
1097 void trim(uint64_t onode_max
, uint64_t buffer_max
);
1101 virtual void _trim(uint64_t onode_max
, uint64_t buffer_max
) = 0;
1103 virtual void add_stats(uint64_t *onodes
, uint64_t *extents
,
1106 uint64_t *bytes
) = 0;
1109 std::lock_guard
<std::recursive_mutex
> l(lock
);
1110 return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
1114 virtual void _audit(const char *s
) = 0;
1116 void _audit(const char *s
) { /* no-op */ }
1120 /// simple LRU cache for onodes and buffers
1121 struct LRUCache
: public Cache
{
1123 typedef boost::intrusive::list
<
1125 boost::intrusive::member_hook
<
1127 boost::intrusive::list_member_hook
<>,
1128 &Onode::lru_item
> > onode_lru_list_t
;
1129 typedef boost::intrusive::list
<
1131 boost::intrusive::member_hook
<
1133 boost::intrusive::list_member_hook
<>,
1134 &Buffer::lru_item
> > buffer_lru_list_t
;
1136 onode_lru_list_t onode_lru
;
1138 buffer_lru_list_t buffer_lru
;
1139 uint64_t buffer_size
= 0;
1142 LRUCache(CephContext
* cct
) : Cache(cct
) {}
1143 uint64_t _get_num_onodes() override
{
1144 return onode_lru
.size();
1146 void _add_onode(OnodeRef
& o
, int level
) override
{
1148 onode_lru
.push_front(*o
);
1150 onode_lru
.push_back(*o
);
1152 void _rm_onode(OnodeRef
& o
) override
{
1153 auto q
= onode_lru
.iterator_to(*o
);
1156 void _touch_onode(OnodeRef
& o
) override
;
1158 uint64_t _get_buffer_bytes() override
{
1161 void _add_buffer(Buffer
*b
, int level
, Buffer
*near
) override
{
1163 auto q
= buffer_lru
.iterator_to(*near
);
1164 buffer_lru
.insert(q
, *b
);
1165 } else if (level
> 0) {
1166 buffer_lru
.push_front(*b
);
1168 buffer_lru
.push_back(*b
);
1170 buffer_size
+= b
->length
;
1172 void _rm_buffer(Buffer
*b
) override
{
1173 assert(buffer_size
>= b
->length
);
1174 buffer_size
-= b
->length
;
1175 auto q
= buffer_lru
.iterator_to(*b
);
1176 buffer_lru
.erase(q
);
1178 void _move_buffer(Cache
*src
, Buffer
*b
) override
{
1180 _add_buffer(b
, 0, nullptr);
1182 void _adjust_buffer_size(Buffer
*b
, int64_t delta
) override
{
1183 assert((int64_t)buffer_size
+ delta
>= 0);
1184 buffer_size
+= delta
;
1186 void _touch_buffer(Buffer
*b
) override
{
1187 auto p
= buffer_lru
.iterator_to(*b
);
1188 buffer_lru
.erase(p
);
1189 buffer_lru
.push_front(*b
);
1190 _audit("_touch_buffer end");
1193 void _trim(uint64_t onode_max
, uint64_t buffer_max
) override
;
1195 void add_stats(uint64_t *onodes
, uint64_t *extents
,
1198 uint64_t *bytes
) override
{
1199 std::lock_guard
<std::recursive_mutex
> l(lock
);
1200 *onodes
+= onode_lru
.size();
1201 *extents
+= num_extents
;
1202 *blobs
+= num_blobs
;
1203 *buffers
+= buffer_lru
.size();
1204 *bytes
+= buffer_size
;
1208 void _audit(const char *s
) override
;
1212 // 2Q cache for buffers, LRU for onodes
1213 struct TwoQCache
: public Cache
{
1215 // stick with LRU for onodes for now (fixme?)
1216 typedef boost::intrusive::list
<
1218 boost::intrusive::member_hook
<
1220 boost::intrusive::list_member_hook
<>,
1221 &Onode::lru_item
> > onode_lru_list_t
;
1222 typedef boost::intrusive::list
<
1224 boost::intrusive::member_hook
<
1226 boost::intrusive::list_member_hook
<>,
1227 &Buffer::lru_item
> > buffer_list_t
;
1229 onode_lru_list_t onode_lru
;
1231 buffer_list_t buffer_hot
; ///< "Am" hot buffers
1232 buffer_list_t buffer_warm_in
; ///< "A1in" newly warm buffers
1233 buffer_list_t buffer_warm_out
; ///< "A1out" empty buffers we've evicted
1234 uint64_t buffer_bytes
= 0; ///< bytes
1238 BUFFER_WARM_IN
, ///< in buffer_warm_in
1239 BUFFER_WARM_OUT
, ///< in buffer_warm_out
1240 BUFFER_HOT
, ///< in buffer_hot
1244 uint64_t buffer_list_bytes
[BUFFER_TYPE_MAX
] = {0}; ///< bytes per type
1247 TwoQCache(CephContext
* cct
) : Cache(cct
) {}
1248 uint64_t _get_num_onodes() override
{
1249 return onode_lru
.size();
1251 void _add_onode(OnodeRef
& o
, int level
) override
{
1253 onode_lru
.push_front(*o
);
1255 onode_lru
.push_back(*o
);
1257 void _rm_onode(OnodeRef
& o
) override
{
1258 auto q
= onode_lru
.iterator_to(*o
);
1261 void _touch_onode(OnodeRef
& o
) override
;
1263 uint64_t _get_buffer_bytes() override
{
1264 return buffer_bytes
;
1266 void _add_buffer(Buffer
*b
, int level
, Buffer
*near
) override
;
1267 void _rm_buffer(Buffer
*b
) override
;
1268 void _move_buffer(Cache
*src
, Buffer
*b
) override
;
1269 void _adjust_buffer_size(Buffer
*b
, int64_t delta
) override
;
1270 void _touch_buffer(Buffer
*b
) override
{
1271 switch (b
->cache_private
) {
1272 case BUFFER_WARM_IN
:
1273 // do nothing (somewhat counter-intuitively!)
1275 case BUFFER_WARM_OUT
:
1276 // move from warm_out to hot LRU
1277 assert(0 == "this happens via discard hint");
1280 // move to front of hot LRU
1281 buffer_hot
.erase(buffer_hot
.iterator_to(*b
));
1282 buffer_hot
.push_front(*b
);
1285 _audit("_touch_buffer end");
1288 void _trim(uint64_t onode_max
, uint64_t buffer_max
) override
;
1290 void add_stats(uint64_t *onodes
, uint64_t *extents
,
1293 uint64_t *bytes
) override
{
1294 std::lock_guard
<std::recursive_mutex
> l(lock
);
1295 *onodes
+= onode_lru
.size();
1296 *extents
+= num_extents
;
1297 *blobs
+= num_blobs
;
1298 *buffers
+= buffer_hot
.size() + buffer_warm_in
.size();
1299 *bytes
+= buffer_bytes
;
1303 void _audit(const char *s
) override
;
1312 mempool::bluestore_cache_other::unordered_map
<ghobject_t
,OnodeRef
> onode_map
;
1314 friend class Collection
; // for split_cache()
1317 OnodeSpace(Cache
*c
) : cache(c
) {}
1322 OnodeRef
add(const ghobject_t
& oid
, OnodeRef o
);
1323 OnodeRef
lookup(const ghobject_t
& o
);
1324 void remove(const ghobject_t
& oid
) {
1325 onode_map
.erase(oid
);
1327 void rename(OnodeRef
& o
, const ghobject_t
& old_oid
,
1328 const ghobject_t
& new_oid
,
1329 const mempool::bluestore_cache_other::string
& new_okey
);
1333 void dump(CephContext
*cct
, int lvl
);
1335 /// return true if f true for any item
1336 bool map_any(std::function
<bool(OnodeRef
)> f
);
1339 struct Collection
: public CollectionImpl
{
1341 Cache
*cache
; ///< our cache shard
1343 bluestore_cnode_t cnode
;
1348 SharedBlobSet shared_blob_set
; ///< open SharedBlobs
1350 // cache onodes on a per-collection basis to avoid lock
1352 OnodeSpace onode_map
;
1355 pool_opts_t pool_opts
;
1357 OnodeRef
get_onode(const ghobject_t
& oid
, bool create
);
    // the terminology is confusing here, sorry!
    //
    //  blob_t     shared_blob_t
    //  !shared    unused                -> open
    //  shared     !loaded               -> open + shared
    //  shared     loaded                -> open + shared + loaded
    //
    // i.e.,
    //  open = SharedBlob is instantiated
    //  shared = blob_t shared flag is set; SharedBlob is hashed.
    //  loaded = SharedBlob::shared_blob_t is loaded from kv store
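    //
    // Rough lifecycle sketch (illustrative, not part of the original comment):
    // new_blob() creates an open, unshared SharedBlob; make_blob_shared()
    // sets the blob_t shared flag and hashes the SharedBlob into
    // shared_blob_set; open_shared_blob() re-opens a shared blob by sbid on
    // read; load_shared_blob() later pulls the persistent shared_blob_t from
    // the kv store, making it loaded.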
1370 void open_shared_blob(uint64_t sbid
, BlobRef b
);
1371 void load_shared_blob(SharedBlobRef sb
);
1372 void make_blob_shared(uint64_t sbid
, BlobRef b
);
1373 uint64_t make_blob_unshared(SharedBlob
*sb
);
1375 BlobRef
new_blob() {
1376 BlobRef b
= new Blob();
1377 b
->shared_blob
= new SharedBlob(this);
1381 const coll_t
&get_cid() override
{
1385 bool contains(const ghobject_t
& oid
) {
1387 return oid
.hobj
.pool
== -1;
1389 if (cid
.is_pg(&spgid
))
1391 spgid
.pgid
.contains(cnode
.bits
, oid
) &&
1392 oid
.shard_id
== spgid
.shard
;
1396 void split_cache(Collection
*dest
);
1398 Collection(BlueStore
*ns
, Cache
*ca
, coll_t c
);
1401 class OmapIteratorImpl
: public ObjectMap::ObjectMapIteratorImpl
{
1404 KeyValueDB::Iterator it
;
1407 OmapIteratorImpl(CollectionRef c
, OnodeRef o
, KeyValueDB::Iterator it
);
1408 int seek_to_first() override
;
1409 int upper_bound(const string
&after
) override
;
1410 int lower_bound(const string
&to
) override
;
1411 bool valid() override
;
1412 int next(bool validate
=true) override
;
1413 string
key() override
;
1414 bufferlist
value() override
;
1415 int status() override
{
1421 typedef boost::intrusive_ptr
<OpSequencer
> OpSequencerRef
;
1423 struct volatile_statfs
{
1425 STATFS_ALLOCATED
= 0,
1427 STATFS_COMPRESSED_ORIGINAL
,
1429 STATFS_COMPRESSED_ALLOCATED
,
1432 int64_t values
[STATFS_LAST
];
1434 memset(this, 0, sizeof(volatile_statfs
));
1437 *this = volatile_statfs();
1439 volatile_statfs
& operator+=(const volatile_statfs
& other
) {
1440 for (size_t i
= 0; i
< STATFS_LAST
; ++i
) {
1441 values
[i
] += other
.values
[i
];
1445 int64_t& allocated() {
1446 return values
[STATFS_ALLOCATED
];
1449 return values
[STATFS_STORED
];
1451 int64_t& compressed_original() {
1452 return values
[STATFS_COMPRESSED_ORIGINAL
];
1454 int64_t& compressed() {
1455 return values
[STATFS_COMPRESSED
];
1457 int64_t& compressed_allocated() {
1458 return values
[STATFS_COMPRESSED_ALLOCATED
];
1461 return values
[STATFS_ALLOCATED
] == 0 &&
1462 values
[STATFS_STORED
] == 0 &&
1463 values
[STATFS_COMPRESSED
] == 0 &&
1464 values
[STATFS_COMPRESSED_ORIGINAL
] == 0 &&
1465 values
[STATFS_COMPRESSED_ALLOCATED
] == 0;
1467 void decode(bufferlist::iterator
& it
) {
1468 for (size_t i
= 0; i
< STATFS_LAST
; i
++) {
1469 ::decode(values
[i
], it
);
1473 void encode(bufferlist
& bl
) {
1474 for (size_t i
= 0; i
< STATFS_LAST
; i
++) {
1475 ::encode(values
[i
], bl
);
  struct TransContext : public AioContext {
    MEMPOOL_CLASS_HELPERS();

    typedef enum {
      STATE_PREPARE,
      STATE_AIO_WAIT,
      STATE_IO_DONE,
      STATE_KV_QUEUED,     // queued for kv_sync_thread submission
      STATE_KV_SUBMITTED,  // submitted to kv; not yet synced
      STATE_KV_DONE,
      STATE_DEFERRED_QUEUED,   // in deferred_queue (pending or running)
      STATE_DEFERRED_CLEANUP,  // remove deferred kv record
      STATE_DEFERRED_DONE,
      STATE_FINISHING,
      STATE_DONE,
    } state_t;

    state_t state = STATE_PREPARE;

    const char *get_state_name() {
      switch (state) {
      case STATE_PREPARE: return "prepare";
      case STATE_AIO_WAIT: return "aio_wait";
      case STATE_IO_DONE: return "io_done";
      case STATE_KV_QUEUED: return "kv_queued";
      case STATE_KV_SUBMITTED: return "kv_submitted";
      case STATE_KV_DONE: return "kv_done";
      case STATE_DEFERRED_QUEUED: return "deferred_queued";
      case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
      case STATE_DEFERRED_DONE: return "deferred_done";
      case STATE_FINISHING: return "finishing";
      case STATE_DONE: return "done";
      }
      return "???";
    }
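    // Informal summary (not from the original header): a txc normally walks
    // prepare -> aio_wait -> io_done -> kv_queued -> kv_submitted -> kv_done,
    // takes the deferred_* detour only when it carries deferred writes, and
    // ends with finishing -> done; the l_bluestore_state_*_lat counters above
    // record the time spent in each of these states.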
1516 #if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1517 const char *get_state_latency_name(int state
) {
1519 case l_bluestore_state_prepare_lat
: return "prepare";
1520 case l_bluestore_state_aio_wait_lat
: return "aio_wait";
1521 case l_bluestore_state_io_done_lat
: return "io_done";
1522 case l_bluestore_state_kv_queued_lat
: return "kv_queued";
1523 case l_bluestore_state_kv_committing_lat
: return "kv_committing";
1524 case l_bluestore_state_kv_done_lat
: return "kv_done";
1525 case l_bluestore_state_deferred_queued_lat
: return "deferred_queued";
1526 case l_bluestore_state_deferred_cleanup_lat
: return "deferred_cleanup";
1527 case l_bluestore_state_finishing_lat
: return "finishing";
1528 case l_bluestore_state_done_lat
: return "done";
1534 void log_state_latency(PerfCounters
*logger
, int state
) {
1535 utime_t lat
, now
= ceph_clock_now();
1536 lat
= now
- last_stamp
;
1537 logger
->tinc(state
, lat
);
1538 #if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1539 if (state
>= l_bluestore_state_prepare_lat
&& state
<= l_bluestore_state_done_lat
) {
1540 double usecs
= (now
.to_nsec()-last_stamp
.to_nsec())/1000;
1541 OID_ELAPSED("", usecs
, get_state_latency_name(state
));
1548 boost::intrusive::list_member_hook
<> sequencer_item
;
1550 uint64_t bytes
= 0, cost
= 0;
1552 set
<OnodeRef
> onodes
; ///< these need to be updated/written
1553 set
<OnodeRef
> modified_objects
; ///< objects we modified (and need a ref)
1554 set
<SharedBlobRef
> shared_blobs
; ///< these need to be updated/written
1555 set
<SharedBlobRef
> shared_blobs_written
; ///< update these on io completion
1557 KeyValueDB::Transaction t
; ///< then we will commit this
1558 Context
*oncommit
= nullptr; ///< signal on commit
1559 Context
*onreadable
= nullptr; ///< signal on readable
1560 Context
*onreadable_sync
= nullptr; ///< signal on readable
1561 list
<Context
*> oncommits
; ///< more commit completions
1562 list
<CollectionRef
> removed_collections
; ///< colls we removed
1564 boost::intrusive::list_member_hook
<> deferred_queue_item
;
1565 bluestore_deferred_transaction_t
*deferred_txn
= nullptr; ///< if any
1567 interval_set
<uint64_t> allocated
, released
;
1568 volatile_statfs statfs_delta
;
1571 bool had_ios
= false; ///< true if we submitted IOs before our kv txn
1577 uint64_t last_nid
= 0; ///< if non-zero, highest new nid we allocated
1578 uint64_t last_blobid
= 0; ///< if non-zero, highest new blobid we allocated
1580 explicit TransContext(CephContext
* cct
, OpSequencer
*o
)
1583 start(ceph_clock_now()) {
1587 delete deferred_txn
;
1590 void write_onode(OnodeRef
&o
) {
1593 void write_shared_blob(SharedBlobRef
&sb
) {
1594 shared_blobs
.insert(sb
);
1596 void unshare_blob(SharedBlob
*sb
) {
1597 shared_blobs
.erase(sb
);
1600 /// note we logically modified object (when onode itself is unmodified)
1601 void note_modified_object(OnodeRef
&o
) {
1602 // onode itself isn't written, though
1603 modified_objects
.insert(o
);
1605 void removed(OnodeRef
& o
) {
1607 modified_objects
.erase(o
);
1610 void aio_finish(BlueStore
*store
) override
{
1611 store
->txc_aio_finish(this);
1615 typedef boost::intrusive::list
<
1617 boost::intrusive::member_hook
<
1619 boost::intrusive::list_member_hook
<>,
1620 &TransContext::deferred_queue_item
> > deferred_queue_t
;
1622 struct DeferredBatch
: public AioContext
{
1624 struct deferred_io
{
1625 bufferlist bl
; ///< data
1626 uint64_t seq
; ///< deferred transaction seq
1628 map
<uint64_t,deferred_io
> iomap
; ///< map of ios in this batch
1629 deferred_queue_t txcs
; ///< txcs in this batch
1630 IOContext ioc
; ///< our aios
1631 /// bytes of pending io for each deferred seq (may be 0)
1632 map
<uint64_t,int> seq_bytes
;
1634 void _discard(CephContext
*cct
, uint64_t offset
, uint64_t length
);
1635 void _audit(CephContext
*cct
);
1637 DeferredBatch(CephContext
*cct
, OpSequencer
*osr
)
1638 : osr(osr
), ioc(cct
, this) {}
1641 void prepare_write(CephContext
*cct
,
1642 uint64_t seq
, uint64_t offset
, uint64_t length
,
1643 bufferlist::const_iterator
& p
);
1645 void aio_finish(BlueStore
*store
) override
{
1646 store
->_deferred_aio_finish(osr
);
1650 class OpSequencer
: public Sequencer_impl
{
1653 std::condition_variable qcond
;
1654 typedef boost::intrusive::list
<
1656 boost::intrusive::member_hook
<
1658 boost::intrusive::list_member_hook
<>,
1659 &TransContext::sequencer_item
> > q_list_t
;
1660 q_list_t q
; ///< transactions
1662 boost::intrusive::list_member_hook
<> deferred_osr_queue_item
;
1664 DeferredBatch
*deferred_running
= nullptr;
1665 DeferredBatch
*deferred_pending
= nullptr;
1670 uint64_t last_seq
= 0;
1672 std::atomic_int txc_with_unstable_io
= {0}; ///< num txcs with unstable io
1674 std::atomic_int kv_committing_serially
= {0};
1676 std::atomic_int kv_submitted_waiters
= {0};
1678 std::atomic_bool registered
= {true}; ///< registered in BlueStore's osr_set
1679 std::atomic_bool zombie
= {false}; ///< owning Sequencer has gone away
1681 OpSequencer(CephContext
* cct
, BlueStore
*store
)
1682 : Sequencer_impl(cct
),
1683 parent(NULL
), store(store
) {
1684 store
->register_osr(this);
1686 ~OpSequencer() override
{
1691 void discard() override
{
1692 // Note that we may have txc's in flight when the parent Sequencer
1693 // goes away. Reflect this with zombie==registered==true and let
1694 // _osr_drain_all clean up later.
1700 std::lock_guard
<std::mutex
> l(qlock
);
1708 void _unregister() {
1710 store
->unregister_osr(this);
1715 void queue_new(TransContext
*txc
) {
1716 std::lock_guard
<std::mutex
> l(qlock
);
1717 txc
->seq
= ++last_seq
;
1722 std::unique_lock
<std::mutex
> l(qlock
);
1727 void drain_preceding(TransContext
*txc
) {
1728 std::unique_lock
<std::mutex
> l(qlock
);
1729 while (!q
.empty() && &q
.front() != txc
)
1733 bool _is_all_kv_submitted() {
1734 // caller must hold qlock
1738 TransContext
*txc
= &q
.back();
1739 if (txc
->state
>= TransContext::STATE_KV_SUBMITTED
) {
1745 void flush() override
{
1746 std::unique_lock
<std::mutex
> l(qlock
);
1748 // set flag before the check because the condition
1749 // may become true outside qlock, and we need to make
1750 // sure those threads see waiters and signal qcond.
1751 ++kv_submitted_waiters
;
1752 if (_is_all_kv_submitted()) {
1756 --kv_submitted_waiters
;
1760 bool flush_commit(Context
*c
) override
{
1761 std::lock_guard
<std::mutex
> l(qlock
);
1765 TransContext
*txc
= &q
.back();
1766 if (txc
->state
>= TransContext::STATE_KV_DONE
) {
1769 txc
->oncommits
.push_back(c
);
1774 typedef boost::intrusive::list
<
1776 boost::intrusive::member_hook
<
1778 boost::intrusive::list_member_hook
<>,
1779 &OpSequencer::deferred_osr_queue_item
> > deferred_osr_queue_t
;
1781 struct KVSyncThread
: public Thread
{
1783 explicit KVSyncThread(BlueStore
*s
) : store(s
) {}
1784 void *entry() override
{
1785 store
->_kv_sync_thread();
1789 struct KVFinalizeThread
: public Thread
{
1791 explicit KVFinalizeThread(BlueStore
*s
) : store(s
) {}
1793 store
->_kv_finalize_thread();
1798 struct DBHistogram
{
1807 map
<int, struct value_dist
> val_map
; ///< slab id to count, max length of value and key
1810 map
<string
, map
<int, struct key_dist
> > key_hist
;
1811 map
<int, uint64_t> value_hist
;
1812 int get_key_slab(size_t sz
);
1813 string
get_key_slab_to_range(int slab
);
1814 int get_value_slab(size_t sz
);
1815 string
get_value_slab_to_range(int slab
);
1816 void update_hist_entry(map
<string
, map
<int, struct key_dist
> > &key_hist
,
1817 const string
&prefix
, size_t key_size
, size_t value_size
);
1818 void dump(Formatter
*f
);
1821 // --------------------------------------------------------
1824 BlueFS
*bluefs
= nullptr;
1825 unsigned bluefs_shared_bdev
= 0; ///< which bluefs bdev we are sharing
1826 bool bluefs_single_shared_device
= true;
1827 utime_t bluefs_last_balance
;
1829 KeyValueDB
*db
= nullptr;
1830 BlockDevice
*bdev
= nullptr;
1831 std::string freelist_type
;
1832 FreelistManager
*fm
= nullptr;
1833 Allocator
*alloc
= nullptr;
1835 int path_fd
= -1; ///< open handle to $path
1836 int fsid_fd
= -1; ///< open handle (locked) to $path/fsid
1837 bool mounted
= false;
1839 RWLock coll_lock
= {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map
1840 mempool::bluestore_cache_other::unordered_map
<coll_t
, CollectionRef
> coll_map
;
1842 vector
<Cache
*> cache_shards
;
1844 std::mutex osr_lock
; ///< protect osd_set
1845 std::set
<OpSequencerRef
> osr_set
; ///< set of all OpSequencers
1847 std::atomic
<uint64_t> nid_last
= {0};
1848 std::atomic
<uint64_t> nid_max
= {0};
1849 std::atomic
<uint64_t> blobid_last
= {0};
1850 std::atomic
<uint64_t> blobid_max
= {0};
1852 Throttle throttle_bytes
; ///< submit to commit
1853 Throttle throttle_deferred_bytes
; ///< submit to deferred complete
1855 interval_set
<uint64_t> bluefs_extents
; ///< block extents owned by bluefs
1856 interval_set
<uint64_t> bluefs_extents_reclaiming
; ///< currently reclaiming
1858 std::mutex deferred_lock
;
1859 std::atomic
<uint64_t> deferred_seq
= {0};
1860 deferred_osr_queue_t deferred_queue
; ///< osr's with deferred io pending
1861 int deferred_queue_size
= 0; ///< num txc's queued across all osrs
1862 atomic_int deferred_aggressive
= {0}; ///< aggressive wakeup of kv thread
1863 Finisher deferred_finisher
;
1865 int m_finisher_num
= 1;
1866 vector
<Finisher
*> finishers
;
1868 KVSyncThread kv_sync_thread
;
1870 std::condition_variable kv_cond
;
1871 bool _kv_only
= false;
1872 bool kv_sync_started
= false;
1873 bool kv_stop
= false;
1874 bool kv_finalize_started
= false;
1875 bool kv_finalize_stop
= false;
1876 deque
<TransContext
*> kv_queue
; ///< ready, already submitted
1877 deque
<TransContext
*> kv_queue_unsubmitted
; ///< ready, need submit by kv thread
1878 deque
<TransContext
*> kv_committing
; ///< currently syncing
1879 deque
<DeferredBatch
*> deferred_done_queue
; ///< deferred ios done
1880 deque
<DeferredBatch
*> deferred_stable_queue
; ///< deferred ios done + stable
1882 KVFinalizeThread kv_finalize_thread
;
1883 std::mutex kv_finalize_lock
;
1884 std::condition_variable kv_finalize_cond
;
1885 deque
<TransContext
*> kv_committing_to_finalize
; ///< pending finalization
1886 deque
<DeferredBatch
*> deferred_stable_to_finalize
; ///< pending finalization
1888 PerfCounters
*logger
= nullptr;
1890 list
<CollectionRef
> removed_collections
;
1892 RWLock debug_read_error_lock
= {"BlueStore::debug_read_error_lock"};
1893 set
<ghobject_t
> debug_data_error_objects
;
1894 set
<ghobject_t
> debug_mdata_error_objects
;
1896 std::atomic
<int> csum_type
= {Checksummer::CSUM_CRC32C
};
1898 uint64_t block_size
= 0; ///< block size of block device (power of 2)
1899 uint64_t block_mask
= 0; ///< mask to get just the block offset
1900 size_t block_size_order
= 0; ///< bits to shift to get block size
1902 uint64_t min_alloc_size
= 0; ///< minimum allocation unit (power of 2)
1903 ///< bits for min_alloc_size
1904 uint8_t min_alloc_size_order
= 0;
1905 static_assert(std::numeric_limits
<uint8_t>::max() >
1906 std::numeric_limits
<decltype(min_alloc_size
)>::digits
,
1907 "not enough bits for min_alloc_size");
1909 ///< maximum allocation unit (power of 2)
1910 std::atomic
<uint64_t> max_alloc_size
= {0};
1912 ///< number threshold for forced deferred writes
1913 std::atomic
<int> deferred_batch_ops
= {0};
1915 ///< size threshold for forced deferred writes
1916 std::atomic
<uint64_t> prefer_deferred_size
= {0};
1918 ///< approx cost per io, in bytes
1919 std::atomic
<uint64_t> throttle_cost_per_io
= {0};
1921 std::atomic
<Compressor::CompressionMode
> comp_mode
=
1922 {Compressor::COMP_NONE
}; ///< compression mode
1923 CompressorRef compressor
;
1924 std::atomic
<uint64_t> comp_min_blob_size
= {0};
1925 std::atomic
<uint64_t> comp_max_blob_size
= {0};
1927 std::atomic
<uint64_t> max_blob_size
= {0}; ///< maximum blob size
1929 uint64_t kv_ios
= 0;
1930 uint64_t kv_throttle_costs
= 0;
1932 // cache trim control
1933 uint64_t cache_size
= 0; ///< total cache size
1934 double cache_meta_ratio
= 0; ///< cache ratio dedicated to metadata
1935 double cache_kv_ratio
= 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
1936 double cache_data_ratio
= 0; ///< cache ratio dedicated to object data
1937 bool cache_autotune
= false; ///< cache autotune setting
1938 uint64_t cache_autotune_chunk_size
= 0; ///< cache autotune chunk size
1939 double cache_autotune_interval
= 0; ///< time to wait between cache rebalancing
1940 uint64_t osd_memory_target
= 0; ///< OSD memory target when autotuning cache
1941 uint64_t osd_memory_base
= 0; ///< OSD base memory when autotuning cache
1942 double osd_memory_expected_fragmentation
= 0; ///< expected memory fragmentation
1943 uint64_t osd_memory_cache_min
= 0; ///< Min memory to assign when autotuning cahce
1944 double osd_memory_cache_resize_interval
= 0; ///< Time to wait between cache resizing
1945 std::mutex vstatfs_lock
;
1946 volatile_statfs vstatfs
;
1948 struct MempoolThread
: public Thread
{
1955 uint64_t autotune_cache_size
= 0;
1957 struct MempoolCache
: public PriorityCache::PriCache
{
1959 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1];
1960 double cache_ratio
= 0;
1962 MempoolCache(BlueStore
*s
) : store(s
) {};
1964 virtual uint64_t _get_used_bytes() const = 0;
1966 virtual int64_t request_cache_bytes(
1967 PriorityCache::Priority pri
, uint64_t chunk_bytes
) const {
1968 int64_t assigned
= get_cache_bytes(pri
);
1971 // All cache items are currently shoved into the LAST priority
1972 case PriorityCache::Priority::LAST
:
1974 uint64_t usage
= _get_used_bytes();
1975 int64_t request
= PriorityCache::get_chunk(usage
, chunk_bytes
);
1976 return(request
> assigned
) ? request
- assigned
: 0;
1984 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
1985 return cache_bytes
[pri
];
1987 virtual int64_t get_cache_bytes() const {
1990 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
1991 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
1992 total
+= get_cache_bytes(pri
);
1996 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
1997 cache_bytes
[pri
] = bytes
;
1999 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
2000 cache_bytes
[pri
] += bytes
;
2002 virtual int64_t commit_cache_size() {
2003 return get_cache_bytes();
2005 virtual double get_cache_ratio() const {
2008 virtual void set_cache_ratio(double ratio
) {
2009 cache_ratio
= ratio
;
2011 virtual string
get_cache_name() const = 0;
2014 struct MetaCache
: public MempoolCache
{
2015 MetaCache(BlueStore
*s
) : MempoolCache(s
) {};
2017 virtual uint64_t _get_used_bytes() const {
2018 return mempool::bluestore_cache_other::allocated_bytes() +
2019 mempool::bluestore_cache_onode::allocated_bytes();
2022 virtual string
get_cache_name() const {
2023 return "BlueStore Meta Cache";
2026 uint64_t _get_num_onodes() const {
2027 uint64_t onode_num
=
2028 mempool::bluestore_cache_onode::allocated_items();
2029 return (2 > onode_num
) ? 2 : onode_num
;
2032 double get_bytes_per_onode() const {
2033 return (double)_get_used_bytes() / (double)_get_num_onodes();
2037 struct DataCache
: public MempoolCache
{
2038 DataCache(BlueStore
*s
) : MempoolCache(s
) {};
2040 virtual uint64_t _get_used_bytes() const {
2042 for (auto i
: store
->cache_shards
) {
2043 bytes
+= i
->_get_buffer_bytes();
2047 virtual string
get_cache_name() const {
2048 return "BlueStore Data Cache";
2053 explicit MempoolThread(BlueStore
*s
)
2055 lock("BlueStore::MempoolThread::lock"),
2056 meta_cache(MetaCache(s
)),
2057 data_cache(DataCache(s
)) {}
2059 void *entry() override
;
2061 assert(stop
== false);
2062 create("bstore_mempool");
2073 void _adjust_cache_settings();
2074 void _trim_shards(bool interval_stats
);
2075 void _tune_cache_size(bool interval_stats
);
2076 void _balance_cache(const std::list
<PriorityCache::PriCache
*>& caches
);
2077 void _balance_cache_pri(int64_t *mem_avail
,
2078 const std::list
<PriorityCache::PriCache
*>& caches
,
2079 PriorityCache::Priority pri
);
2082 // --------------------------------------------------------
2085 void _init_logger();
2086 void _shutdown_logger();
2087 int _reload_logger();
2091 int _open_fsid(bool create
);
2093 int _read_fsid(uuid_d
*f
);
2096 void _set_alloc_sizes();
2097 void _set_blob_size();
2098 void _set_finisher_num();
2100 int _open_bdev(bool create
);
2102 int _open_db(bool create
);
2104 int _open_fm(bool create
);
2107 void _close_alloc();
2108 int _open_collections(int *errors
=0);
2109 void _close_collections();
2111 int _setup_block_symlink_or_file(string name
, string path
, uint64_t size
,
2115 static int _write_bdev_label(CephContext
* cct
,
2116 string path
, bluestore_bdev_label_t label
);
2117 static int _read_bdev_label(CephContext
* cct
, string path
,
2118 bluestore_bdev_label_t
*label
);
2120 int _check_or_set_bdev_label(string path
, uint64_t size
, string desc
,
2123 int _open_super_meta();
2125 void _open_statfs();
2127 int _reconcile_bluefs_freespace();
2128 int _balance_bluefs_freespace(PExtentVector
*extents
);
2129 void _commit_bluefs_freespace(const PExtentVector
& extents
);
2131 CollectionRef
_get_collection(const coll_t
& cid
);
2132 void _queue_reap_collection(CollectionRef
& c
);
2133 void _reap_collections();
2134 void _update_cache_logger();
2136 void _assign_nid(TransContext
*txc
, OnodeRef o
);
2137 uint64_t _assign_blobid(TransContext
*txc
);
2139 void _dump_onode(const OnodeRef
& o
, int log_level
=30);
2140 void _dump_extent_map(ExtentMap
& em
, int log_level
=30);
2141 void _dump_transaction(Transaction
*t
, int log_level
= 30);
2143 TransContext
*_txc_create(OpSequencer
*osr
);
2144 void _txc_update_store_statfs(TransContext
*txc
);
2145 void _txc_add_transaction(TransContext
*txc
, Transaction
*t
);
2146 void _txc_calc_cost(TransContext
*txc
);
2147 void _txc_write_nodes(TransContext
*txc
, KeyValueDB::Transaction t
);
2148 void _txc_state_proc(TransContext
*txc
);
2149 void _txc_aio_submit(TransContext
*txc
);
2151 void txc_aio_finish(void *p
) {
2152 _txc_state_proc(static_cast<TransContext
*>(p
));
2155 void _txc_finish_io(TransContext
*txc
);
2156 void _txc_finalize_kv(TransContext
*txc
, KeyValueDB::Transaction t
);
2157 void _txc_applied_kv(TransContext
*txc
);
2158 void _txc_committed_kv(TransContext
*txc
);
2159 void _txc_finish(TransContext
*txc
);
2160 void _txc_release_alloc(TransContext
*txc
);
2162 void _osr_drain_preceding(TransContext
*txc
);
2163 void _osr_drain_all();
2164 void _osr_unregister_all();
2168 void _kv_sync_thread();
2169 void _kv_finalize_thread();
2171 bluestore_deferred_op_t
*_get_deferred_op(TransContext
*txc
, OnodeRef o
);
2172 void _deferred_queue(TransContext
*txc
);
2174 void deferred_try_submit();
2176 void _deferred_submit_unlock(OpSequencer
*osr
);
2177 void _deferred_aio_finish(OpSequencer
*osr
);
2178 int _deferred_replay();
2181 using mempool_dynamic_bitset
=
2182 boost::dynamic_bitset
<uint64_t,
2183 mempool::bluestore_fsck::pool_allocator
<uint64_t>>;
  int _fsck_check_extents(
    const ghobject_t& oid,
    const PExtentVector& extents,
    mempool_dynamic_bitset &used_blocks,
    uint64_t granularity,
    store_statfs_t& expected_statfs);
  void _buffer_cache_write(
    TransContext *txc,
    BlobRef b,
    uint64_t offset,
    bufferlist& bl,
    unsigned flags) {
    b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
			     flags);
    txc->shared_blobs_written.insert(b->shared_blob);
  }
  int _collection_list(
    Collection *c, const ghobject_t& start, const ghobject_t& end,
    int max, vector<ghobject_t> *ls, ghobject_t *next);
  template <typename T, typename F>
  T select_option(const std::string& opt_name, T val1, F f) {
    //NB: opt_name reserved for future use
    boost::optional<T> val2 = f();
    if (val2) {
      return *val2;
    }
    return val1;
  }
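
  // Example use of select_option() (illustrative only; the option name and
  // lambda below are hypothetical, not taken from the actual callers):
  //
  //   auto order = select_option(
  //     "csum_order", default_order,
  //     [&]() -> boost::optional<unsigned> {
  //       if (per_pool_override_is_set)
  //         return per_pool_order;
  //       return boost::optional<unsigned>();
  //     });
  //
  // i.e. the functor supplies an override when one exists; otherwise the
  // caller-provided default (val1) is used.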
  void _apply_padding(uint64_t head_pad,
		      bufferlist& padded);

  // -- ondisk version ---
  const int32_t latest_ondisk_format = 2;        ///< our version
  const int32_t min_readable_ondisk_format = 1;  ///< what we can read
  const int32_t min_compat_ondisk_format = 2;    ///< who can read us

  int32_t ondisk_format = 0;  ///< value detected on mount

  int _upgrade_super();  ///< upgrade (called during open_super)
  void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
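
  // How these constants are meant to interact (an informal sketch, not the
  // literal code in the super/meta open path): on mount the on-disk value is
  // compared against what this binary supports, e.g.
  //
  //   if (ondisk_format > latest_ondisk_format)        // written by newer code
  //     fail("store is too new for us");
  //   if (ondisk_format < min_readable_ondisk_format)  // too old to parse
  //     fail("store is too old for us");
  //   if (ondisk_format < latest_ondisk_format)
  //     _upgrade_super();                               // rewrite in current format
  //
  // and min_compat_ondisk_format is what gets persisted so that older code
  // can refuse to mount a store it cannot read.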
  // --- public interface ---
  BlueStore(CephContext *cct, const string& path);
  BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
  ~BlueStore() override;
  string get_type() override {
    return "bluestore";
  }
  bool needs_journal() override { return false; };
  bool wants_journal() override { return false; };
  bool allows_journal() override { return false; };

  bool is_rotational() override;
  bool is_journal_rotational() override;

  string get_default_device_class() override {
    string device_class;
    map<string, string> metadata;
    collect_metadata(&metadata);
    auto it = metadata.find("bluestore_bdev_type");
    if (it != metadata.end()) {
      device_class = it->second;
    }
    return device_class;
  }
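
  // Note: "bluestore_bdev_type" is the device-type key published by
  // collect_metadata(); in practice it carries a value such as "hdd" or "ssd"
  // (the exact strings depend on the block device layer), so that is what
  // this method reports as the default device class.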
  static int get_block_device_fsid(CephContext* cct, const string& path,

  bool test_mount_in_use() override;

  int _mount(bool kv_only);

  int mount() override {
    return _mount(false);
  }
  int umount() override;

  int start_kv_only(KeyValueDB **pdb) {
    int r = _mount(true);

  int write_meta(const std::string& key, const std::string& value) override;
  int read_meta(const std::string& key, std::string *value) override;

  int fsck(bool deep) override {
    return _fsck(deep, false);
  }
  int repair(bool deep) override {
    return _fsck(deep, true);
  }
  int _fsck(bool deep, bool repair);

  void set_cache_shards(unsigned num) override;

  int validate_hobject_key(const hobject_t &obj) const override {

  unsigned get_max_attr_name_length() override {
    return 256; // arbitrary; there is no real limit internally
  }

  int mkfs() override;
  int mkjournal() override {

  void get_db_statistics(Formatter *f) override;
  void generate_db_histogram(Formatter *f) override;
  void _flush_cache();
  void flush_cache() override;
  void dump_perf_counters(Formatter *f) override {
    f->open_object_section("perf_counters");
    logger->dump_formatted(f, false);

  void register_osr(OpSequencer *osr) {
    std::lock_guard<std::mutex> l(osr_lock);
    osr_set.insert(osr);
  }
  void unregister_osr(OpSequencer *osr) {
    std::lock_guard<std::mutex> l(osr_lock);

  int statfs(struct store_statfs_t *buf) override;

  void collect_metadata(map<string,string> *pm) override;

  bool exists(const coll_t& cid, const ghobject_t& oid) override;
  bool exists(CollectionHandle &c, const ghobject_t& oid) override;
  int set_collection_opts(
    const pool_opts_t& opts) override;
    const ghobject_t& oid,
    bool allow_eio = false) override;
    CollectionHandle &c,
    const ghobject_t& oid,
    bool allow_eio = false) override;
    const ghobject_t& oid,
    uint32_t op_flags = 0) override;
    CollectionHandle &c,
    const ghobject_t& oid,
    uint32_t op_flags = 0) override;
    uint32_t op_flags = 0);

  int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
	      uint64_t offset, size_t len, interval_set<uint64_t>& destset);
  int fiemap(const coll_t& cid, const ghobject_t& oid,
	     uint64_t offset, size_t len, bufferlist& bl) override;
  int fiemap(CollectionHandle &c, const ghobject_t& oid,
	     uint64_t offset, size_t len, bufferlist& bl) override;
  int fiemap(const coll_t& cid, const ghobject_t& oid,
	     uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
  int fiemap(CollectionHandle &c, const ghobject_t& oid,
	     uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;

  int getattr(const coll_t& cid, const ghobject_t& oid, const char *name,
	      bufferptr& value) override;
  int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
	      bufferptr& value) override;

  int getattrs(const coll_t& cid, const ghobject_t& oid,
	       map<string,bufferptr>& aset) override;
  int getattrs(CollectionHandle &c, const ghobject_t& oid,
	       map<string,bufferptr>& aset) override;

  int list_collections(vector<coll_t>& ls) override;

  CollectionHandle open_collection(const coll_t &c) override;

  bool collection_exists(const coll_t& c) override;
  int collection_empty(const coll_t& c, bool *empty) override;
  int collection_bits(const coll_t& c) override;

  int collection_list(const coll_t& cid,
		      const ghobject_t& start,
		      const ghobject_t& end,
		      vector<ghobject_t> *ls, ghobject_t *next) override;
  int collection_list(CollectionHandle &c,
		      const ghobject_t& start,
		      const ghobject_t& end,
		      vector<ghobject_t> *ls, ghobject_t *next) override;
    const coll_t& cid,              ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    bufferlist *header,             ///< [out] omap header
    map<string, bufferlist> *out    ///< [out] Key to value map
    CollectionHandle &c,            ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    bufferlist *header,             ///< [out] omap header
    map<string, bufferlist> *out    ///< [out] Key to value map
  int omap_get_header(
    const coll_t& cid,              ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    bufferlist *header,             ///< [out] omap header
    bool allow_eio = false          ///< [in] don't assert on eio
  int omap_get_header(
    CollectionHandle &c,            ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    bufferlist *header,             ///< [out] omap header
    bool allow_eio = false          ///< [in] don't assert on eio

  /// Get keys defined on oid
    const coll_t& cid,              ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    set<string> *keys               ///< [out] Keys defined on oid
    CollectionHandle &c,            ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    set<string> *keys               ///< [out] Keys defined on oid

  int omap_get_values(
    const coll_t& cid,              ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    const set<string> &keys,        ///< [in] Keys to get
    map<string, bufferlist> *out    ///< [out] Returned keys and values
  int omap_get_values(
    CollectionHandle &c,            ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    const set<string> &keys,        ///< [in] Keys to get
    map<string, bufferlist> *out    ///< [out] Returned keys and values

  /// Filters keys into out which are defined on oid
  int omap_check_keys(
    const coll_t& cid,              ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    const set<string> &keys,        ///< [in] Keys to check
    set<string> *out                ///< [out] Subset of keys defined on oid
  int omap_check_keys(
    CollectionHandle &c,            ///< [in] Collection containing oid
    const ghobject_t &oid,          ///< [in] Object containing omap
    const set<string> &keys,        ///< [in] Keys to check
    set<string> *out                ///< [out] Subset of keys defined on oid

  ObjectMap::ObjectMapIterator get_omap_iterator(
    const coll_t& cid,              ///< [in] collection
    const ghobject_t &oid           ///< [in] object
  ObjectMap::ObjectMapIterator get_omap_iterator(
    CollectionHandle &c,            ///< [in] collection
    const ghobject_t &oid           ///< [in] object

  void set_fsid(uuid_d u) override {

  uuid_d get_fsid() override {

  uint64_t estimate_objects_overhead(uint64_t num_objects) override {
    return num_objects * 300; //assuming per-object overhead is 300 bytes
  }
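
  // For example, with the 300-byte per-object estimate above, a store holding
  // 1,000,000 objects is accounted roughly 300,000,000 bytes (~286 MiB) of
  // per-object overhead.  The figure is a heuristic, not a measured value.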
  struct BSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency = os_commit_latency.current_avg();
      ret.os_apply_latency = os_apply_latency.current_avg();
      return ret;
    }

    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;
  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }

  const PerfCounters* get_perf_counters() const override {

  int queue_transactions(
    vector<Transaction>& tls,
    TrackedOpRef op = TrackedOpRef(),
    ThreadPool::TPHandle *handle = NULL) override;

  void inject_data_error(const ghobject_t& o) override {
    RWLock::WLocker l(debug_read_error_lock);
    debug_data_error_objects.insert(o);
  }
  void inject_mdata_error(const ghobject_t& o) override {
    RWLock::WLocker l(debug_read_error_lock);
    debug_mdata_error_objects.insert(o);
  }

  void compact() override {

  bool has_builtin_csum() const override {

  bool _debug_data_eio(const ghobject_t& o) {
    if (!cct->_conf->bluestore_debug_inject_read_err) {
      return false;
    }
    RWLock::RLocker l(debug_read_error_lock);
    return debug_data_error_objects.count(o);
  }
  bool _debug_mdata_eio(const ghobject_t& o) {
    if (!cct->_conf->bluestore_debug_inject_read_err) {
      return false;
    }
    RWLock::RLocker l(debug_read_error_lock);
    return debug_mdata_error_objects.count(o);
  }
  void _debug_obj_on_delete(const ghobject_t& o) {
    if (cct->_conf->bluestore_debug_inject_read_err) {
      RWLock::WLocker l(debug_read_error_lock);
      debug_data_error_objects.erase(o);
      debug_mdata_error_objects.erase(o);
    }
  }
  // --------------------------------------------------------
  // read processing internal methods
    const bluestore_blob_t* blob,
    uint64_t blob_xoffset,
    const bufferlist& bl,
    uint64_t logical_offset) const;
  int _decompress(bufferlist& source, bufferlist* result);

  // --------------------------------------------------------
  struct WriteContext {
    bool buffered = false;          ///< buffered write
    bool compress = false;          ///< compressed write
    uint64_t target_blob_size = 0;  ///< target (max) blob size
    unsigned csum_order = 0;        ///< target checksum chunk order

    old_extent_map_t old_extents;   ///< must deref these blobs

      uint64_t logical_offset;      ///< write logical offset
      uint64_t blob_length;
      uint64_t b_off0;              ///< original offset in a blob prior to padding
      uint64_t length0;             ///< original data length prior to padding

      bool new_blob;                ///< whether new blob was created

      bool compressed = false;
      bufferlist compressed_bl;
      size_t compressed_len = 0;

      uint64_t logical_offs,
	logical_offset(logical_offs),
	blob_length(blob_len),
	mark_unused(_mark_unused),
	new_blob(_new_blob) {}

    vector<write_item> writes;      ///< blobs we're writing

    /// partial clone of the context
    void fork(const WriteContext& other) {
      buffered = other.buffered;
      compress = other.compress;
      target_blob_size = other.target_blob_size;
      csum_order = other.csum_order;
    }
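
    // fork() deliberately copies only the write-option fields (buffered,
    // compress, target_blob_size, csum_order); per-operation state such as
    // old_extents and the pending writes list is left behind, which is what
    // makes it a "partial clone".  (A descriptive reading of the code above,
    // not a documented contract.)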
      writes.emplace_back(loffs,

  /// Checks for writes to the same pextent within a blob
    uint64_t min_alloc_size);

  void _do_write_small(
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx);
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx);
  int _do_alloc_write(
    WriteContext *wctx);
    set<SharedBlob*> *maybe_unshared_blobs = 0);

  int _do_transaction(Transaction *t,
		      ThreadPool::TPHandle *handle);

  int _write(TransContext *txc,
	     uint64_t offset, size_t len,
	     uint32_t fadvise_flags);
  void _pad_zeros(bufferlist *bl, uint64_t *offset,
		  uint64_t chunk_size);
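
  // _pad_zeros() exists because small writes have to be expanded to whole
  // checksum/allocation chunks before they reach the device.  As an
  // illustration (numbers invented for the example, not taken from the
  // implementation): with chunk_size = 0x1000, a 0x64-byte write at offset
  // 0x1004 would be zero-padded out to the 0x1000..0x2000 range, with
  // *offset adjusted back to the chunk boundary so the caller submits an
  // aligned, chunk-sized buffer.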
  void _choose_write_options(CollectionRef& c,
			     uint32_t fadvise_flags,
			     WriteContext *wctx);

  int _do_gc(TransContext *txc,
	     const GarbageCollector& gc,
	     const WriteContext& wctx,
	     uint64_t *dirty_start,
	     uint64_t *dirty_end);

  int _do_write(TransContext *txc,
		uint64_t offset, uint64_t length,
		uint32_t fadvise_flags);
  void _do_write_data(TransContext *txc,
		      WriteContext *wctx);

  int _touch(TransContext *txc,
  int _do_zero(TransContext *txc,
	       uint64_t offset, size_t len);
  int _zero(TransContext *txc,
	    uint64_t offset, size_t len);
  void _do_truncate(TransContext *txc,
		    set<SharedBlob*> *maybe_unshared_blobs = 0);
  int _truncate(TransContext *txc,
  int _remove(TransContext *txc,
  int _do_remove(TransContext *txc,
  int _setattr(TransContext *txc,
  int _setattrs(TransContext *txc,
		const map<string,bufferptr>& aset);
  int _rmattr(TransContext *txc,
	      const string& name);
  int _rmattrs(TransContext *txc,
  void _do_omap_clear(TransContext *txc, uint64_t id);
  int _omap_clear(TransContext *txc,
  int _omap_setkeys(TransContext *txc,
  int _omap_setheader(TransContext *txc,
		      bufferlist& header);
  int _omap_rmkeys(TransContext *txc,
  int _omap_rmkey_range(TransContext *txc,
			const string& first, const string& last);
  int _set_alloc_hint(
    uint64_t expected_object_size,
    uint64_t expected_write_size,
  int _do_clone_range(TransContext *txc,
		      uint64_t srcoff, uint64_t length, uint64_t dstoff);
  int _clone(TransContext *txc,
  int _clone_range(TransContext *txc,
		   uint64_t srcoff, uint64_t length, uint64_t dstoff);
  int _rename(TransContext *txc,
	      const ghobject_t& new_oid);
  int _create_collection(TransContext *txc, const coll_t &cid,
			 unsigned bits, CollectionRef *c);
  int _remove_collection(TransContext *txc, const coll_t &cid,
  int _split_collection(TransContext *txc,
			unsigned bits, int rem);
};

inline ostream& operator<<(ostream& out, const BlueStore::OpSequencer& s) {
  return out << *s.parent;
}
static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
static inline void intrusive_ptr_release(BlueStore::Onode *o) {
static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {