// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */

#ifndef CEPH_OSD_BLUESTORE_H
#define CEPH_OSD_BLUESTORE_H

#include <condition_variable>

#include <boost/intrusive/list.hpp>
#include <boost/intrusive/unordered_set.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/functional/hash.hpp>
#include <boost/dynamic_bitset.hpp>

#include "include/assert.h"
#include "include/unordered_map.h"
#include "include/memory.h"
#include "include/mempool.h"
#include "common/Finisher.h"
#include "common/perf_counters.h"
#include "compressor/Compressor.h"
#include "os/ObjectStore.h"

#include "bluestore_types.h"
#include "BlockDevice.h"
#include "common/EventTrace.h"
class FreelistManager;

//#define DEBUG_DEFERRED
  l_bluestore_first = 732430,
  l_bluestore_kv_flush_lat,
  l_bluestore_kv_commit_lat,
  l_bluestore_state_prepare_lat,
  l_bluestore_state_aio_wait_lat,
  l_bluestore_state_io_done_lat,
  l_bluestore_state_kv_queued_lat,
  l_bluestore_state_kv_committing_lat,
  l_bluestore_state_kv_done_lat,
  l_bluestore_state_deferred_queued_lat,
  l_bluestore_state_deferred_aio_wait_lat,
  l_bluestore_state_deferred_cleanup_lat,
  l_bluestore_state_finishing_lat,
  l_bluestore_state_done_lat,
  l_bluestore_throttle_lat,
  l_bluestore_submit_lat,
  l_bluestore_commit_lat,
  l_bluestore_read_onode_meta_lat,
  l_bluestore_read_wait_aio_lat,
  l_bluestore_compress_lat,
  l_bluestore_decompress_lat,
  l_bluestore_compress_success_count,
  l_bluestore_compress_rejected_count,
  l_bluestore_write_pad_bytes,
  l_bluestore_deferred_write_ops,
  l_bluestore_deferred_write_bytes,
  l_bluestore_write_penalty_read_ops,
  l_bluestore_allocated,
  l_bluestore_compressed,
  l_bluestore_compressed_allocated,
  l_bluestore_compressed_original,
  l_bluestore_onode_hits,
  l_bluestore_onode_misses,
  l_bluestore_onode_shard_hits,
  l_bluestore_onode_shard_misses,
  l_bluestore_buffer_bytes,
  l_bluestore_buffer_hit_bytes,
  l_bluestore_buffer_miss_bytes,
  l_bluestore_write_big,
  l_bluestore_write_big_bytes,
  l_bluestore_write_big_blobs,
  l_bluestore_write_small,
  l_bluestore_write_small_bytes,
  l_bluestore_write_small_unused,
  l_bluestore_write_small_deferred,
  l_bluestore_write_small_pre_read,
  l_bluestore_write_small_new,
  l_bluestore_onode_reshard,
  l_bluestore_blob_split,
  l_bluestore_extent_compress,
  l_bluestore_gc_merged,
class BlueStore : public ObjectStore,
                  public md_config_obs_t {
  // -----------------------------------------------------
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const struct md_config_t *conf,
                          const std::set<std::string> &changed) override;

  void _set_compression();
  void _set_throttle_params();

  typedef map<uint64_t, bufferlist> ready_regions_t;

  typedef boost::intrusive_ptr<Collection> CollectionRef;

    virtual void aio_finish(BlueStore *store) = 0;
    virtual ~AioContext() {}
    MEMPOOL_CLASS_HELPERS();

      STATE_EMPTY,    ///< empty buffer -- used for cache history
      STATE_CLEAN,    ///< clean data that is up to date
      STATE_WRITING,  ///< data that is being written (io not yet complete)

    static const char *get_state_name(int s) {
      case STATE_EMPTY: return "empty";
      case STATE_CLEAN: return "clean";
      case STATE_WRITING: return "writing";
      default: return "???";

      FLAG_NOCACHE = 1,  ///< trim when done WRITING (do not become CLEAN)
      // NOTE: fix operator<< when you define a second flag
    static const char *get_flag_name(int s) {
      case FLAG_NOCACHE: return "nocache";
      default: return "???";

    uint16_t state;             ///< STATE_*
    uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
    uint32_t flags;             ///< FLAG_*
    uint32_t offset, length;

    boost::intrusive::list_member_hook<> lru_item;
    boost::intrusive::list_member_hook<> state_item;

    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
      : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
      : space(space), state(s), flags(f), seq(q), offset(o),
        length(b.length()), data(b) {}

    bool is_empty() const {
      return state == STATE_EMPTY;
    bool is_clean() const {
      return state == STATE_CLEAN;
    bool is_writing() const {
      return state == STATE_WRITING;

    uint32_t end() const {
      return offset + length;

    void truncate(uint32_t newlen) {
      assert(newlen < length);
      t.substr_of(data, 0, newlen);

    void dump(Formatter *f) const {
      f->dump_string("state", get_state_name(state));
      f->dump_unsigned("seq", seq);
      f->dump_unsigned("offset", offset);
      f->dump_unsigned("length", length);
      f->dump_unsigned("data_length", data.length());
  /// map logical extent range (object) onto buffers
    typedef boost::intrusive::list<
      boost::intrusive::member_hook<
        boost::intrusive::list_member_hook<>,
        &Buffer::state_item> > state_list_t;

    mempool::bluestore_meta_other::map<uint32_t, std::unique_ptr<Buffer>>

    // we use a bare intrusive list here instead of std::map because
    // it uses less memory and we expect this to be very small (very
    // few IOs in flight to the same Blob at the same time).
    state_list_t writing;   ///< writing buffers, sorted by seq, ascending

      assert(buffer_map.empty());
      assert(writing.empty());

    void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
      cache->_audit("_add_buffer start");
      buffer_map[b->offset].reset(b);
      if (b->is_writing()) {
        writing.push_back(*b);
        cache->_add_buffer(b, level, near);
      cache->_audit("_add_buffer end");
    void _rm_buffer(Cache* cache, Buffer *b) {
      _rm_buffer(cache, buffer_map.find(b->offset));
    void _rm_buffer(Cache* cache, map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
      assert(p != buffer_map.end());
      cache->_audit("_rm_buffer start");
      if (p->second->is_writing()) {
        writing.erase(writing.iterator_to(*p->second));
      cache->_rm_buffer(p->second.get());
      cache->_audit("_rm_buffer end");

    map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
      auto i = buffer_map.lower_bound(offset);
      if (i != buffer_map.begin()) {
        if (i->first + i->second->length <= offset)

    // must be called under protection of the Cache lock
    void _clear(Cache* cache);

    // return value is the highest cache_private of a trimmed buffer, or 0.
    int discard(Cache* cache, uint32_t offset, uint32_t length) {
      std::lock_guard<std::recursive_mutex> l(cache->lock);
      return _discard(cache, offset, length);
    int _discard(Cache* cache, uint32_t offset, uint32_t length);

    void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
      std::lock_guard<std::recursive_mutex> l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
    void finish_write(Cache* cache, uint64_t seq);
    void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
      std::lock_guard<std::recursive_mutex> l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, 1, nullptr);

    void read(Cache* cache, uint32_t offset, uint32_t length,
              BlueStore::ready_regions_t& res,
              interval_set<uint32_t>& res_intervals);

    void truncate(Cache* cache, uint32_t offset) {
      discard(cache, offset, (uint32_t)-1 - offset);

    void split(Cache* cache, size_t pos, BufferSpace &r);

    void dump(Cache* cache, Formatter *f) const {
      std::lock_guard<std::recursive_mutex> l(cache->lock);
      f->open_array_section("buffers");
      for (auto& i : buffer_map) {
        f->open_object_section("buffer");
        assert(i.first == i.second->offset);
  struct SharedBlobSet;

  /// in-memory shared blob state (incl cached buffers)
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0}; ///< reference count

    uint64_t sbid_unloaded;              ///< sbid if persistent isn't loaded
    bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any

    BufferSpace bc;             ///< buffer cache

    SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
      get_cache()->add_blob();
    SharedBlob(uint64_t i, Collection *_coll);

    uint64_t get_sbid() const {
      return loaded ? persistent->sbid : sbid_unloaded;

    friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
    friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }

    friend ostream& operator<<(ostream& out, const SharedBlob& sb);

    /// get logical references
    void get_ref(uint64_t offset, uint32_t length);

    /// put logical references, and get back any released extents
    void put_ref(uint64_t offset, uint32_t length,

    friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
      return l.get_sbid() == r.get_sbid();
    inline Cache* get_cache() {
      return coll ? coll->cache : nullptr;
    inline SharedBlobSet* get_parent() {
      return coll ? &(coll->shared_blob_set) : nullptr;
    inline bool is_loaded() const {

  typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;

  /// a lookup table of SharedBlobs
  struct SharedBlobSet {
    std::mutex lock;   ///< protect lookup, insertion, removal

    // we use a bare pointer because we don't want to affect the ref
    mempool::bluestore_meta_other::unordered_map<uint64_t,SharedBlob*> sb_map;

    SharedBlobRef lookup(uint64_t sbid) {
      std::lock_guard<std::mutex> l(lock);
      auto p = sb_map.find(sbid);
      if (p == sb_map.end()) {

    void add(Collection* coll, SharedBlob *sb) {
      std::lock_guard<std::mutex> l(lock);
      sb_map[sb->get_sbid()] = sb;

    bool remove(SharedBlob *sb) {
      std::lock_guard<std::mutex> l(lock);
      assert(sb->get_parent() == this);
      sb_map.erase(sb->get_sbid());

      std::lock_guard<std::mutex> l(lock);
      return sb_map.empty();

//#define CACHE_BLOB_BL  // not sure if this is a win yet or not... :/
  /// in-memory blob metadata and associated cached buffers (if any)
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};     ///< reference count
    int16_t id = -1;                ///< id, for spanning blobs only, >= 0
    int16_t last_encoded_id = -1;   ///< (ephemeral) used during encoding only
    SharedBlobRef shared_blob;      ///< shared blob state (if any)

    mutable bluestore_blob_t blob;  ///< decoded blob metadata
    mutable bufferlist blob_bl;     ///< cached encoded blob, blob is dirty if empty

    /// refs from this shard.  ephemeral if id<0, persisted if spanning.
    bluestore_blob_use_tracker_t used_in_blob;

    friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
    friend void intrusive_ptr_release(Blob *b) { b->put(); }

    friend ostream& operator<<(ostream& out, const Blob &b);

    const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
    bool is_referenced() const {
      return used_in_blob.is_not_empty();
    uint32_t get_referenced_bytes() const {
      return used_in_blob.get_referenced_bytes();

    bool is_spanning() const {

    bool can_split() const {
      std::lock_guard<std::recursive_mutex> l(shared_blob->get_cache()->lock);
      // splitting a BufferSpace writing list is too hard; don't try.
      return shared_blob->bc.writing.empty() &&
             used_in_blob.can_split() &&
             get_blob().can_split();

    bool can_split_at(uint32_t blob_offset) const {
      return used_in_blob.can_split_at(blob_offset) &&
             get_blob().can_split_at(blob_offset);

    bool try_reuse_blob(uint32_t min_alloc_size,
                        uint32_t target_blob_size,
      o.shared_blob = shared_blob;

    const bluestore_blob_t& get_blob() const {
    bluestore_blob_t& dirty_blob() {

    /// discard buffers for unallocated regions
    void discard_unallocated(Collection *coll);

    /// get logical references
    void get_ref(Collection *coll, uint32_t offset, uint32_t length);
    /// put logical references, and get back any released extents
    bool put_ref(Collection *coll, uint32_t offset, uint32_t length,

    void split(Collection *coll, uint32_t blob_offset, Blob *o);

    void _encode() const {
      if (blob_bl.length() == 0 ) {
        ::encode(blob, blob_bl);
        assert(blob_bl.length());

                      bool include_ref_map) const {
      p += blob_bl.length();
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
                bufferlist::contiguous_appender& p,
                bool include_ref_map) const {
      if (include_ref_map) {
        used_in_blob.encode(p);
                Collection */*coll*/,
                bufferptr::iterator& p,
                bool include_ref_map) {
      const char *start = p.get_pos();
      const char *end = p.get_pos();
      blob_bl.append(start, end - start);
      if (include_ref_map) {
        used_in_blob.decode(p);

                      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
                bufferlist::contiguous_appender& p,
                bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
      if (include_ref_map) {
        used_in_blob.encode(p);
                bufferptr::iterator& p,
                bool include_ref_map);

  typedef boost::intrusive_ptr<Blob> BlobRef;
  typedef mempool::bluestore_meta_other::map<int,BlobRef> blob_map_t;
  /// a logical extent, pointing to (some portion of) a blob
  typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
  struct Extent : public ExtentBase {
    MEMPOOL_CLASS_HELPERS();

    uint32_t logical_offset = 0;      ///< logical offset
    uint32_t blob_offset = 0;         ///< blob offset
    uint32_t length = 0;              ///< length
    BlobRef blob;                     ///< the blob with our data

    /// ctor for lookup only
    explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
    /// ctor for delayed initialization (see decode_some())
    explicit Extent() : ExtentBase() {
    /// ctor for general usage
    Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
        logical_offset(lo), blob_offset(o), length(l) {

      blob->shared_blob->get_cache()->rm_extent();

    void assign_blob(const BlobRef& b) {
      blob->shared_blob->get_cache()->add_extent();

    // comparators for intrusive_set
    friend bool operator<(const Extent &a, const Extent &b) {
      return a.logical_offset < b.logical_offset;
    friend bool operator>(const Extent &a, const Extent &b) {
      return a.logical_offset > b.logical_offset;
    friend bool operator==(const Extent &a, const Extent &b) {
      return a.logical_offset == b.logical_offset;

    uint32_t blob_start() const {
      return logical_offset - blob_offset;

    uint32_t blob_end() const {
      return blob_start() + blob->get_blob().get_logical_length();

    uint32_t logical_end() const {
      return logical_offset + length;

    // return true if any piece of the blob is out of
    // the given range [o, o + l].
    bool blob_escapes_range(uint32_t o, uint32_t l) const {
      return blob_start() < o || blob_end() > o + l;

  typedef boost::intrusive::set<Extent> extent_map_t;

  friend ostream& operator<<(ostream& out, const Extent& e);

    boost::intrusive::list_member_hook<> old_extent_item;

    bool blob_empty; // flag to track the last removed extent that makes blob
                     // empty - required to update compression stat properly
    OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : e(lo, o, l, b), blob_empty(false) {
    static OldExtent* create(CollectionRef c,

  typedef boost::intrusive::list<
    boost::intrusive::member_hook<
      boost::intrusive::list_member_hook<>,
      &OldExtent::old_extent_item> > old_extent_map_t;
  /// a sharded extent map, mapping offsets to lextents to blobs
    extent_map_t extent_map;        ///< map of Extents to Blobs
    blob_map_t spanning_blob_map;   ///< blobs that span shards

      bluestore_onode_t::shard_info *shard_info = nullptr;
      unsigned extents = 0;  ///< count extents in this shard
      bool loaded = false;   ///< true if shard is loaded
      bool dirty = false;    ///< true if shard is dirty and needs reencoding

    mempool::bluestore_meta_other::vector<Shard> shards;    ///< shards

    bufferlist inline_bl;   ///< cached encoded map, if unsharded; empty=>dirty

    uint32_t needs_reshard_begin = 0;
    uint32_t needs_reshard_end = 0;

    bool needs_reshard() const {
      return needs_reshard_end > needs_reshard_begin;
    void clear_needs_reshard() {
      needs_reshard_begin = needs_reshard_end = 0;
    void request_reshard(uint32_t begin, uint32_t end) {
      if (begin < needs_reshard_begin) {
        needs_reshard_begin = begin;
      if (end > needs_reshard_end) {
        needs_reshard_end = end;

    struct DeleteDisposer {
      void operator()(Extent *e) { delete e; }

      extent_map.clear_and_dispose(DeleteDisposer());
      extent_map.clear_and_dispose(DeleteDisposer());
      clear_needs_reshard();

    bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
    unsigned decode_some(bufferlist& bl);

    void bound_encode_spanning_blobs(size_t& p);
    void encode_spanning_blobs(bufferlist::contiguous_appender& p);
    void decode_spanning_blobs(bufferptr::iterator& p);

    BlobRef get_spanning_blob(int id) {
      auto p = spanning_blob_map.find(id);
      assert(p != spanning_blob_map.end());

    void update(KeyValueDB::Transaction t, bool force);
                 KeyValueDB::Transaction t);

    /// initialize Shards from the onode
    void init_shards(bool loaded, bool dirty);

    /// return index of shard containing offset
    /// or -1 if not found
    int seek_shard(uint32_t offset) {
      size_t end = shards.size();
      size_t mid, left = 0;
      size_t right = end; // one past the right end

      while (left < right) {
        mid = left + (right - left) / 2;
        if (offset >= shards[mid].shard_info->offset) {
          size_t next = mid + 1;
          if (next >= end || offset < shards[next].shard_info->offset)
          // continue to search forwards
          // continue to search backwards
      return -1; // not found
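    // Illustrative example (editorial addition, not part of the original
    // header): with three shards whose shard_info->offset values are
    // {0x0, 0x8000, 0x10000}, seek_shard(0x7fff) returns 0, seek_shard(0x8000)
    // and seek_shard(0x9000) both return 1, and any offset at or beyond
    // 0x10000 returns 2, since the last shard extends to the end of the object.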
    /// check if a range spans a shard
    bool spans_shard(uint32_t offset, uint32_t length) {
      if (shards.empty()) {
      int s = seek_shard(offset);
      if (s == (int)shards.size() - 1) {
        return false; // last shard
      if (offset + length <= shards[s+1].shard_info->offset) {

    /// ensure that a range of the map is loaded
    void fault_range(KeyValueDB *db,
                     uint32_t offset, uint32_t length);

    /// ensure a range of the map is marked dirty
    void dirty_range(KeyValueDB::Transaction t,
                     uint32_t offset, uint32_t length);

    extent_map_t::iterator find(uint64_t offset);

    /// find a lextent that includes offset
    extent_map_t::iterator find_lextent(uint64_t offset);

    /// seek to the first lextent including or after offset
    extent_map_t::iterator seek_lextent(uint64_t offset);
    extent_map_t::const_iterator seek_lextent(uint64_t offset) const;

    void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
      extent_map.insert(*new Extent(lo, o, l, b));

    /// remove (and delete) an Extent
    void rm(extent_map_t::iterator p) {
      extent_map.erase_and_dispose(p, DeleteDisposer());

    bool has_any_lextents(uint64_t offset, uint64_t length);

    /// consolidate adjacent lextents in extent_map
    int compress_extent_map(uint64_t offset, uint64_t length);

    /// punch a logical hole.  add lextents to deref to target list.
    void punch_hole(CollectionRef &c,
                    uint64_t offset, uint64_t length,
                    old_extent_map_t *old_extents);

    /// put new lextent into lextent_map overwriting existing ones if
    /// any and update references accordingly
    Extent *set_lextent(CollectionRef &c,
                        uint64_t logical_offset,
                        uint64_t offset, uint64_t length,
                        old_extent_map_t *old_extents);

    /// split a blob (and referring extents)
    BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);

  /// Compressed Blob Garbage collector

  The primary idea of the collector is to estimate a difference between
  allocation units (AU) currently present for compressed blobs and new AUs
  required to store that data uncompressed.
  Estimation is performed for protrusive extents within a logical range
  determined by a concatenation of the old_extents collection and the specific (current)
  The root cause for the old_extents use is the need to handle blob ref counts
  properly. Old extents still hold blob refs and hence we need to traverse
  the collection to determine if the blob is to be released.
  Protrusive extents are extents that fit into the blob set in action
  (ones that are below the logical range from above) but are not removed totally
  due to the current write.

  extent1 <loffs = 100, boffs = 100, len = 100> ->
    blob1<compressed, len_on_disk=4096, logical_len=8192>
  extent2 <loffs = 200, boffs = 200, len = 100> ->
    blob2<raw, len_on_disk=4096, llen=4096>
  extent3 <loffs = 300, boffs = 300, len = 100> ->
    blob1<compressed, len_on_disk=4096, llen=8192>
  extent4 <loffs = 4096, boffs = 0, len = 100> ->
    blob3<raw, len_on_disk=4096, llen=4096>

  protrusive extents are within the following ranges <0~300, 400~8192-400>
  In this case existing AUs that might be removed due to GC (i.e. blob1)
  And new AUs expected after GC = 0 since extent1 is to be merged into blob2.
  Hence we should do a collect.
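
  (Editorial note, not part of the original comment: in terms of the fields
  declared below, the estimate boils down to comparing expected_for_release --
  the AUs currently pinned by the affected compressed blobs -- against
  expected_allocations -- the new AUs an uncompressed rewrite of the protrusive
  extents would need; collection pays off when the former exceeds the latter.)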
  class GarbageCollector
    /// return amount of allocation units that might be saved due to GC
                     const ExtentMap& extent_map,
                     const old_extent_map_t& old_extents,
                     uint64_t min_alloc_size);

    /// return a collection of extents to perform GC on
    const vector<AllocExtent>& get_extents_to_collect() const {
      return extents_to_collect;
    GarbageCollector(CephContext* _cct) : cct(_cct) {}

      uint64_t referenced_bytes = 0;    ///< amount of bytes referenced in blob
      int64_t expected_allocations = 0; ///< new alloc units required
                                        ///< in case of gc fulfilled
      bool collect_candidate = false;   ///< indicate if blob has any extents
                                        ///< eligible for GC.
      extent_map_t::const_iterator first_lextent; ///< points to the first
                                                  ///< lextent referring to
                                                  ///< the blob if any.
                                                  ///< collect_candidate flag
                                                  ///< determines the validity
      extent_map_t::const_iterator last_lextent;  ///< points to the last
                                                  ///< lextent referring to
                                                  ///< the blob if any.

      BlobInfo(uint64_t ref_bytes) :
        referenced_bytes(ref_bytes) {

    map<Blob*, BlobInfo> affected_blobs;    ///< compressed blobs and their ref_map
                                            ///< copies that are affected by the
    vector<AllocExtent> extents_to_collect; ///< protrusive extents that should
                                            ///< be collected if GC takes place

    boost::optional<uint64_t> used_alloc_unit; ///< last processed allocation
                                               ///< unit when traversing
                                               ///< protrusive extents.
                                               ///< Other extents mapped to
                                               ///< this AU to be ignored
                                               ///< (except the case where
                                               ///< uncompressed extent follows
                                               ///< compressed one - see below).
    BlobInfo* blob_info_counted = nullptr;  ///< set if previous allocation unit
                                            ///< caused expected_allocations
                                            ///< counter increment at this blob.
                                            ///< if uncompressed extent follows
                                            ///< a decrement for the
                                            ///< expected_allocations counter

    int64_t expected_allocations = 0;  ///< new alloc units required in case
    int64_t expected_for_release = 0;  ///< alloc units currently used by
                                       ///< compressed blobs that might
    uint64_t gc_start_offset;  /// starting offset for GC
    uint64_t gc_end_offset;    /// ending offset for GC

    void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
                                    uint64_t start_offset,
                                    uint64_t start_touch_offset,
                                    uint64_t end_touch_offset,
                                    uint64_t min_alloc_size);

  /// an in-memory object
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref;  ///< reference count

    /// key under PREFIX_OBJ where we are stored
    mempool::bluestore_meta_other::string key;

    boost::intrusive::list_member_hook<> lru_item;

    bluestore_onode_t onode;  ///< metadata stored as value in kv store
    bool exists;              ///< true if object logically exists

    ExtentMap extent_map;

    // track txc's that have not been committed to kv store (and whose
    // effects cannot be read via the kvdb read methods)
    std::atomic<int> flushing_count = {0};
    std::mutex flush_lock;  ///< protect flush_txns
    std::condition_variable flush_cond;   ///< wait here for uncommitted txns

    Onode(Collection *c, const ghobject_t& o,
          const mempool::bluestore_meta_other::string& k)

  typedef boost::intrusive_ptr<Onode> OnodeRef;
  /// a cache (shard) of onodes and buffers
    PerfCounters *logger;
    std::recursive_mutex lock;          ///< protect lru and other structures

    std::atomic<uint64_t> num_extents = {0};
    std::atomic<uint64_t> num_blobs = {0};

    size_t last_trim_seq = 0;

    static Cache *create(CephContext* cct, string type, PerfCounters *logger);

    Cache(CephContext* cct) : cct(cct), logger(nullptr) {}

    virtual void _add_onode(OnodeRef& o, int level) = 0;
    virtual void _rm_onode(OnodeRef& o) = 0;
    virtual void _touch_onode(OnodeRef& o) = 0;

    virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0;
    virtual void _rm_buffer(Buffer *b) = 0;
    virtual void _move_buffer(Cache *src, Buffer *b) = 0;
    virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
    virtual void _touch_buffer(Buffer *b) = 0;

    virtual uint64_t _get_num_onodes() = 0;
    virtual uint64_t _get_buffer_bytes() = 0;

    void trim(uint64_t target_bytes, float target_meta_ratio,
              float bytes_per_onode);

    virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;

    virtual void add_stats(uint64_t *onodes, uint64_t *extents,
                           uint64_t *bytes) = 0;

    virtual void _audit(const char *s) = 0;
    void _audit(const char *s) { /* no-op */ }

  /// simple LRU cache for onodes and buffers
  struct LRUCache : public Cache {
    typedef boost::intrusive::list<
      boost::intrusive::member_hook<
        boost::intrusive::list_member_hook<>,
        &Onode::lru_item> > onode_lru_list_t;
    typedef boost::intrusive::list<
      boost::intrusive::member_hook<
        boost::intrusive::list_member_hook<>,
        &Buffer::lru_item> > buffer_lru_list_t;

    onode_lru_list_t onode_lru;

    buffer_lru_list_t buffer_lru;
    uint64_t buffer_size = 0;

    LRUCache(CephContext* cct) : Cache(cct) {}
    uint64_t _get_num_onodes() override {
      return onode_lru.size();
    void _add_onode(OnodeRef& o, int level) override {
        onode_lru.push_front(*o);
        onode_lru.push_back(*o);
    void _rm_onode(OnodeRef& o) override {
      auto q = onode_lru.iterator_to(*o);
    void _touch_onode(OnodeRef& o) override;

    uint64_t _get_buffer_bytes() override {
    void _add_buffer(Buffer *b, int level, Buffer *near) override {
        auto q = buffer_lru.iterator_to(*near);
        buffer_lru.insert(q, *b);
      } else if (level > 0) {
        buffer_lru.push_front(*b);
        buffer_lru.push_back(*b);
      buffer_size += b->length;
    void _rm_buffer(Buffer *b) override {
      assert(buffer_size >= b->length);
      buffer_size -= b->length;
      auto q = buffer_lru.iterator_to(*b);
      buffer_lru.erase(q);
    void _move_buffer(Cache *src, Buffer *b) override {
      _add_buffer(b, 0, nullptr);
    void _adjust_buffer_size(Buffer *b, int64_t delta) override {
      assert((int64_t)buffer_size + delta >= 0);
      buffer_size += delta;
    void _touch_buffer(Buffer *b) override {
      auto p = buffer_lru.iterator_to(*b);
      buffer_lru.erase(p);
      buffer_lru.push_front(*b);
      _audit("_touch_buffer end");

    void _trim(uint64_t onode_max, uint64_t buffer_max) override;

    void add_stats(uint64_t *onodes, uint64_t *extents,
                   uint64_t *bytes) override {
      std::lock_guard<std::recursive_mutex> l(lock);
      *onodes += onode_lru.size();
      *extents += num_extents;
      *blobs += num_blobs;
      *buffers += buffer_lru.size();
      *bytes += buffer_size;

    void _audit(const char *s) override;

  // 2Q cache for buffers, LRU for onodes
  struct TwoQCache : public Cache {
    // stick with LRU for onodes for now (fixme?)
    typedef boost::intrusive::list<
      boost::intrusive::member_hook<
        boost::intrusive::list_member_hook<>,
        &Onode::lru_item> > onode_lru_list_t;
    typedef boost::intrusive::list<
      boost::intrusive::member_hook<
        boost::intrusive::list_member_hook<>,
        &Buffer::lru_item> > buffer_list_t;

    onode_lru_list_t onode_lru;

    buffer_list_t buffer_hot;      ///< "Am" hot buffers
    buffer_list_t buffer_warm_in;  ///< "A1in" newly warm buffers
    buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted
    uint64_t buffer_bytes = 0;     ///< bytes

      BUFFER_WARM_IN,   ///< in buffer_warm_in
      BUFFER_WARM_OUT,  ///< in buffer_warm_out
      BUFFER_HOT,       ///< in buffer_hot

    uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0};  ///< bytes per type
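
    // Editorial sketch of the intended 2Q flow, inferred from the list names
    // above rather than stated in the original header: newly cached buffers
    // enter buffer_warm_in ("A1in"); when they age out, their data is dropped
    // and the now-empty Buffer is remembered in buffer_warm_out ("A1out"); a
    // later hit on a warm_out buffer is the hint that promotes it into
    // buffer_hot ("Am"), which is managed as a plain LRU.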
    TwoQCache(CephContext* cct) : Cache(cct) {}
    uint64_t _get_num_onodes() override {
      return onode_lru.size();
    void _add_onode(OnodeRef& o, int level) override {
        onode_lru.push_front(*o);
        onode_lru.push_back(*o);
    void _rm_onode(OnodeRef& o) override {
      auto q = onode_lru.iterator_to(*o);
    void _touch_onode(OnodeRef& o) override;

    uint64_t _get_buffer_bytes() override {
      return buffer_bytes;
    void _add_buffer(Buffer *b, int level, Buffer *near) override;
    void _rm_buffer(Buffer *b) override;
    void _move_buffer(Cache *src, Buffer *b) override;
    void _adjust_buffer_size(Buffer *b, int64_t delta) override;
    void _touch_buffer(Buffer *b) override {
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // do nothing (somewhat counter-intuitively!)
      case BUFFER_WARM_OUT:
        // move from warm_out to hot LRU
        assert(0 == "this happens via discard hint");
        // move to front of hot LRU
        buffer_hot.erase(buffer_hot.iterator_to(*b));
        buffer_hot.push_front(*b);
      _audit("_touch_buffer end");

    void _trim(uint64_t onode_max, uint64_t buffer_max) override;

    void add_stats(uint64_t *onodes, uint64_t *extents,
                   uint64_t *bytes) override {
      std::lock_guard<std::recursive_mutex> l(lock);
      *onodes += onode_lru.size();
      *extents += num_extents;
      *blobs += num_blobs;
      *buffers += buffer_hot.size() + buffer_warm_in.size();
      *bytes += buffer_bytes;

    void _audit(const char *s) override;

    mempool::bluestore_meta_other::unordered_map<ghobject_t,OnodeRef> onode_map;

    friend class Collection; // for split_cache()

    OnodeSpace(Cache *c) : cache(c) {}

    OnodeRef add(const ghobject_t& oid, OnodeRef o);
    OnodeRef lookup(const ghobject_t& o);
    void remove(const ghobject_t& oid) {
      onode_map.erase(oid);
    void rename(OnodeRef& o, const ghobject_t& old_oid,
                const ghobject_t& new_oid,
                const mempool::bluestore_meta_other::string& new_okey);

    /// return true if f true for any item
    bool map_any(std::function<bool(OnodeRef)> f);

  struct Collection : public CollectionImpl {
    Cache *cache;       ///< our cache shard
    bluestore_cnode_t cnode;

    SharedBlobSet shared_blob_set;      ///< open SharedBlobs

    // cache onodes on a per-collection basis to avoid lock
    OnodeSpace onode_map;

    pool_opts_t pool_opts;

    OnodeRef get_onode(const ghobject_t& oid, bool create);

    // the terminology is confusing here, sorry!
    //  blob_t    shared_blob_t
    //  !shared   unused     -> open
    //  shared    !loaded    -> open + shared
    //  shared    loaded     -> open + shared + loaded
    // open = SharedBlob is instantiated
    // shared = blob_t shared flag is set; SharedBlob is hashed.
    // loaded = SharedBlob::shared_blob_t is loaded from kv store
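    //
    // Illustrative lifecycle (editorial addition, inferred from the comment
    // above and the declarations below): new_blob() instantiates an open
    // SharedBlob; make_blob_shared() sets the blob_t shared flag and hashes
    // the SharedBlob into shared_blob_set; load_shared_blob() later pulls the
    // persistent shared_blob_t record in from the kv store on demand.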
    void open_shared_blob(uint64_t sbid, BlobRef b);
    void load_shared_blob(SharedBlobRef sb);
    void make_blob_shared(uint64_t sbid, BlobRef b);

    BlobRef new_blob() {
      BlobRef b = new Blob();
      b->shared_blob = new SharedBlob(this);

    const coll_t &get_cid() override {

    bool contains(const ghobject_t& oid) {
        return oid.hobj.pool == -1;
      if (cid.is_pg(&spgid))
          spgid.pgid.contains(cnode.bits, oid) &&
          oid.shard_id == spgid.shard;

    void split_cache(Collection *dest);

    Collection(BlueStore *ns, Cache *ca, coll_t c);

  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
    KeyValueDB::Iterator it;
    OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
    int seek_to_first() override;
    int upper_bound(const string &after) override;
    int lower_bound(const string &to) override;
    bool valid() override;
    int next(bool validate=true) override;
    string key() override;
    bufferlist value() override;
    int status() override {

  typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
  struct TransContext : public AioContext {
      STATE_KV_QUEUED,     // queued for kv_sync_thread submission
      STATE_KV_SUBMITTED,  // submitted to kv; not yet synced
      STATE_DEFERRED_QUEUED,    // in deferred_queue (pending or running)
      STATE_DEFERRED_CLEANUP,   // remove deferred kv record
      STATE_DEFERRED_DONE,

    state_t state = STATE_PREPARE;

    const char *get_state_name() {
      case STATE_PREPARE: return "prepare";
      case STATE_AIO_WAIT: return "aio_wait";
      case STATE_IO_DONE: return "io_done";
      case STATE_KV_QUEUED: return "kv_queued";
      case STATE_KV_SUBMITTED: return "kv_submitted";
      case STATE_KV_DONE: return "kv_done";
      case STATE_DEFERRED_QUEUED: return "deferred_queued";
      case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
      case STATE_DEFERRED_DONE: return "deferred_done";
      case STATE_FINISHING: return "finishing";
      case STATE_DONE: return "done";

#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
    const char *get_state_latency_name(int state) {
      case l_bluestore_state_prepare_lat: return "prepare";
      case l_bluestore_state_aio_wait_lat: return "aio_wait";
      case l_bluestore_state_io_done_lat: return "io_done";
      case l_bluestore_state_kv_queued_lat: return "kv_queued";
      case l_bluestore_state_kv_committing_lat: return "kv_committing";
      case l_bluestore_state_kv_done_lat: return "kv_done";
      case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
      case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
      case l_bluestore_state_finishing_lat: return "finishing";
      case l_bluestore_state_done_lat: return "done";

    void log_state_latency(PerfCounters *logger, int state) {
      utime_t lat, now = ceph_clock_now();
      lat = now - last_stamp;
      logger->tinc(state, lat);
#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
      if (state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) {
        double usecs = (now.to_nsec()-last_stamp.to_nsec())/1000;
        OID_ELAPSED("", usecs, get_state_latency_name(state));

    boost::intrusive::list_member_hook<> sequencer_item;

    uint64_t bytes = 0, cost = 0;

    set<OnodeRef> onodes;     ///< these need to be updated/written
    set<OnodeRef> modified_objects;  ///< objects we modified (and need a ref)
    set<SharedBlobRef> shared_blobs;  ///< these need to be updated/written
    set<SharedBlobRef> shared_blobs_written; ///< update these on io completion

    KeyValueDB::Transaction t; ///< then we will commit this
    Context *oncommit = nullptr;         ///< signal on commit
    Context *onreadable = nullptr;       ///< signal on readable
    Context *onreadable_sync = nullptr;  ///< signal on readable
    list<Context*> oncommits;  ///< more commit completions
    list<CollectionRef> removed_collections; ///< colls we removed

    boost::intrusive::list_member_hook<> deferred_queue_item;
    bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any

    interval_set<uint64_t> allocated, released;
    struct volatile_statfs {
        STATFS_ALLOCATED = 0,
        STATFS_COMPRESSED_ORIGINAL,
        STATFS_COMPRESSED_ALLOCATED,
      int64_t values[STATFS_LAST];
        memset(this, 0, sizeof(volatile_statfs));
        *this = volatile_statfs();
      int64_t& allocated() {
        return values[STATFS_ALLOCATED];
        return values[STATFS_STORED];
      int64_t& compressed_original() {
        return values[STATFS_COMPRESSED_ORIGINAL];
      int64_t& compressed() {
        return values[STATFS_COMPRESSED];
      int64_t& compressed_allocated() {
        return values[STATFS_COMPRESSED_ALLOCATED];
        return values[STATFS_ALLOCATED] == 0 &&
          values[STATFS_STORED] == 0 &&
          values[STATFS_COMPRESSED] == 0 &&
          values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
          values[STATFS_COMPRESSED_ALLOCATED] == 0;
      void decode(bufferlist::iterator& it) {
        for (size_t i = 0; i < STATFS_LAST; i++) {
          ::decode(values[i], it);
      void encode(bufferlist& bl) {
        for (size_t i = 0; i < STATFS_LAST; i++) {
          ::encode(values[i], bl);

    bool had_ios = false;  ///< true if we submitted IOs before our kv txn

    CollectionRef first_collection;  ///< first referenced collection

    uint64_t last_nid = 0;     ///< if non-zero, highest new nid we allocated
    uint64_t last_blobid = 0;  ///< if non-zero, highest new blobid we allocated

    explicit TransContext(CephContext* cct, OpSequencer *o)
        start(ceph_clock_now()) {
      delete deferred_txn;

    void write_onode(OnodeRef &o) {
    void write_shared_blob(SharedBlobRef &sb) {
      shared_blobs.insert(sb);
    /// note we logically modified object (when onode itself is unmodified)
    void note_modified_object(OnodeRef &o) {
      // onode itself isn't written, though
      modified_objects.insert(o);
    void removed(OnodeRef& o) {
      modified_objects.erase(o);

    void aio_finish(BlueStore *store) override {
      store->txc_aio_finish(this);

  typedef boost::intrusive::list<
    boost::intrusive::member_hook<
      boost::intrusive::list_member_hook<>,
      &TransContext::deferred_queue_item> > deferred_queue_t;
  struct DeferredBatch : public AioContext {
    struct deferred_io {
      bufferlist bl;    ///< data
      uint64_t seq;     ///< deferred transaction seq
    map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
    deferred_queue_t txcs;           ///< txcs in this batch
    IOContext ioc;                   ///< our aios
    /// bytes of pending io for each deferred seq (may be 0)
    map<uint64_t,int> seq_bytes;

    void _discard(CephContext *cct, uint64_t offset, uint64_t length);
    void _audit(CephContext *cct);

    DeferredBatch(CephContext *cct, OpSequencer *osr)
      : osr(osr), ioc(cct, this) {}

    void prepare_write(CephContext *cct,
                       uint64_t seq, uint64_t offset, uint64_t length,
                       bufferlist::const_iterator& p);

    void aio_finish(BlueStore *store) override {
      store->_deferred_aio_finish(osr);
  class OpSequencer : public Sequencer_impl {
    std::condition_variable qcond;
    typedef boost::intrusive::list<
      boost::intrusive::member_hook<
        boost::intrusive::list_member_hook<>,
        &TransContext::sequencer_item> > q_list_t;
    q_list_t q;  ///< transactions

    boost::intrusive::list_member_hook<> deferred_osr_queue_item;
    DeferredBatch *deferred_running = nullptr;
    DeferredBatch *deferred_pending = nullptr;

    uint64_t last_seq = 0;

    std::atomic_int txc_with_unstable_io = {0};  ///< num txcs with unstable io
    std::atomic_int kv_committing_serially = {0};
    std::atomic_int kv_submitted_waiters = {0};

    std::atomic_bool registered = {true};  ///< registered in BlueStore's osr_set
    std::atomic_bool zombie = {false};     ///< owning Sequencer has gone away

    OpSequencer(CephContext* cct, BlueStore *store)
      : Sequencer_impl(cct),
        parent(NULL), store(store) {
      store->register_osr(this);
    ~OpSequencer() override {

    void discard() override {
      // Note that we may have txc's in flight when the parent Sequencer
      // goes away.  Reflect this with zombie==registered==true and let
      // _osr_drain_all clean up later.
      std::lock_guard<std::mutex> l(qlock);

    void _unregister() {
      store->unregister_osr(this);

    void queue_new(TransContext *txc) {
      std::lock_guard<std::mutex> l(qlock);
      txc->seq = ++last_seq;

      std::unique_lock<std::mutex> l(qlock);

    void drain_preceding(TransContext *txc) {
      std::unique_lock<std::mutex> l(qlock);
      while (!q.empty() && &q.front() != txc)

    bool _is_all_kv_submitted() {
      // caller must hold qlock
      TransContext *txc = &q.back();
      if (txc->state >= TransContext::STATE_KV_SUBMITTED) {

    void flush() override {
      std::unique_lock<std::mutex> l(qlock);
      // set flag before the check because the condition
      // may become true outside qlock, and we need to make
      // sure those threads see waiters and signal qcond.
      ++kv_submitted_waiters;
      if (_is_all_kv_submitted()) {
      --kv_submitted_waiters;

    bool flush_commit(Context *c) override {
      std::lock_guard<std::mutex> l(qlock);
      TransContext *txc = &q.back();
      if (txc->state >= TransContext::STATE_KV_DONE) {
      txc->oncommits.push_back(c);

  typedef boost::intrusive::list<
    boost::intrusive::member_hook<
      boost::intrusive::list_member_hook<>,
      &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;

  struct KVSyncThread : public Thread {
    explicit KVSyncThread(BlueStore *s) : store(s) {}
    void *entry() override {
      store->_kv_sync_thread();
  struct DBHistogram {
    map<int, struct value_dist> val_map; ///< slab id to count, max length of value and key

    map<string, map<int, struct key_dist> > key_hist;
    map<int, uint64_t> value_hist;
    int get_key_slab(size_t sz);
    string get_key_slab_to_range(int slab);
    int get_value_slab(size_t sz);
    string get_value_slab_to_range(int slab);
    void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
                           const string &prefix, size_t key_size, size_t value_size);
    void dump(Formatter *f);
  // --------------------------------------------------------

  BlueFS *bluefs = nullptr;
  unsigned bluefs_shared_bdev = 0;  ///< which bluefs bdev we are sharing
  bool bluefs_single_shared_device = true;
  utime_t bluefs_last_balance;

  KeyValueDB *db = nullptr;
  BlockDevice *bdev = nullptr;
  std::string freelist_type;
  FreelistManager *fm = nullptr;
  Allocator *alloc = nullptr;

  int path_fd = -1;  ///< open handle to $path
  int fsid_fd = -1;  ///< open handle (locked) to $path/fsid
  bool mounted = false;

  RWLock coll_lock = {"BlueStore::coll_lock"};  ///< rwlock to protect coll_map
  mempool::bluestore_meta_other::unordered_map<coll_t, CollectionRef> coll_map;

  vector<Cache*> cache_shards;

  std::mutex osr_lock;              ///< protect osr_set
  std::set<OpSequencerRef> osr_set; ///< set of all OpSequencers

  std::atomic<uint64_t> nid_last = {0};
  std::atomic<uint64_t> nid_max = {0};
  std::atomic<uint64_t> blobid_last = {0};
  std::atomic<uint64_t> blobid_max = {0};

  Throttle throttle_bytes;           ///< submit to commit
  Throttle throttle_deferred_bytes;  ///< submit to deferred complete

  interval_set<uint64_t> bluefs_extents;            ///< block extents owned by bluefs
  interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming

  std::mutex deferred_lock;
  std::atomic<uint64_t> deferred_seq = {0};
  deferred_osr_queue_t deferred_queue;  ///< osr's with deferred io pending
  int deferred_queue_size = 0;          ///< num txc's queued across all osrs
  atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread

  int m_finisher_num = 1;
  vector<Finisher*> finishers;

  KVSyncThread kv_sync_thread;
  std::condition_variable kv_cond;
  bool kv_stop = false;
  deque<TransContext*> kv_queue;               ///< ready, already submitted
  deque<TransContext*> kv_queue_unsubmitted;   ///< ready, need submit by kv thread
  deque<TransContext*> kv_committing;          ///< currently syncing
  deque<DeferredBatch*> deferred_done_queue;   ///< deferred ios done
  deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable

  PerfCounters *logger = nullptr;

  std::mutex reap_lock;
  list<CollectionRef> removed_collections;

  RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"};
  set<ghobject_t> debug_data_error_objects;
  set<ghobject_t> debug_mdata_error_objects;

  std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};

  uint64_t block_size = 0;     ///< block size of block device (power of 2)
  uint64_t block_mask = 0;     ///< mask to get just the block offset
  size_t block_size_order = 0; ///< bits to shift to get block size

  uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
  int deferred_batch_ops = 0;  ///< deferred batch size

  ///< bits for min_alloc_size
  std::atomic<uint8_t> min_alloc_size_order = {0};
  static_assert(std::numeric_limits<uint8_t>::max() >
                std::numeric_limits<decltype(min_alloc_size)>::digits,
                "not enough bits for min_alloc_size");

  ///< size threshold for forced deferred writes
  std::atomic<uint64_t> prefer_deferred_size = {0};

  ///< maximum allocation unit (power of 2)
  std::atomic<uint64_t> max_alloc_size = {0};

  ///< approx cost per io, in bytes
  std::atomic<uint64_t> throttle_cost_per_io = {0};

  std::atomic<Compressor::CompressionMode> comp_mode = {Compressor::COMP_NONE}; ///< compression mode
  CompressorRef compressor;
  std::atomic<uint64_t> comp_min_blob_size = {0};
  std::atomic<uint64_t> comp_max_blob_size = {0};

  std::atomic<uint64_t> max_blob_size = {0};  ///< maximum blob size

  // cache trim control

  // note that these update in a racy way, but we don't *really* care if
  // they're perfectly accurate.  they are all word sized so they will
  // individually update atomically, but may not be coherent with each other.
  size_t mempool_seq = 0;
  size_t mempool_bytes = 0;
  size_t mempool_onodes = 0;

  void get_mempool_stats(size_t *seq, uint64_t *bytes, uint64_t *onodes) {
    *bytes = mempool_bytes;
    *onodes = mempool_onodes;

  struct MempoolThread : public Thread {
    explicit MempoolThread(BlueStore *s)
        lock("BlueStore::MempoolThread::lock") {}
    void *entry() override;
      assert(stop == false);
      create("bstore_mempool");
  // --------------------------------------------------------
  void _init_logger();
  void _shutdown_logger();
  int _reload_logger();

  int _open_fsid(bool create);
  int _read_fsid(uuid_d *f);

  void _set_alloc_sizes();
  void _set_blob_size();

  int _open_bdev(bool create);
  int _open_db(bool create);
  int _open_fm(bool create);
  void _close_alloc();
  int _open_collections(int *errors=0);
  void _close_collections();

  int _setup_block_symlink_or_file(string name, string path, uint64_t size,
  int _write_bdev_label(string path, bluestore_bdev_label_t label);
  static int _read_bdev_label(CephContext* cct, string path,
                              bluestore_bdev_label_t *label);
  int _check_or_set_bdev_label(string path, uint64_t size, string desc,

  int _open_super_meta();

  int _reconcile_bluefs_freespace();
  int _balance_bluefs_freespace(PExtentVector *extents);
  void _commit_bluefs_freespace(const PExtentVector& extents);

  CollectionRef _get_collection(const coll_t& cid);
  void _queue_reap_collection(CollectionRef& c);
  void _reap_collections();
  void _update_cache_logger();

  void _assign_nid(TransContext *txc, OnodeRef o);
  uint64_t _assign_blobid(TransContext *txc);

  void _dump_onode(OnodeRef o, int log_level=30);
  void _dump_extent_map(ExtentMap& em, int log_level=30);
  void _dump_transaction(Transaction *t, int log_level = 30);

  TransContext *_txc_create(OpSequencer *osr);
  void _txc_update_store_statfs(TransContext *txc);
  void _txc_add_transaction(TransContext *txc, Transaction *t);
  void _txc_calc_cost(TransContext *txc);
  void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
  void _txc_state_proc(TransContext *txc);
  void _txc_aio_submit(TransContext *txc);

  void txc_aio_finish(void *p) {
    _txc_state_proc(static_cast<TransContext*>(p));

  void _txc_finish_io(TransContext *txc);
  void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
  void _txc_applied_kv(TransContext *txc);
  void _txc_committed_kv(TransContext *txc);
  void _txc_finish(TransContext *txc);
  void _txc_release_alloc(TransContext *txc);

  void _osr_drain_preceding(TransContext *txc);
  void _osr_drain_all();
  void _osr_unregister_all();

  void _kv_sync_thread();
      std::lock_guard<std::mutex> l(kv_lock);
      kv_cond.notify_all();
      kv_sync_thread.join();
      std::lock_guard<std::mutex> l(kv_lock);

  bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
  void _deferred_queue(TransContext *txc);
  void deferred_try_submit() {
    std::lock_guard<std::mutex> l(deferred_lock);
    _deferred_try_submit();
  void _deferred_try_submit();
  void _deferred_submit(OpSequencer *osr);
  void _deferred_aio_finish(OpSequencer *osr);
  int _deferred_replay();

  using mempool_dynamic_bitset =
    boost::dynamic_bitset<uint64_t,
                          mempool::bluestore_fsck::pool_allocator<uint64_t>>;
  int _fsck_check_extents(
    const ghobject_t& oid,
    const PExtentVector& extents,
    mempool_dynamic_bitset &used_blocks,
    store_statfs_t& expected_statfs);

  void _buffer_cache_write(
    b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
    txc->shared_blobs_written.insert(b->shared_blob);

  int _collection_list(
    Collection *c, const ghobject_t& start, const ghobject_t& end,
    int max, vector<ghobject_t> *ls, ghobject_t *next);

  template <typename T, typename F>
  T select_option(const std::string& opt_name, T val1, F f) {
    //NB: opt_name reserved for future use
    boost::optional<T> val2 = f();

  void _apply_padding(uint64_t head_pad,
                      bufferlist& padded);

  // -- ondisk version ---
  const int32_t latest_ondisk_format = 2;        ///< our version
  const int32_t min_readable_ondisk_format = 1;  ///< what we can read
  const int32_t min_compat_ondisk_format = 2;    ///< who can read us

  int32_t ondisk_format = 0;  ///< value detected on mount

  int _upgrade_super();  ///< upgrade (called during open_super)
  void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);

  // --- public interface ---
  BlueStore(CephContext *cct, const string& path);
  BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
  ~BlueStore() override;
2056 string
get_type() override
{
2060 bool needs_journal() override
{ return false; };
2061 bool wants_journal() override
{ return false; };
2062 bool allows_journal() override
{ return false; };
2064 static int get_block_device_fsid(CephContext
* cct
, const string
& path
,
2067 bool test_mount_in_use() override
;
2070 int _mount(bool kv_only
);
2072 int mount() override
{
2073 return _mount(false);
2075 int umount() override
;
2077 int start_kv_only(KeyValueDB
**pdb
) {
2078 int r
= _mount(true);
2085 int fsck(bool deep
) override
;
2087 void set_cache_shards(unsigned num
) override
;
2089 int validate_hobject_key(const hobject_t
&obj
) const override
{
2092 unsigned get_max_attr_name_length() override
{
2093 return 256; // arbitrary; there is no real limit internally
2096 int mkfs() override
;
2097 int mkjournal() override
{
2101 void get_db_statistics(Formatter
*f
) override
;
2102 void generate_db_histogram(Formatter
*f
) override
;
2103 void flush_cache() override
;
2104 void dump_perf_counters(Formatter
*f
) override
{
2105 f
->open_object_section("perf_counters");
2106 logger
->dump_formatted(f
, false);
2110 void register_osr(OpSequencer
*osr
) {
2111 std::lock_guard
<std::mutex
> l(osr_lock
);
2112 osr_set
.insert(osr
);
2114 void unregister_osr(OpSequencer
*osr
) {
2115 std::lock_guard
<std::mutex
> l(osr_lock
);
2120 int statfs(struct store_statfs_t
*buf
) override
;
2122 void collect_metadata(map
<string
,string
> *pm
) override
;
2124 bool exists(const coll_t
& cid
, const ghobject_t
& oid
) override
;
2125 bool exists(CollectionHandle
&c
, const ghobject_t
& oid
) override
;
2126 int set_collection_opts(
2128 const pool_opts_t
& opts
) override
;
2131 const ghobject_t
& oid
,
2133 bool allow_eio
= false) override
;
2135 CollectionHandle
&c
,
2136 const ghobject_t
& oid
,
2138 bool allow_eio
= false) override
;
2141 const ghobject_t
& oid
,
2145 uint32_t op_flags
= 0,
2146 bool allow_eio
= false) override
;
2148 CollectionHandle
&c
,
2149 const ghobject_t
& oid
,
2153 uint32_t op_flags
= 0,
2154 bool allow_eio
= false) override
;
2161 uint32_t op_flags
= 0);
2164 int _fiemap(CollectionHandle
&c_
, const ghobject_t
& oid
,
2165 uint64_t offset
, size_t len
, interval_set
<uint64_t>& destset
);
2167 int fiemap(const coll_t
& cid
, const ghobject_t
& oid
,
2168 uint64_t offset
, size_t len
, bufferlist
& bl
) override
;
2169 int fiemap(CollectionHandle
&c
, const ghobject_t
& oid
,
2170 uint64_t offset
, size_t len
, bufferlist
& bl
) override
;
2171 int fiemap(const coll_t
& cid
, const ghobject_t
& oid
,
2172 uint64_t offset
, size_t len
, map
<uint64_t, uint64_t>& destmap
) override
;
2173 int fiemap(CollectionHandle
&c
, const ghobject_t
& oid
,
2174 uint64_t offset
, size_t len
, map
<uint64_t, uint64_t>& destmap
) override
;
2177 int getattr(const coll_t
& cid
, const ghobject_t
& oid
, const char *name
,
2178 bufferptr
& value
) override
;
2179 int getattr(CollectionHandle
&c
, const ghobject_t
& oid
, const char *name
,
2180 bufferptr
& value
) override
;
2182 int getattrs(const coll_t
& cid
, const ghobject_t
& oid
,
2183 map
<string
,bufferptr
>& aset
) override
;
2184 int getattrs(CollectionHandle
&c
, const ghobject_t
& oid
,
2185 map
<string
,bufferptr
>& aset
) override
;
2187 int list_collections(vector
<coll_t
>& ls
) override
;
2189 CollectionHandle
open_collection(const coll_t
&c
) override
;
2191 bool collection_exists(const coll_t
& c
) override
;
2192 int collection_empty(const coll_t
& c
, bool *empty
) override
;
2193 int collection_bits(const coll_t
& c
) override
;
2195 int collection_list(const coll_t
& cid
,
2196 const ghobject_t
& start
,
2197 const ghobject_t
& end
,
2199 vector
<ghobject_t
> *ls
, ghobject_t
*next
) override
;
2200 int collection_list(CollectionHandle
&c
,
2201 const ghobject_t
& start
,
2202 const ghobject_t
& end
,
2204 vector
<ghobject_t
> *ls
, ghobject_t
*next
) override
;
  int omap_get(
    const coll_t& cid,            ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    bufferlist *header,           ///< [out] omap header
    map<string, bufferlist> *out  ///< [out] Key to value map
    ) override;
  int omap_get(
    CollectionHandle &c,          ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    bufferlist *header,           ///< [out] omap header
    map<string, bufferlist> *out  ///< [out] Key to value map
    ) override;

  /// Get omap header
  int omap_get_header(
    const coll_t& cid,            ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    bufferlist *header,           ///< [out] omap header
    bool allow_eio = false        ///< [in] don't assert on eio
    ) override;
  int omap_get_header(
    CollectionHandle &c,          ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    bufferlist *header,           ///< [out] omap header
    bool allow_eio = false        ///< [in] don't assert on eio
    ) override;

  /// Get keys defined on oid
  int omap_get_keys(
    const coll_t& cid,            ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    set<string> *keys             ///< [out] Keys defined on oid
    ) override;
  int omap_get_keys(
    CollectionHandle &c,          ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    set<string> *keys             ///< [out] Keys defined on oid
    ) override;

  /// Get key values
  int omap_get_values(
    const coll_t& cid,            ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    const set<string> &keys,      ///< [in] Keys to get
    map<string, bufferlist> *out  ///< [out] Returned keys and values
    ) override;
  int omap_get_values(
    CollectionHandle &c,          ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    const set<string> &keys,      ///< [in] Keys to get
    map<string, bufferlist> *out  ///< [out] Returned keys and values
    ) override;

  /// Filters keys into out which are defined on oid
  int omap_check_keys(
    const coll_t& cid,            ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    const set<string> &keys,      ///< [in] Keys to check
    set<string> *out              ///< [out] Subset of keys defined on oid
    ) override;
  int omap_check_keys(
    CollectionHandle &c,          ///< [in] Collection containing oid
    const ghobject_t &oid,        ///< [in] Object containing omap
    const set<string> &keys,      ///< [in] Keys to check
    set<string> *out              ///< [out] Subset of keys defined on oid
    ) override;

  ObjectMap::ObjectMapIterator get_omap_iterator(
    const coll_t& cid,            ///< [in] collection
    const ghobject_t &oid         ///< [in] object
    ) override;
  ObjectMap::ObjectMapIterator get_omap_iterator(
    CollectionHandle &c,          ///< [in] collection
    const ghobject_t &oid         ///< [in] object
    ) override;
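
  // Hedged usage sketch: a hypothetical caller can walk all omap entries of
  // an object with the iterator interface instead of fetching them in bulk:
  //
  //   ObjectMap::ObjectMapIterator it = store->get_omap_iterator(ch, oid);
  //   for (it->seek_to_first(); it->valid(); it->next()) {
  //     process(it->key(), it->value());   // process() is a placeholder
  //   }
  //
  // This avoids materializing the whole map<string, bufferlist> that
  // omap_get() would return for a large omap.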
  void set_fsid(uuid_d u) override {
    fsid = u;
  }
  uuid_d get_fsid() override {
    return fsid;
  }

  uint64_t estimate_objects_overhead(uint64_t num_objects) override {
    return num_objects * 300;  // assuming per-object overhead is 300 bytes
  }
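
  // For example, under the 300-byte-per-object assumption above, one million
  // objects are estimated to add roughly 1,000,000 * 300 bytes of metadata
  // overhead (300 MB, about 286 MiB).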
  struct BSPerfTracker {
    PerfCounters::avg_tracker<uint64_t> os_commit_latency;
    PerfCounters::avg_tracker<uint64_t> os_apply_latency;

    objectstore_perf_stat_t get_cur_stats() const {
      objectstore_perf_stat_t ret;
      ret.os_commit_latency = os_commit_latency.avg();
      ret.os_apply_latency = os_apply_latency.avg();
      return ret;
    }

    void update_from_perfcounters(PerfCounters &logger);
  } perf_tracker;

  objectstore_perf_stat_t get_cur_stats() override {
    perf_tracker.update_from_perfcounters(*logger);
    return perf_tracker.get_cur_stats();
  }
  const PerfCounters* get_perf_counters() const override {
    return logger;
  }
  int queue_transactions(
    Sequencer *osr,
    vector<Transaction>& tls,
    TrackedOpRef op = TrackedOpRef(),
    ThreadPool::TPHandle *handle = NULL) override;
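
  // Hedged usage sketch: callers batch mutations into ObjectStore::Transaction
  // objects and submit them through a sequencer, e.g.:
  //
  //   ObjectStore::Sequencer osr("example");   // ordering domain (placeholder name)
  //   ObjectStore::Transaction t;
  //   t.write(cid, oid, 0, bl.length(), bl);   // queue a write into the txn
  //   vector<ObjectStore::Transaction> tls;
  //   tls.push_back(std::move(t));
  //   store->queue_transactions(&osr, tls);    // commits asynchronously
  //
  // `cid`, `oid`, `bl` and `store` are placeholders supplied by the caller.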
  void inject_data_error(const ghobject_t& o) override {
    RWLock::WLocker l(debug_read_error_lock);
    debug_data_error_objects.insert(o);
  }
  void inject_mdata_error(const ghobject_t& o) override {
    RWLock::WLocker l(debug_read_error_lock);
    debug_mdata_error_objects.insert(o);
  }
  bool _debug_data_eio(const ghobject_t& o) {
    if (!cct->_conf->bluestore_debug_inject_read_err) {
      return false;
    }
    RWLock::RLocker l(debug_read_error_lock);
    return debug_data_error_objects.count(o);
  }
  bool _debug_mdata_eio(const ghobject_t& o) {
    if (!cct->_conf->bluestore_debug_inject_read_err) {
      return false;
    }
    RWLock::RLocker l(debug_read_error_lock);
    return debug_mdata_error_objects.count(o);
  }
  void _debug_obj_on_delete(const ghobject_t& o) {
    if (cct->_conf->bluestore_debug_inject_read_err) {
      RWLock::WLocker l(debug_read_error_lock);
      debug_data_error_objects.erase(o);
      debug_mdata_error_objects.erase(o);
    }
  }
  // --------------------------------------------------------
  // read processing internal methods
  int _verify_csum(
    OnodeRef& o,
    const bluestore_blob_t* blob,
    uint64_t blob_xoffset,
    const bufferlist& bl,
    uint64_t logical_offset) const;
  int _decompress(bufferlist& source, bufferlist* result);
  // --------------------------------------------------------
  // write ops

  struct WriteContext {
    bool buffered = false;          ///< buffered write
    bool compress = false;          ///< compressed write
    uint64_t target_blob_size = 0;  ///< target (max) blob size
    unsigned csum_order = 0;        ///< target checksum chunk order

    old_extent_map_t old_extents;   ///< must deref these blobs

    struct write_item {
      uint64_t logical_offset;      ///< write logical offset
      BlobRef b;
      uint64_t blob_length;
      uint64_t b_off;
      bufferlist bl;
      uint64_t b_off0;              ///< original offset in a blob prior to padding
      uint64_t length0;             ///< original data length prior to padding

      bool mark_unused;
      bool new_blob;                ///< whether new blob was created

      write_item(
        uint64_t logical_offs,
        BlobRef b,
        uint64_t blob_len,
        uint64_t o,
        bufferlist& bl,
        uint64_t o0,
        uint64_t l0,
        bool _mark_unused,
        bool _new_blob)
       : logical_offset(logical_offs),
         b(b),
         blob_length(blob_len),
         b_off(o),
         bl(bl),
         b_off0(o0),
         length0(l0),
         mark_unused(_mark_unused),
         new_blob(_new_blob) {}
    };
    vector<write_item> writes;      ///< blobs we're writing

    /// partial clone of the context
    void fork(const WriteContext& other) {
      buffered = other.buffered;
      compress = other.compress;
      target_blob_size = other.target_blob_size;
      csum_order = other.csum_order;
    }
    void write(
      uint64_t loffs,
      BlobRef b,
      uint64_t blob_len,
      uint64_t o,
      bufferlist& bl,
      uint64_t o0,
      uint64_t len0,
      bool _mark_unused,
      bool _new_blob) {
      writes.emplace_back(loffs, b, blob_len, o, bl, o0, len0,
                          _mark_unused, _new_blob);
    }
    /// Checks for writes to the same pextent within a blob
    bool has_conflict(
      BlobRef b,
      uint64_t loffs,
      uint64_t loffs_end,
      uint64_t min_alloc_size);
  };
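
  // Hedged illustration: a write path would populate a WriteContext, append
  // one write_item per blob it touches, and consult has_conflict() before
  // coalescing, e.g.:
  //
  //   WriteContext wctx;
  //   wctx.fork(outer_wctx);   // inherit buffered/compress/csum settings
  //   wctx.write(off, blob, blob_len, b_off, data_bl, b_off, data_bl.length(),
  //              /*mark_unused=*/false, /*new_blob=*/true);
  //
  // The variable names are placeholders; the real call sites are the
  // _do_write_*() implementations declared below.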
  void _do_write_small(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx);
  void _do_write_big(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx);
  int _do_alloc_write(
    TransContext *txc,
    CollectionRef c,
    OnodeRef o,
    WriteContext *wctx);
  void _wctx_finish(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef o,
    WriteContext *wctx);

  int _do_transaction(Transaction *t,
                      TransContext *txc,
                      ThreadPool::TPHandle *handle);

  int _write(TransContext *txc,
             CollectionRef& c,
             OnodeRef& o,
             uint64_t offset, size_t len,
             bufferlist& bl,
             uint32_t fadvise_flags);
  void _pad_zeros(bufferlist *bl, uint64_t *offset,
                  uint64_t chunk_size);

  int _do_write(TransContext *txc,
                CollectionRef& c,
                OnodeRef o,
                uint64_t offset, uint64_t length,
                bufferlist& bl,
                uint32_t fadvise_flags);
  void _do_write_data(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef o,
                      uint64_t offset,
                      uint64_t length,
                      bufferlist& bl,
                      WriteContext *wctx);

  int _touch(TransContext *txc,
             CollectionRef& c,
             OnodeRef& o);
  int _do_zero(TransContext *txc,
               CollectionRef& c,
               OnodeRef& o,
               uint64_t offset, size_t len);
  int _zero(TransContext *txc,
            CollectionRef& c,
            OnodeRef& o,
            uint64_t offset, size_t len);
  void _do_truncate(TransContext *txc,
                    CollectionRef& c,
                    OnodeRef o,
                    uint64_t offset);
  void _truncate(TransContext *txc,
                 CollectionRef& c,
                 OnodeRef& o,
                 uint64_t offset);
  int _remove(TransContext *txc,
              CollectionRef& c,
              OnodeRef& o);
  int _do_remove(TransContext *txc,
                 CollectionRef& c,
                 OnodeRef o);
  int _setattr(TransContext *txc,
               CollectionRef& c,
               OnodeRef& o,
               const string& name,
               bufferptr& val);
  int _setattrs(TransContext *txc,
                CollectionRef& c,
                OnodeRef& o,
                const map<string,bufferptr>& aset);
  int _rmattr(TransContext *txc,
              CollectionRef& c,
              OnodeRef& o,
              const string& name);
  int _rmattrs(TransContext *txc,
               CollectionRef& c,
               OnodeRef& o);
  void _do_omap_clear(TransContext *txc, uint64_t id);
  int _omap_clear(TransContext *txc,
                  CollectionRef& c,
                  OnodeRef& o);
  int _omap_setkeys(TransContext *txc,
                    CollectionRef& c,
                    OnodeRef& o,
                    bufferlist& bl);
  int _omap_setheader(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& o,
                      bufferlist& header);
  int _omap_rmkeys(TransContext *txc,
                   CollectionRef& c,
                   OnodeRef& o,
                   bufferlist& bl);
  int _omap_rmkey_range(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        const string& first, const string& last);
  int _set_alloc_hint(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o,
    uint64_t expected_object_size,
    uint64_t expected_write_size,
    uint32_t flags);
  int _do_clone_range(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& oldo,
                      OnodeRef& newo,
                      uint64_t srcoff, uint64_t length, uint64_t dstoff);
  int _clone(TransContext *txc,
             CollectionRef& c,
             OnodeRef& oldo,
             OnodeRef& newo);
  int _clone_range(TransContext *txc,
                   CollectionRef& c,
                   OnodeRef& oldo,
                   OnodeRef& newo,
                   uint64_t srcoff, uint64_t length, uint64_t dstoff);
  int _rename(TransContext *txc,
              CollectionRef& c,
              OnodeRef& oldo,
              OnodeRef& newo,
              const ghobject_t& new_oid);
  int _create_collection(TransContext *txc, const coll_t &cid,
                         unsigned bits, CollectionRef *c);
  int _remove_collection(TransContext *txc, const coll_t &cid,
                         CollectionRef *c);
  int _split_collection(TransContext *txc,
                        CollectionRef& c,
                        CollectionRef& d,
                        unsigned bits, int rem);
};
inline ostream& operator<<(ostream& out, const BlueStore::OpSequencer& s) {
  return out << *s.parent;
}

static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
  o->get();
}
static inline void intrusive_ptr_release(BlueStore::Onode *o) {
  o->put();
}

static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
  o->get();
}
static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
  o->put();
}
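
// Hedged note: boost::intrusive_ptr<T> relies on free functions named
// intrusive_ptr_add_ref()/intrusive_ptr_release() being found via ADL, which
// is why the overloads above exist for Onode and OpSequencer. A hypothetical
// caller only ever manipulates the smart pointer, e.g.:
//
//   BlueStore::OnodeRef ref(raw_onode);   // calls intrusive_ptr_add_ref(raw_onode)
//   ref.reset();                          // calls intrusive_ptr_release(raw_onode)
//
// `raw_onode` is a placeholder for an Onode* obtained from the onode cache.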