1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #include <sys/types.h>
21 #include <boost/container/flat_set.hpp>
22 #include "boost/algorithm/string.hpp"
24 #include "include/cpp-btree/btree_set.h"
26 #include "BlueStore.h"
27 #include "bluestore_common.h"
29 #include "include/compat.h"
30 #include "include/intarith.h"
31 #include "include/stringify.h"
32 #include "include/str_map.h"
33 #include "include/util.h"
34 #include "common/errno.h"
35 #include "common/safe_io.h"
36 #include "common/PriorityCache.h"
37 #include "common/RWLock.h"
38 #include "Allocator.h"
39 #include "FreelistManager.h"
41 #include "BlueRocksEnv.h"
42 #include "auth/Crypto.h"
43 #include "common/EventTrace.h"
44 #include "perfglue/heap_profiler.h"
45 #include "common/blkdev.h"
46 #include "common/numa.h"
47 #include "common/pretty_binary.h"
49 #if defined(WITH_LTTNG)
50 #define TRACEPOINT_DEFINE
51 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
52 #include "tracing/bluestore.h"
53 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
54 #undef TRACEPOINT_DEFINE
56 #define tracepoint(...)
59 #define dout_context cct
60 #define dout_subsys ceph_subsys_bluestore
62 using bid_t
= decltype(BlueStore::Blob::id
);
64 // bluestore_cache_onode
65 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode
, bluestore_onode
,
66 bluestore_cache_onode
);
68 // bluestore_cache_other
69 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer
, bluestore_buffer
,
71 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent
, bluestore_extent
,
73 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob
, bluestore_blob
,
75 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob
, bluestore_shared_blob
,
76 bluestore_SharedBlob
);
79 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext
, bluestore_transcontext
,
84 using std::numeric_limits
;
90 using std::ostringstream
;
93 using std::stringstream
;
96 using ceph::bufferlist
;
97 using ceph::bufferptr
;
98 using ceph::coarse_mono_clock
;
101 using ceph::Formatter
;
102 using ceph::JSONFormatter
;
103 using ceph::make_timespan
;
104 using ceph::mono_clock
;
105 using ceph::mono_time
;
106 using ceph::timespan_str
;
109 const string PREFIX_SUPER
= "S"; // field -> value
110 const string PREFIX_STAT
= "T"; // field -> value(int64 array)
111 const string PREFIX_COLL
= "C"; // collection name -> cnode_t
112 const string PREFIX_OBJ
= "O"; // object name -> onode_t
113 const string PREFIX_OMAP
= "M"; // u64 + keyname -> value
114 const string PREFIX_PGMETA_OMAP
= "P"; // u64 + keyname -> value(for meta coll)
115 const string PREFIX_PERPOOL_OMAP
= "m"; // s64 + u64 + keyname -> value
116 const string PREFIX_PERPG_OMAP
= "p"; // u64(pool) + u32(hash) + u64(id) + keyname -> value
117 const string PREFIX_DEFERRED
= "L"; // id -> deferred_transaction_t
118 const string PREFIX_ALLOC
= "B"; // u64 offset -> u64 length (freelist)
119 const string PREFIX_ALLOC_BITMAP
= "b";// (see BitmapFreelistManager)
120 const string PREFIX_SHARED_BLOB
= "X"; // u64 offset -> shared_blob_t
121 const string PREFIX_ZONED_FM_META
= "Z"; // (see ZonedFreelistManager)
122 const string PREFIX_ZONED_FM_INFO
= "z"; // (see ZonedFreelistManager)
123 const string PREFIX_ZONED_CL_INFO
= "G"; // (per-zone cleaner metadata)
125 const string BLUESTORE_GLOBAL_STATFS_KEY
= "bluestore_statfs";
127 // write a label in the first block. always use this size. note that
128 // bluefs makes a matching assumption about the location of its
129 // superblock (always the second block of the device).
130 #define BDEV_LABEL_BLOCK_SIZE 4096
132 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
133 #define SUPER_RESERVED 8192
135 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
139 * extent map blob encoding
141 * we use the low bits of the blobid field to indicate some common scenarios
142 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
144 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
145 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
146 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
147 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
148 #define BLOBID_SHIFT_BITS 4
151 * object name key structure
153 * encoded u8: shard + 2^7 (so that it sorts properly)
154 * encoded u64: poolid + 2^63 (so that it sorts properly)
155 * encoded u32: hash (bit reversed)
157 * escaped string: namespace
159 * escaped string: key or object name
160 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
161 * we are done. otherwise, we are followed by the object name.
162 * escaped string: object name (unless '=' above)
165 * encoded u64: generation
168 #define ONODE_KEY_SUFFIX 'o'
177 #define EXTENT_SHARD_KEY_SUFFIX 'x'
180 * string encoding in the key
182 * The key string needs to lexicographically sort the same way that
183 * ghobject_t does. We do this by escaping anything <= to '#' with #
184 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
187 * We use ! as a terminator for strings; this works because it is < #
188 * and will get escaped if it is present in the string.
190 * NOTE: There is a bug in this implementation: due to implicit
191 * character type conversion in comparison it may produce unexpected
192 * ordering. Unfortunately fixing the bug would mean invalidating the
193 * keys in existing deployments. Instead we do additional sorting
194 * where it is needed.
197 static void append_escaped(const string
&in
, S
*out
)
199 char hexbyte
[in
.length() * 3 + 1];
200 char* ptr
= &hexbyte
[0];
201 for (string::const_iterator i
= in
.begin(); i
!= in
.end(); ++i
) {
202 if (*i
<= '#') { // bug: unexpected result for *i > 0x7f
204 *ptr
++ = "0123456789abcdef"[(*i
>> 4) & 0x0f];
205 *ptr
++ = "0123456789abcdef"[*i
& 0x0f];
206 } else if (*i
>= '~') { // bug: unexpected result for *i > 0x7f
208 *ptr
++ = "0123456789abcdef"[(*i
>> 4) & 0x0f];
209 *ptr
++ = "0123456789abcdef"[*i
& 0x0f];
215 out
->append(hexbyte
, ptr
- &hexbyte
[0]);
// Convert one hex digit to its value.
// Returns 0..15 for a valid digit ('0'-'9', 'a'-'f', 'A'-'F');
// returns 256 for anything else so callers can detect invalid input
// with a single "> 255" / "> 0xff" style comparison.
inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - '0';
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}
231 static int decode_escaped(const char *p
, string
*out
)
234 char* ptr
= &buff
[0];
235 char* max
= &buff
[252];
236 const char *orig_p
= p
;
237 while (*p
&& *p
!= '!') {
238 if (*p
== '#' || *p
== '~') {
241 hex
= h2i(*p
++) << 4;
254 out
->append(buff
, ptr
-buff
);
259 out
->append(buff
, ptr
-buff
);
265 static void _key_encode_shard(shard_id_t shard
, T
*key
)
267 key
->push_back((char)((uint8_t)shard
.id
+ (uint8_t)0x80));
270 static const char *_key_decode_shard(const char *key
, shard_id_t
*pshard
)
272 pshard
->id
= (uint8_t)*key
- (uint8_t)0x80;
276 static void get_coll_range(const coll_t
& cid
, int bits
,
277 ghobject_t
*temp_start
, ghobject_t
*temp_end
,
278 ghobject_t
*start
, ghobject_t
*end
)
281 if (cid
.is_pg(&pgid
)) {
282 start
->shard_id
= pgid
.shard
;
283 *temp_start
= *start
;
285 start
->hobj
.pool
= pgid
.pool();
286 temp_start
->hobj
.pool
= -2ll - pgid
.pool();
289 *temp_end
= *temp_start
;
291 uint32_t reverse_hash
= hobject_t::_reverse_bits(pgid
.ps());
292 start
->hobj
.set_bitwise_key_u32(reverse_hash
);
293 temp_start
->hobj
.set_bitwise_key_u32(reverse_hash
);
295 uint64_t end_hash
= reverse_hash
+ (1ull << (32 - bits
));
296 if (end_hash
> 0xffffffffull
)
297 end_hash
= 0xffffffffull
;
299 end
->hobj
.set_bitwise_key_u32(end_hash
);
300 temp_end
->hobj
.set_bitwise_key_u32(end_hash
);
302 start
->shard_id
= shard_id_t::NO_SHARD
;
303 start
->hobj
.pool
= -1ull;
306 start
->hobj
.set_bitwise_key_u32(0);
307 end
->hobj
.set_bitwise_key_u32(0xffffffff);
309 // no separate temp section
314 start
->generation
= 0;
316 temp_start
->generation
= 0;
317 temp_end
->generation
= 0;
320 static void get_shared_blob_key(uint64_t sbid
, string
*key
)
323 _key_encode_u64(sbid
, key
);
326 static int get_key_shared_blob(const string
& key
, uint64_t *sbid
)
328 const char *p
= key
.c_str();
329 if (key
.length() < sizeof(uint64_t))
331 _key_decode_u64(p
, sbid
);
336 static void _key_encode_prefix(const ghobject_t
& oid
, S
*key
)
338 _key_encode_shard(oid
.shard_id
, key
);
339 _key_encode_u64(oid
.hobj
.pool
+ 0x8000000000000000ull
, key
);
340 _key_encode_u32(oid
.hobj
.get_bitwise_key_u32(), key
);
343 static const char *_key_decode_prefix(const char *p
, ghobject_t
*oid
)
345 p
= _key_decode_shard(p
, &oid
->shard_id
);
348 p
= _key_decode_u64(p
, &pool
);
349 oid
->hobj
.pool
= pool
- 0x8000000000000000ull
;
352 p
= _key_decode_u32(p
, &hash
);
354 oid
->hobj
.set_bitwise_key_u32(hash
);
359 #define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
362 static int get_key_object(const S
& key
, ghobject_t
*oid
)
365 const char *p
= key
.c_str();
367 if (key
.length() < ENCODED_KEY_PREFIX_LEN
)
370 p
= _key_decode_prefix(p
, oid
);
372 if (key
.length() == ENCODED_KEY_PREFIX_LEN
)
375 r
= decode_escaped(p
, &oid
->hobj
.nspace
);
381 r
= decode_escaped(p
, &k
);
388 oid
->hobj
.oid
.name
= k
;
389 } else if (*p
== '<' || *p
== '>') {
392 r
= decode_escaped(p
, &oid
->hobj
.oid
.name
);
396 oid
->hobj
.set_key(k
);
402 p
= _key_decode_u64(p
, &oid
->hobj
.snap
.val
);
403 p
= _key_decode_u64(p
, &oid
->generation
);
405 if (*p
!= ONODE_KEY_SUFFIX
) {
410 // if we get something other than a null terminator here,
411 // something goes wrong.
419 static void get_object_key(CephContext
*cct
, const ghobject_t
& oid
, S
*key
)
423 size_t max_len
= ENCODED_KEY_PREFIX_LEN
+
424 (oid
.hobj
.nspace
.length() * 3 + 1) +
425 (oid
.hobj
.get_key().length() * 3 + 1) +
426 1 + // for '<', '=', or '>'
427 (oid
.hobj
.oid
.name
.length() * 3 + 1) +
429 key
->reserve(max_len
);
431 _key_encode_prefix(oid
, key
);
433 append_escaped(oid
.hobj
.nspace
, key
);
435 if (oid
.hobj
.get_key().length()) {
436 // is a key... could be < = or >.
437 append_escaped(oid
.hobj
.get_key(), key
);
438 // (ASCII chars < = and > sort in that order, yay)
439 int r
= oid
.hobj
.get_key().compare(oid
.hobj
.oid
.name
);
441 key
->append(r
> 0 ? ">" : "<");
442 append_escaped(oid
.hobj
.oid
.name
, key
);
449 append_escaped(oid
.hobj
.oid
.name
, key
);
453 _key_encode_u64(oid
.hobj
.snap
, key
);
454 _key_encode_u64(oid
.generation
, key
);
456 key
->push_back(ONODE_KEY_SUFFIX
);
461 int r
= get_key_object(*key
, &t
);
463 derr
<< " r " << r
<< dendl
;
464 derr
<< "key " << pretty_binary_string(*key
) << dendl
;
465 derr
<< "oid " << oid
<< dendl
;
466 derr
<< " t " << t
<< dendl
;
467 ceph_assert(r
== 0 && t
== oid
);
472 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
473 // char lets us quickly test whether it is a shard key without decoding any
474 // of the prefix bytes.
476 static void get_extent_shard_key(const S
& onode_key
, uint32_t offset
,
480 key
->reserve(onode_key
.length() + 4 + 1);
481 key
->append(onode_key
.c_str(), onode_key
.size());
482 _key_encode_u32(offset
, key
);
483 key
->push_back(EXTENT_SHARD_KEY_SUFFIX
);
486 static void rewrite_extent_shard_key(uint32_t offset
, string
*key
)
488 ceph_assert(key
->size() > sizeof(uint32_t) + 1);
489 ceph_assert(*key
->rbegin() == EXTENT_SHARD_KEY_SUFFIX
);
490 _key_encode_u32(offset
, key
->size() - sizeof(uint32_t) - 1, key
);
494 static void generate_extent_shard_key_and_apply(
498 std::function
<void(const string
& final_key
)> apply
)
500 if (key
->empty()) { // make full key
501 ceph_assert(!onode_key
.empty());
502 get_extent_shard_key(onode_key
, offset
, key
);
504 rewrite_extent_shard_key(offset
, key
);
509 int get_key_extent_shard(const string
& key
, string
*onode_key
, uint32_t *offset
)
511 ceph_assert(key
.size() > sizeof(uint32_t) + 1);
512 ceph_assert(*key
.rbegin() == EXTENT_SHARD_KEY_SUFFIX
);
513 int okey_len
= key
.size() - sizeof(uint32_t) - 1;
514 *onode_key
= key
.substr(0, okey_len
);
515 const char *p
= key
.data() + okey_len
;
516 _key_decode_u32(p
, offset
);
520 static bool is_extent_shard_key(const string
& key
)
522 return *key
.rbegin() == EXTENT_SHARD_KEY_SUFFIX
;
525 static void get_deferred_key(uint64_t seq
, string
*out
)
527 _key_encode_u64(seq
, out
);
530 static void get_pool_stat_key(int64_t pool_id
, string
*key
)
533 _key_encode_u64(pool_id
, key
);
536 static int get_key_pool_stat(const string
& key
, uint64_t* pool_id
)
538 const char *p
= key
.c_str();
539 if (key
.length() < sizeof(uint64_t))
541 _key_decode_u64(p
, pool_id
);
546 template <int LogLevelV
>
547 void _dump_extent_map(CephContext
*cct
, const BlueStore::ExtentMap
&em
)
550 for (auto& s
: em
.shards
) {
551 dout(LogLevelV
) << __func__
<< " shard " << *s
.shard_info
552 << (s
.loaded
? " (loaded)" : "")
553 << (s
.dirty
? " (dirty)" : "")
556 for (auto& e
: em
.extent_map
) {
557 dout(LogLevelV
) << __func__
<< " " << e
<< dendl
;
558 ceph_assert(e
.logical_offset
>= pos
);
559 pos
= e
.logical_offset
+ e
.length
;
560 const bluestore_blob_t
& blob
= e
.blob
->get_blob();
561 if (blob
.has_csum()) {
563 unsigned n
= blob
.get_csum_count();
564 for (unsigned i
= 0; i
< n
; ++i
)
565 v
.push_back(blob
.get_csum_item(i
));
566 dout(LogLevelV
) << __func__
<< " csum: " << std::hex
<< v
<< std::dec
569 std::lock_guard
l(e
.blob
->shared_blob
->get_cache()->lock
);
570 for (auto& i
: e
.blob
->shared_blob
->bc
.buffer_map
) {
571 dout(LogLevelV
) << __func__
<< " 0x" << std::hex
<< i
.first
572 << "~" << i
.second
->length
<< std::dec
573 << " " << *i
.second
<< dendl
;
578 template <int LogLevelV
>
579 void _dump_onode(CephContext
*cct
, const BlueStore::Onode
& o
)
581 if (!cct
->_conf
->subsys
.should_gather
<ceph_subsys_bluestore
, LogLevelV
>())
583 dout(LogLevelV
) << __func__
<< " " << &o
<< " " << o
.oid
584 << " nid " << o
.onode
.nid
585 << " size 0x" << std::hex
<< o
.onode
.size
586 << " (" << std::dec
<< o
.onode
.size
<< ")"
587 << " expected_object_size " << o
.onode
.expected_object_size
588 << " expected_write_size " << o
.onode
.expected_write_size
589 << " in " << o
.onode
.extent_map_shards
.size() << " shards"
590 << ", " << o
.extent_map
.spanning_blob_map
.size()
593 for (auto p
= o
.onode
.attrs
.begin();
594 p
!= o
.onode
.attrs
.end();
596 dout(LogLevelV
) << __func__
<< " attr " << p
->first
597 << " len " << p
->second
.length() << dendl
;
599 _dump_extent_map
<LogLevelV
>(cct
, o
.extent_map
);
602 template <int LogLevelV
>
603 void _dump_transaction(CephContext
*cct
, ObjectStore::Transaction
*t
)
605 dout(LogLevelV
) << __func__
<< " transaction dump:\n";
606 JSONFormatter
f(true);
607 f
.open_object_section("transaction");
616 ostream
& operator<<(ostream
& out
, const BlueStore::Buffer
& b
)
618 out
<< "buffer(" << &b
<< " space " << b
.space
<< " 0x" << std::hex
619 << b
.offset
<< "~" << b
.length
<< std::dec
620 << " " << BlueStore::Buffer::get_state_name(b
.state
);
622 out
<< " " << BlueStore::Buffer::get_flag_name(b
.flags
);
629 * Due to a bug in key string encoding (see a comment for append_escaped)
630 * the KeyValueDB iterator does not lexicographically sort the same
631 * way that ghobject_t does: objects with the same hash may have wrong order.
633 * This is the iterator wrapper that fixes the keys order.
636 class CollectionListIterator
{
638 CollectionListIterator(const KeyValueDB::Iterator
&it
)
641 virtual ~CollectionListIterator() {
644 virtual bool valid() const = 0;
645 virtual const ghobject_t
&oid() const = 0;
646 virtual void lower_bound(const ghobject_t
&oid
) = 0;
647 virtual void upper_bound(const ghobject_t
&oid
) = 0;
648 virtual void next() = 0;
650 virtual int cmp(const ghobject_t
&oid
) const = 0;
652 bool is_ge(const ghobject_t
&oid
) const {
653 return cmp(oid
) >= 0;
656 bool is_lt(const ghobject_t
&oid
) const {
661 KeyValueDB::Iterator m_it
;
664 class SimpleCollectionListIterator
: public CollectionListIterator
{
666 SimpleCollectionListIterator(CephContext
*cct
, const KeyValueDB::Iterator
&it
)
667 : CollectionListIterator(it
), m_cct(cct
) {
670 bool valid() const override
{
671 return m_it
->valid();
674 const ghobject_t
&oid() const override
{
675 ceph_assert(valid());
680 void lower_bound(const ghobject_t
&oid
) override
{
682 get_object_key(m_cct
, oid
, &key
);
684 m_it
->lower_bound(key
);
688 void upper_bound(const ghobject_t
&oid
) override
{
690 get_object_key(m_cct
, oid
, &key
);
692 m_it
->upper_bound(key
);
696 void next() override
{
697 ceph_assert(valid());
703 int cmp(const ghobject_t
&oid
) const override
{
704 ceph_assert(valid());
707 get_object_key(m_cct
, oid
, &key
);
709 return m_it
->key().compare(key
);
717 m_oid
= ghobject_t();
718 while (m_it
->valid() && is_extent_shard_key(m_it
->key())) {
725 int r
= get_key_object(m_it
->key(), &m_oid
);
730 class SortedCollectionListIterator
: public CollectionListIterator
{
732 SortedCollectionListIterator(const KeyValueDB::Iterator
&it
)
733 : CollectionListIterator(it
), m_chunk_iter(m_chunk
.end()) {
736 bool valid() const override
{
737 return m_chunk_iter
!= m_chunk
.end();
740 const ghobject_t
&oid() const override
{
741 ceph_assert(valid());
743 return m_chunk_iter
->first
;
746 void lower_bound(const ghobject_t
&oid
) override
{
748 _key_encode_prefix(oid
, &key
);
750 m_it
->lower_bound(key
);
751 m_chunk_iter
= m_chunk
.end();
752 if (!get_next_chunk()) {
756 if (this->oid().shard_id
!= oid
.shard_id
||
757 this->oid().hobj
.pool
!= oid
.hobj
.pool
||
758 this->oid().hobj
.get_bitwise_key_u32() != oid
.hobj
.get_bitwise_key_u32()) {
762 m_chunk_iter
= m_chunk
.lower_bound(oid
);
763 if (m_chunk_iter
== m_chunk
.end()) {
768 void upper_bound(const ghobject_t
&oid
) override
{
771 if (valid() && this->oid() == oid
) {
776 void next() override
{
777 ceph_assert(valid());
780 if (m_chunk_iter
== m_chunk
.end()) {
785 int cmp(const ghobject_t
&oid
) const override
{
786 ceph_assert(valid());
788 if (this->oid() < oid
) {
791 if (this->oid() > oid
) {
798 std::map
<ghobject_t
, std::string
> m_chunk
;
799 std::map
<ghobject_t
, std::string
>::iterator m_chunk_iter
;
801 bool get_next_chunk() {
802 while (m_it
->valid() && is_extent_shard_key(m_it
->key())) {
806 if (!m_it
->valid()) {
811 int r
= get_key_object(m_it
->key(), &oid
);
816 m_chunk
.insert({oid
, m_it
->key()});
820 } while (m_it
->valid() && is_extent_shard_key(m_it
->key()));
822 if (!m_it
->valid()) {
827 r
= get_key_object(m_it
->key(), &next
);
829 if (next
.shard_id
!= oid
.shard_id
||
830 next
.hobj
.pool
!= oid
.hobj
.pool
||
831 next
.hobj
.get_bitwise_key_u32() != oid
.hobj
.get_bitwise_key_u32()) {
837 m_chunk_iter
= m_chunk
.begin();
842 } // anonymous namespace
846 void BlueStore::GarbageCollector::process_protrusive_extents(
847 const BlueStore::ExtentMap
& extent_map
,
848 uint64_t start_offset
,
850 uint64_t start_touch_offset
,
851 uint64_t end_touch_offset
,
852 uint64_t min_alloc_size
)
854 ceph_assert(start_offset
<= start_touch_offset
&& end_offset
>= end_touch_offset
);
856 uint64_t lookup_start_offset
= p2align(start_offset
, min_alloc_size
);
857 uint64_t lookup_end_offset
= round_up_to(end_offset
, min_alloc_size
);
859 dout(30) << __func__
<< " (hex): [" << std::hex
860 << lookup_start_offset
<< ", " << lookup_end_offset
861 << ")" << std::dec
<< dendl
;
863 for (auto it
= extent_map
.seek_lextent(lookup_start_offset
);
864 it
!= extent_map
.extent_map
.end() &&
865 it
->logical_offset
< lookup_end_offset
;
867 uint64_t alloc_unit_start
= it
->logical_offset
/ min_alloc_size
;
868 uint64_t alloc_unit_end
= (it
->logical_end() - 1) / min_alloc_size
;
870 dout(30) << __func__
<< " " << *it
871 << "alloc_units: " << alloc_unit_start
<< ".." << alloc_unit_end
874 Blob
* b
= it
->blob
.get();
876 if (it
->logical_offset
>=start_touch_offset
&&
877 it
->logical_end() <= end_touch_offset
) {
878 // Process extents within the range affected by
879 // the current write request.
880 // Need to take into account if existing extents
881 // can be merged with them (uncompressed case)
882 if (!b
->get_blob().is_compressed()) {
883 if (blob_info_counted
&& used_alloc_unit
== alloc_unit_start
) {
884 --blob_info_counted
->expected_allocations
; // don't need to allocate
885 // new AU for compressed
886 // data since another
887 // collocated uncompressed
888 // blob already exists
889 dout(30) << __func__
<< " --expected:"
890 << alloc_unit_start
<< dendl
;
892 used_alloc_unit
= alloc_unit_end
;
893 blob_info_counted
= nullptr;
895 } else if (b
->get_blob().is_compressed()) {
897 // additionally we take compressed blobs that were not impacted
898 // by the write into account too
900 affected_blobs
.emplace(
901 b
, BlobInfo(b
->get_referenced_bytes())).first
->second
;
904 (used_alloc_unit
&& used_alloc_unit
== alloc_unit_start
) ? 0 : 1;
905 bi
.expected_allocations
+= alloc_unit_end
- alloc_unit_start
+ adjust
;
906 dout(30) << __func__
<< " expected_allocations="
907 << bi
.expected_allocations
<< " end_au:"
908 << alloc_unit_end
<< dendl
;
910 blob_info_counted
= &bi
;
911 used_alloc_unit
= alloc_unit_end
;
913 ceph_assert(it
->length
<= bi
.referenced_bytes
);
914 bi
.referenced_bytes
-= it
->length
;
915 dout(30) << __func__
<< " affected_blob:" << *b
916 << " unref 0x" << std::hex
<< it
->length
917 << " referenced = 0x" << bi
.referenced_bytes
918 << std::dec
<< dendl
;
919 // NOTE: we can't move specific blob to resulting GC list here
920 // when reference counter == 0 since subsequent extents might
921 // decrement its expected_allocation.
922 // Hence need to enumerate all the extents first.
923 if (!bi
.collect_candidate
) {
924 bi
.first_lextent
= it
;
925 bi
.collect_candidate
= true;
927 bi
.last_lextent
= it
;
929 if (blob_info_counted
&& used_alloc_unit
== alloc_unit_start
) {
930 // don't need to allocate new AU for compressed data since another
931 // collocated uncompressed blob already exists
932 --blob_info_counted
->expected_allocations
;
933 dout(30) << __func__
<< " --expected_allocations:"
934 << alloc_unit_start
<< dendl
;
936 used_alloc_unit
= alloc_unit_end
;
937 blob_info_counted
= nullptr;
941 for (auto b_it
= affected_blobs
.begin();
942 b_it
!= affected_blobs
.end();
944 Blob
* b
= b_it
->first
;
945 BlobInfo
& bi
= b_it
->second
;
946 if (bi
.referenced_bytes
== 0) {
947 uint64_t len_on_disk
= b_it
->first
->get_blob().get_ondisk_length();
948 int64_t blob_expected_for_release
=
949 round_up_to(len_on_disk
, min_alloc_size
) / min_alloc_size
;
951 dout(30) << __func__
<< " " << *(b_it
->first
)
952 << " expected4release=" << blob_expected_for_release
953 << " expected_allocations=" << bi
.expected_allocations
955 int64_t benefit
= blob_expected_for_release
- bi
.expected_allocations
;
956 if (benefit
>= g_conf()->bluestore_gc_enable_blob_threshold
) {
957 if (bi
.collect_candidate
) {
958 auto it
= bi
.first_lextent
;
961 if (it
->blob
.get() == b
) {
962 extents_to_collect
.insert(it
->logical_offset
, it
->length
);
964 bExit
= it
== bi
.last_lextent
;
968 expected_for_release
+= blob_expected_for_release
;
969 expected_allocations
+= bi
.expected_allocations
;
975 int64_t BlueStore::GarbageCollector::estimate(
976 uint64_t start_offset
,
978 const BlueStore::ExtentMap
& extent_map
,
979 const BlueStore::old_extent_map_t
& old_extents
,
980 uint64_t min_alloc_size
)
983 affected_blobs
.clear();
984 extents_to_collect
.clear();
985 used_alloc_unit
= boost::optional
<uint64_t >();
986 blob_info_counted
= nullptr;
988 uint64_t gc_start_offset
= start_offset
;
989 uint64_t gc_end_offset
= start_offset
+ length
;
991 uint64_t end_offset
= start_offset
+ length
;
993 for (auto it
= old_extents
.begin(); it
!= old_extents
.end(); ++it
) {
994 Blob
* b
= it
->e
.blob
.get();
995 if (b
->get_blob().is_compressed()) {
997 // update gc_start_offset/gc_end_offset if needed
998 gc_start_offset
= min(gc_start_offset
, (uint64_t)it
->e
.blob_start());
999 gc_end_offset
= std::max(gc_end_offset
, (uint64_t)it
->e
.blob_end());
1001 auto o
= it
->e
.logical_offset
;
1002 auto l
= it
->e
.length
;
1004 uint64_t ref_bytes
= b
->get_referenced_bytes();
1005 // micro optimization to bypass blobs that have no more references
1006 if (ref_bytes
!= 0) {
1007 dout(30) << __func__
<< " affected_blob:" << *b
1008 << " unref 0x" << std::hex
<< o
<< "~" << l
1009 << std::dec
<< dendl
;
1010 affected_blobs
.emplace(b
, BlobInfo(ref_bytes
));
1014 dout(30) << __func__
<< " gc range(hex): [" << std::hex
1015 << gc_start_offset
<< ", " << gc_end_offset
1016 << ")" << std::dec
<< dendl
;
1018 // enumerate preceeding extents to check if they reference affected blobs
1019 if (gc_start_offset
< start_offset
|| gc_end_offset
> end_offset
) {
1020 process_protrusive_extents(extent_map
,
1027 return expected_for_release
- expected_allocations
;
1030 // LruOnodeCacheShard
1031 struct LruOnodeCacheShard
: public BlueStore::OnodeCacheShard
{
1032 typedef boost::intrusive::list
<
1034 boost::intrusive::member_hook
<
1036 boost::intrusive::list_member_hook
<>,
1037 &BlueStore::Onode::lru_item
> > list_t
;
1041 explicit LruOnodeCacheShard(CephContext
*cct
) : BlueStore::OnodeCacheShard(cct
) {}
1043 void _add(BlueStore::Onode
* o
, int level
) override
1045 if (o
->put_cache()) {
1046 (level
> 0) ? lru
.push_front(*o
) : lru
.push_back(*o
);
1050 ++num
; // we count both pinned and unpinned entries
1051 dout(20) << __func__
<< " " << this << " " << o
->oid
<< " added, num=" << num
<< dendl
;
1053 void _rm(BlueStore::Onode
* o
) override
1055 if (o
->pop_cache()) {
1056 lru
.erase(lru
.iterator_to(*o
));
1058 ceph_assert(num_pinned
);
1063 dout(20) << __func__
<< " " << this << " " << " " << o
->oid
<< " removed, num=" << num
<< dendl
;
1065 void _pin(BlueStore::Onode
* o
) override
1067 lru
.erase(lru
.iterator_to(*o
));
1069 dout(20) << __func__
<< this << " " << " " << " " << o
->oid
<< " pinned" << dendl
;
1071 void _unpin(BlueStore::Onode
* o
) override
1074 ceph_assert(num_pinned
);
1076 dout(20) << __func__
<< this << " " << " " << " " << o
->oid
<< " unpinned" << dendl
;
1078 void _unpin_and_rm(BlueStore::Onode
* o
) override
1081 ceph_assert(num_pinned
);
1086 void _trim_to(uint64_t new_size
) override
1088 if (new_size
>= lru
.size()) {
1089 return; // don't even try
1091 uint64_t n
= lru
.size() - new_size
;
1093 ceph_assert(p
!= lru
.begin());
1095 ceph_assert(num
>= n
);
1098 BlueStore::Onode
*o
= &*p
;
1099 dout(20) << __func__
<< " rm " << o
->oid
<< " "
1100 << o
->nref
<< " " << o
->cached
<< " " << o
->pinned
<< dendl
;
1101 if (p
!= lru
.begin()) {
1104 ceph_assert(n
== 0);
1107 auto pinned
= !o
->pop_cache();
1108 ceph_assert(!pinned
);
1109 o
->c
->onode_map
._remove(o
->oid
);
1112 void move_pinned(OnodeCacheShard
*to
, BlueStore::Onode
*o
) override
1117 ceph_assert(o
->cached
);
1118 ceph_assert(o
->pinned
);
1120 ceph_assert(num_pinned
);
1126 void add_stats(uint64_t *onodes
, uint64_t *pinned_onodes
) override
1129 *pinned_onodes
+= num_pinned
;
1134 BlueStore::OnodeCacheShard
*BlueStore::OnodeCacheShard::create(
1137 PerfCounters
*logger
)
1139 BlueStore::OnodeCacheShard
*c
= nullptr;
1140 // Currently we only implement an LRU cache for onodes
1141 c
= new LruOnodeCacheShard(cct
);
1146 // LruBufferCacheShard
1147 struct LruBufferCacheShard
: public BlueStore::BufferCacheShard
{
1148 typedef boost::intrusive::list
<
1150 boost::intrusive::member_hook
<
1152 boost::intrusive::list_member_hook
<>,
1153 &BlueStore::Buffer::lru_item
> > list_t
;
1156 explicit LruBufferCacheShard(CephContext
*cct
) : BlueStore::BufferCacheShard(cct
) {}
1158 void _add(BlueStore::Buffer
*b
, int level
, BlueStore::Buffer
*near
) override
{
1160 auto q
= lru
.iterator_to(*near
);
1162 } else if (level
> 0) {
1167 buffer_bytes
+= b
->length
;
1170 void _rm(BlueStore::Buffer
*b
) override
{
1171 ceph_assert(buffer_bytes
>= b
->length
);
1172 buffer_bytes
-= b
->length
;
1173 auto q
= lru
.iterator_to(*b
);
1177 void _move(BlueStore::BufferCacheShard
*src
, BlueStore::Buffer
*b
) override
{
1179 _add(b
, 0, nullptr);
1181 void _adjust_size(BlueStore::Buffer
*b
, int64_t delta
) override
{
1182 ceph_assert((int64_t)buffer_bytes
+ delta
>= 0);
1183 buffer_bytes
+= delta
;
1185 void _touch(BlueStore::Buffer
*b
) override
{
1186 auto p
= lru
.iterator_to(*b
);
1190 _audit("_touch_buffer end");
1193 void _trim_to(uint64_t max
) override
1195 while (buffer_bytes
> max
) {
1196 auto i
= lru
.rbegin();
1197 if (i
== lru
.rend()) {
1198 // stop if lru is now empty
1202 BlueStore::Buffer
*b
= &*i
;
1203 ceph_assert(b
->is_clean());
1204 dout(20) << __func__
<< " rm " << *b
<< dendl
;
1205 b
->space
->_rm_buffer(this, b
);
1210 void add_stats(uint64_t *extents
,
1213 uint64_t *bytes
) override
{
1214 *extents
+= num_extents
;
1215 *blobs
+= num_blobs
;
1217 *bytes
+= buffer_bytes
;
1220 void _audit(const char *s
) override
1222 dout(10) << __func__
<< " " << when
<< " start" << dendl
;
1224 for (auto i
= lru
.begin(); i
!= lru
.end(); ++i
) {
1227 if (s
!= buffer_bytes
) {
1228 derr
<< __func__
<< " buffer_size " << buffer_bytes
<< " actual " << s
1230 for (auto i
= lru
.begin(); i
!= lru
.end(); ++i
) {
1231 derr
<< __func__
<< " " << *i
<< dendl
;
1233 ceph_assert(s
== buffer_bytes
);
1235 dout(20) << __func__
<< " " << when
<< " buffer_bytes " << buffer_bytes
1241 // TwoQBufferCacheShard
1243 struct TwoQBufferCacheShard
: public BlueStore::BufferCacheShard
{
1244 typedef boost::intrusive::list
<
1246 boost::intrusive::member_hook
<
1248 boost::intrusive::list_member_hook
<>,
1249 &BlueStore::Buffer::lru_item
> > list_t
;
1250 list_t hot
; ///< "Am" hot buffers
1251 list_t warm_in
; ///< "A1in" newly warm buffers
1252 list_t warm_out
; ///< "A1out" empty buffers we've evicted
1256 BUFFER_WARM_IN
, ///< in warm_in
1257 BUFFER_WARM_OUT
, ///< in warm_out
1258 BUFFER_HOT
, ///< in hot
1262 uint64_t list_bytes
[BUFFER_TYPE_MAX
] = {0}; ///< bytes per type
1265 explicit TwoQBufferCacheShard(CephContext
*cct
) : BufferCacheShard(cct
) {}
1267 void _add(BlueStore::Buffer
*b
, int level
, BlueStore::Buffer
*near
) override
1269 dout(20) << __func__
<< " level " << level
<< " near " << near
1271 << " which has cache_private " << b
->cache_private
<< dendl
;
1273 b
->cache_private
= near
->cache_private
;
1274 switch (b
->cache_private
) {
1275 case BUFFER_WARM_IN
:
1276 warm_in
.insert(warm_in
.iterator_to(*near
), *b
);
1278 case BUFFER_WARM_OUT
:
1279 ceph_assert(b
->is_empty());
1280 warm_out
.insert(warm_out
.iterator_to(*near
), *b
);
1283 hot
.insert(hot
.iterator_to(*near
), *b
);
1286 ceph_abort_msg("bad cache_private");
1288 } else if (b
->cache_private
== BUFFER_NEW
) {
1289 b
->cache_private
= BUFFER_WARM_IN
;
1291 warm_in
.push_front(*b
);
1293 // take caller hint to start at the back of the warm queue
1294 warm_in
.push_back(*b
);
1297 // we got a hint from discard
1298 switch (b
->cache_private
) {
1299 case BUFFER_WARM_IN
:
1300 // stay in warm_in. move to front, even though 2Q doesn't actually
1302 dout(20) << __func__
<< " move to front of warm " << *b
<< dendl
;
1303 warm_in
.push_front(*b
);
1305 case BUFFER_WARM_OUT
:
1306 b
->cache_private
= BUFFER_HOT
;
1307 // move to hot. fall-thru
1309 dout(20) << __func__
<< " move to front of hot " << *b
<< dendl
;
1313 ceph_abort_msg("bad cache_private");
1316 if (!b
->is_empty()) {
1317 buffer_bytes
+= b
->length
;
1318 list_bytes
[b
->cache_private
] += b
->length
;
1320 num
= hot
.size() + warm_in
.size();
1323 void _rm(BlueStore::Buffer
*b
) override
1325 dout(20) << __func__
<< " " << *b
<< dendl
;
1326 if (!b
->is_empty()) {
1327 ceph_assert(buffer_bytes
>= b
->length
);
1328 buffer_bytes
-= b
->length
;
1329 ceph_assert(list_bytes
[b
->cache_private
] >= b
->length
);
1330 list_bytes
[b
->cache_private
] -= b
->length
;
1332 switch (b
->cache_private
) {
1333 case BUFFER_WARM_IN
:
1334 warm_in
.erase(warm_in
.iterator_to(*b
));
1336 case BUFFER_WARM_OUT
:
1337 warm_out
.erase(warm_out
.iterator_to(*b
));
1340 hot
.erase(hot
.iterator_to(*b
));
1343 ceph_abort_msg("bad cache_private");
1345 num
= hot
.size() + warm_in
.size();
1348 void _move(BlueStore::BufferCacheShard
*srcc
, BlueStore::Buffer
*b
) override
1350 TwoQBufferCacheShard
*src
= static_cast<TwoQBufferCacheShard
*>(srcc
);
1353 // preserve which list we're on (even if we can't preserve the order!)
1354 switch (b
->cache_private
) {
1355 case BUFFER_WARM_IN
:
1356 ceph_assert(!b
->is_empty());
1357 warm_in
.push_back(*b
);
1359 case BUFFER_WARM_OUT
:
1360 ceph_assert(b
->is_empty());
1361 warm_out
.push_back(*b
);
1364 ceph_assert(!b
->is_empty());
1368 ceph_abort_msg("bad cache_private");
1370 if (!b
->is_empty()) {
1371 buffer_bytes
+= b
->length
;
1372 list_bytes
[b
->cache_private
] += b
->length
;
1374 num
= hot
.size() + warm_in
.size();
1377 void _adjust_size(BlueStore::Buffer
*b
, int64_t delta
) override
1379 dout(20) << __func__
<< " delta " << delta
<< " on " << *b
<< dendl
;
1380 if (!b
->is_empty()) {
1381 ceph_assert((int64_t)buffer_bytes
+ delta
>= 0);
1382 buffer_bytes
+= delta
;
1383 ceph_assert((int64_t)list_bytes
[b
->cache_private
] + delta
>= 0);
1384 list_bytes
[b
->cache_private
] += delta
;
1388 void _touch(BlueStore::Buffer
*b
) override
{
1389 switch (b
->cache_private
) {
1390 case BUFFER_WARM_IN
:
1391 // do nothing (somewhat counter-intuitively!)
1393 case BUFFER_WARM_OUT
:
1394 // move from warm_out to hot LRU
1395 ceph_abort_msg("this happens via discard hint");
1398 // move to front of hot LRU
1399 hot
.erase(hot
.iterator_to(*b
));
1403 num
= hot
.size() + warm_in
.size();
1404 _audit("_touch_buffer end");
1407 void _trim_to(uint64_t max
) override
1409 if (buffer_bytes
> max
) {
1410 uint64_t kin
= max
* cct
->_conf
->bluestore_2q_cache_kin_ratio
;
1411 uint64_t khot
= max
- kin
;
1413 // pre-calculate kout based on average buffer size too,
1414 // which is typical(the warm_in and hot lists may change later)
1416 uint64_t buffer_num
= hot
.size() + warm_in
.size();
1418 uint64_t avg_size
= buffer_bytes
/ buffer_num
;
1419 ceph_assert(avg_size
);
1420 uint64_t calculated_num
= max
/ avg_size
;
1421 kout
= calculated_num
* cct
->_conf
->bluestore_2q_cache_kout_ratio
;
1424 if (list_bytes
[BUFFER_HOT
] < khot
) {
1425 // hot is small, give slack to warm_in
1426 kin
+= khot
- list_bytes
[BUFFER_HOT
];
1427 } else if (list_bytes
[BUFFER_WARM_IN
] < kin
) {
1428 // warm_in is small, give slack to hot
1429 khot
+= kin
- list_bytes
[BUFFER_WARM_IN
];
1432 // adjust warm_in list
1433 int64_t to_evict_bytes
= list_bytes
[BUFFER_WARM_IN
] - kin
;
1434 uint64_t evicted
= 0;
1436 while (to_evict_bytes
> 0) {
1437 auto p
= warm_in
.rbegin();
1438 if (p
== warm_in
.rend()) {
1439 // stop if warm_in list is now empty
1443 BlueStore::Buffer
*b
= &*p
;
1444 ceph_assert(b
->is_clean());
1445 dout(20) << __func__
<< " buffer_warm_in -> out " << *b
<< dendl
;
1446 ceph_assert(buffer_bytes
>= b
->length
);
1447 buffer_bytes
-= b
->length
;
1448 ceph_assert(list_bytes
[BUFFER_WARM_IN
] >= b
->length
);
1449 list_bytes
[BUFFER_WARM_IN
] -= b
->length
;
1450 to_evict_bytes
-= b
->length
;
1451 evicted
+= b
->length
;
1452 b
->state
= BlueStore::Buffer::STATE_EMPTY
;
1454 warm_in
.erase(warm_in
.iterator_to(*b
));
1455 warm_out
.push_front(*b
);
1456 b
->cache_private
= BUFFER_WARM_OUT
;
1460 dout(20) << __func__
<< " evicted " << byte_u_t(evicted
)
1461 << " from warm_in list, done evicting warm_in buffers"
1466 to_evict_bytes
= list_bytes
[BUFFER_HOT
] - khot
;
1469 while (to_evict_bytes
> 0) {
1470 auto p
= hot
.rbegin();
1471 if (p
== hot
.rend()) {
1472 // stop if hot list is now empty
1476 BlueStore::Buffer
*b
= &*p
;
1477 dout(20) << __func__
<< " buffer_hot rm " << *b
<< dendl
;
1478 ceph_assert(b
->is_clean());
1479 // adjust evict size before buffer goes invalid
1480 to_evict_bytes
-= b
->length
;
1481 evicted
+= b
->length
;
1482 b
->space
->_rm_buffer(this, b
);
1486 dout(20) << __func__
<< " evicted " << byte_u_t(evicted
)
1487 << " from hot list, done evicting hot buffers"
1491 // adjust warm out list too, if necessary
1492 int64_t n
= warm_out
.size() - kout
;
1494 BlueStore::Buffer
*b
= &*warm_out
.rbegin();
1495 ceph_assert(b
->is_empty());
1496 dout(20) << __func__
<< " buffer_warm_out rm " << *b
<< dendl
;
1497 b
->space
->_rm_buffer(this, b
);
1500 num
= hot
.size() + warm_in
.size();
1503 void add_stats(uint64_t *extents
,
1506 uint64_t *bytes
) override
{
1507 *extents
+= num_extents
;
1508 *blobs
+= num_blobs
;
1510 *bytes
+= buffer_bytes
;
1514 void _audit(const char *s
) override
1516 dout(10) << __func__
<< " " << when
<< " start" << dendl
;
1518 for (auto i
= hot
.begin(); i
!= hot
.end(); ++i
) {
1522 uint64_t hot_bytes
= s
;
1523 if (hot_bytes
!= list_bytes
[BUFFER_HOT
]) {
1524 derr
<< __func__
<< " hot_list_bytes "
1525 << list_bytes
[BUFFER_HOT
]
1526 << " != actual " << hot_bytes
1528 ceph_assert(hot_bytes
== list_bytes
[BUFFER_HOT
]);
1531 for (auto i
= warm_in
.begin(); i
!= warm_in
.end(); ++i
) {
1535 uint64_t warm_in_bytes
= s
- hot_bytes
;
1536 if (warm_in_bytes
!= list_bytes
[BUFFER_WARM_IN
]) {
1537 derr
<< __func__
<< " warm_in_list_bytes "
1538 << list_bytes
[BUFFER_WARM_IN
]
1539 << " != actual " << warm_in_bytes
1541 ceph_assert(warm_in_bytes
== list_bytes
[BUFFER_WARM_IN
]);
1544 if (s
!= buffer_bytes
) {
1545 derr
<< __func__
<< " buffer_bytes " << buffer_bytes
<< " actual " << s
1547 ceph_assert(s
== buffer_bytes
);
1550 dout(20) << __func__
<< " " << when
<< " buffer_bytes " << buffer_bytes
1558 BlueStore::BufferCacheShard
*BlueStore::BufferCacheShard::create(
1561 PerfCounters
*logger
)
1563 BufferCacheShard
*c
= nullptr;
1565 c
= new LruBufferCacheShard(cct
);
1566 else if (type
== "2q")
1567 c
= new TwoQBufferCacheShard(cct
);
1569 ceph_abort_msg("unrecognized cache type");
1577 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1579 void BlueStore::BufferSpace::_clear(BufferCacheShard
* cache
)
1581 // note: we already hold cache->lock
1582 ldout(cache
->cct
, 20) << __func__
<< dendl
;
1583 while (!buffer_map
.empty()) {
1584 _rm_buffer(cache
, buffer_map
.begin());
1588 int BlueStore::BufferSpace::_discard(BufferCacheShard
* cache
, uint32_t offset
, uint32_t length
)
1590 // note: we already hold cache->lock
1591 ldout(cache
->cct
, 20) << __func__
<< std::hex
<< " 0x" << offset
<< "~" << length
1592 << std::dec
<< dendl
;
1593 int cache_private
= 0;
1594 cache
->_audit("discard start");
1595 auto i
= _data_lower_bound(offset
);
1596 uint32_t end
= offset
+ length
;
1597 while (i
!= buffer_map
.end()) {
1598 Buffer
*b
= i
->second
.get();
1599 if (b
->offset
>= end
) {
1602 if (b
->cache_private
> cache_private
) {
1603 cache_private
= b
->cache_private
;
1605 if (b
->offset
< offset
) {
1606 int64_t front
= offset
- b
->offset
;
1607 if (b
->end() > end
) {
1608 // drop middle (split)
1609 uint32_t tail
= b
->end() - end
;
1610 if (b
->data
.length()) {
1612 bl
.substr_of(b
->data
, b
->length
- tail
, tail
);
1613 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
, b
->flags
);
1614 nb
->maybe_rebuild();
1615 _add_buffer(cache
, nb
, 0, b
);
1617 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, tail
,
1621 if (!b
->is_writing()) {
1622 cache
->_adjust_size(b
, front
- (int64_t)b
->length
);
1626 cache
->_audit("discard end 1");
1630 if (!b
->is_writing()) {
1631 cache
->_adjust_size(b
, front
- (int64_t)b
->length
);
1639 if (b
->end() <= end
) {
1640 // drop entire buffer
1641 _rm_buffer(cache
, i
++);
1645 uint32_t keep
= b
->end() - end
;
1646 if (b
->data
.length()) {
1648 bl
.substr_of(b
->data
, b
->length
- keep
, keep
);
1649 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
, b
->flags
);
1650 nb
->maybe_rebuild();
1651 _add_buffer(cache
, nb
, 0, b
);
1653 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, keep
,
1657 _rm_buffer(cache
, i
);
1658 cache
->_audit("discard end 2");
1661 return cache_private
;
1664 void BlueStore::BufferSpace::read(
1665 BufferCacheShard
* cache
,
1668 BlueStore::ready_regions_t
& res
,
1669 interval_set
<uint32_t>& res_intervals
,
1673 res_intervals
.clear();
1674 uint32_t want_bytes
= length
;
1675 uint32_t end
= offset
+ length
;
1678 std::lock_guard
l(cache
->lock
);
1679 for (auto i
= _data_lower_bound(offset
);
1680 i
!= buffer_map
.end() && offset
< end
&& i
->first
< end
;
1682 Buffer
*b
= i
->second
.get();
1683 ceph_assert(b
->end() > offset
);
1686 if (flags
& BYPASS_CLEAN_CACHE
)
1687 val
= b
->is_writing();
1689 val
= b
->is_writing() || b
->is_clean();
1691 if (b
->offset
< offset
) {
1692 uint32_t skip
= offset
- b
->offset
;
1693 uint32_t l
= min(length
, b
->length
- skip
);
1694 res
[offset
].substr_of(b
->data
, skip
, l
);
1695 res_intervals
.insert(offset
, l
);
1698 if (!b
->is_writing()) {
1703 if (b
->offset
> offset
) {
1704 uint32_t gap
= b
->offset
- offset
;
1705 if (length
<= gap
) {
1711 if (!b
->is_writing()) {
1714 if (b
->length
> length
) {
1715 res
[offset
].substr_of(b
->data
, 0, length
);
1716 res_intervals
.insert(offset
, length
);
1719 res
[offset
].append(b
->data
);
1720 res_intervals
.insert(offset
, b
->length
);
1721 if (b
->length
== length
)
1723 offset
+= b
->length
;
1724 length
-= b
->length
;
1730 uint64_t hit_bytes
= res_intervals
.size();
1731 ceph_assert(hit_bytes
<= want_bytes
);
1732 uint64_t miss_bytes
= want_bytes
- hit_bytes
;
1733 cache
->logger
->inc(l_bluestore_buffer_hit_bytes
, hit_bytes
);
1734 cache
->logger
->inc(l_bluestore_buffer_miss_bytes
, miss_bytes
);
1737 void BlueStore::BufferSpace::_finish_write(BufferCacheShard
* cache
, uint64_t seq
)
1739 auto i
= writing
.begin();
1740 while (i
!= writing
.end()) {
1750 ceph_assert(b
->is_writing());
1752 if (b
->flags
& Buffer::FLAG_NOCACHE
) {
1754 ldout(cache
->cct
, 20) << __func__
<< " discard " << *b
<< dendl
;
1755 buffer_map
.erase(b
->offset
);
1757 b
->state
= Buffer::STATE_CLEAN
;
1760 b
->data
.reassign_to_mempool(mempool::mempool_bluestore_cache_data
);
1761 cache
->_add(b
, 1, nullptr);
1762 ldout(cache
->cct
, 20) << __func__
<< " added " << *b
<< dendl
;
1766 cache
->_audit("finish_write end");
1769 void BlueStore::BufferSpace::split(BufferCacheShard
* cache
, size_t pos
, BlueStore::BufferSpace
&r
)
1771 std::lock_guard
lk(cache
->lock
);
1772 if (buffer_map
.empty())
1775 auto p
= --buffer_map
.end();
1777 if (p
->second
->end() <= pos
)
1780 if (p
->second
->offset
< pos
) {
1781 ldout(cache
->cct
, 30) << __func__
<< " cut " << *p
->second
<< dendl
;
1782 size_t left
= pos
- p
->second
->offset
;
1783 size_t right
= p
->second
->length
- left
;
1784 if (p
->second
->data
.length()) {
1786 bl
.substr_of(p
->second
->data
, left
, right
);
1787 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
,
1788 0, bl
, p
->second
->flags
),
1789 0, p
->second
.get());
1791 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
,
1792 0, right
, p
->second
->flags
),
1793 0, p
->second
.get());
1795 cache
->_adjust_size(p
->second
.get(), -right
);
1796 p
->second
->truncate(left
);
1800 ceph_assert(p
->second
->end() > pos
);
1801 ldout(cache
->cct
, 30) << __func__
<< " move " << *p
->second
<< dendl
;
1802 if (p
->second
->data
.length()) {
1803 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
,
1804 p
->second
->offset
- pos
, p
->second
->data
, p
->second
->flags
),
1805 0, p
->second
.get());
1807 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
,
1808 p
->second
->offset
- pos
, p
->second
->length
, p
->second
->flags
),
1809 0, p
->second
.get());
1811 if (p
== buffer_map
.begin()) {
1812 _rm_buffer(cache
, p
);
1815 _rm_buffer(cache
, p
--);
1818 ceph_assert(writing
.empty());
1825 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1827 BlueStore::OnodeRef
BlueStore::OnodeSpace::add(const ghobject_t
& oid
,
1830 std::lock_guard
l(cache
->lock
);
1831 auto p
= onode_map
.find(oid
);
1832 if (p
!= onode_map
.end()) {
1833 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " " << o
1834 << " raced, returning existing " << p
->second
1838 ldout(cache
->cct
, 20) << __func__
<< " " << oid
<< " " << o
<< dendl
;
1840 cache
->_add(o
.get(), 1);
1845 void BlueStore::OnodeSpace::_remove(const ghobject_t
& oid
)
1847 ldout(cache
->cct
, 20) << __func__
<< " " << oid
<< " " << dendl
;
1848 onode_map
.erase(oid
);
1851 BlueStore::OnodeRef
BlueStore::OnodeSpace::lookup(const ghobject_t
& oid
)
1853 ldout(cache
->cct
, 30) << __func__
<< dendl
;
1858 std::lock_guard
l(cache
->lock
);
1859 ceph::unordered_map
<ghobject_t
,OnodeRef
>::iterator p
= onode_map
.find(oid
);
1860 if (p
== onode_map
.end()) {
1861 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " miss" << dendl
;
1863 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " hit " << p
->second
1864 << " " << p
->second
->nref
1865 << " " << p
->second
->cached
1866 << " " << p
->second
->pinned
1868 // This will pin onode and implicitly touch the cache when Onode
1869 // eventually will become unpinned
1871 ceph_assert(!o
->cached
|| o
->pinned
);
1878 cache
->logger
->inc(l_bluestore_onode_hits
);
1880 cache
->logger
->inc(l_bluestore_onode_misses
);
1885 void BlueStore::OnodeSpace::clear()
1887 std::lock_guard
l(cache
->lock
);
1888 ldout(cache
->cct
, 10) << __func__
<< " " << onode_map
.size()<< dendl
;
1889 for (auto &p
: onode_map
) {
1890 cache
->_rm(p
.second
.get());
1895 bool BlueStore::OnodeSpace::empty()
1897 std::lock_guard
l(cache
->lock
);
1898 return onode_map
.empty();
1901 void BlueStore::OnodeSpace::rename(
1903 const ghobject_t
& old_oid
,
1904 const ghobject_t
& new_oid
,
1905 const mempool::bluestore_cache_meta::string
& new_okey
)
1907 std::lock_guard
l(cache
->lock
);
1908 ldout(cache
->cct
, 30) << __func__
<< " " << old_oid
<< " -> " << new_oid
1910 ceph::unordered_map
<ghobject_t
,OnodeRef
>::iterator po
, pn
;
1911 po
= onode_map
.find(old_oid
);
1912 pn
= onode_map
.find(new_oid
);
1913 ceph_assert(po
!= pn
);
1915 ceph_assert(po
!= onode_map
.end());
1916 if (pn
!= onode_map
.end()) {
1917 ldout(cache
->cct
, 30) << __func__
<< " removing target " << pn
->second
1919 cache
->_rm(pn
->second
.get());
1920 onode_map
.erase(pn
);
1922 OnodeRef o
= po
->second
;
1924 // install a non-existent onode at old location
1925 oldo
.reset(new Onode(o
->c
, old_oid
, o
->key
));
1927 cache
->_add(oldo
.get(), 1);
1928 // add at new position and fix oid, key.
1929 // This will pin 'o' and implicitly touch cache
1930 // when it will eventually become unpinned
1931 onode_map
.insert(make_pair(new_oid
, o
));
1932 ceph_assert(o
->pinned
);
1939 bool BlueStore::OnodeSpace::map_any(std::function
<bool(Onode
*)> f
)
1941 std::lock_guard
l(cache
->lock
);
1942 ldout(cache
->cct
, 20) << __func__
<< dendl
;
1943 for (auto& i
: onode_map
) {
1944 if (f(i
.second
.get())) {
1951 template <int LogLevelV
= 30>
1952 void BlueStore::OnodeSpace::dump(CephContext
*cct
)
1954 for (auto& i
: onode_map
) {
1955 ldout(cct
, LogLevelV
) << i
.first
<< " : " << i
.second
1956 << " " << i
.second
->nref
1957 << " " << i
.second
->cached
1958 << " " << i
.second
->pinned
1966 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1968 #define dout_context coll->store->cct
1970 void BlueStore::SharedBlob::dump(Formatter
* f
) const
1972 f
->dump_bool("loaded", loaded
);
1974 persistent
->dump(f
);
1976 f
->dump_unsigned("sbid_unloaded", sbid_unloaded
);
1980 ostream
& operator<<(ostream
& out
, const BlueStore::SharedBlob
& sb
)
1982 out
<< "SharedBlob(" << &sb
;
1985 out
<< " loaded " << *sb
.persistent
;
1987 out
<< " sbid 0x" << std::hex
<< sb
.sbid_unloaded
<< std::dec
;
1992 BlueStore::SharedBlob::SharedBlob(uint64_t i
, Collection
*_coll
)
1993 : coll(_coll
), sbid_unloaded(i
)
1995 ceph_assert(sbid_unloaded
> 0);
1997 get_cache()->add_blob();
2001 BlueStore::SharedBlob::~SharedBlob()
2003 if (loaded
&& persistent
) {
2008 void BlueStore::SharedBlob::put()
2011 dout(20) << __func__
<< " " << this
2012 << " removing self from set " << get_parent()
2015 auto coll_snap
= coll
;
2017 std::lock_guard
l(coll_snap
->cache
->lock
);
2018 if (coll_snap
!= coll
) {
2021 if (!coll_snap
->shared_blob_set
.remove(this, true)) {
2025 bc
._clear(coll_snap
->cache
);
2026 coll_snap
->cache
->rm_blob();
2032 void BlueStore::SharedBlob::get_ref(uint64_t offset
, uint32_t length
)
2034 ceph_assert(persistent
);
2035 persistent
->ref_map
.get(offset
, length
);
2038 void BlueStore::SharedBlob::put_ref(uint64_t offset
, uint32_t length
,
2042 ceph_assert(persistent
);
2043 persistent
->ref_map
.put(offset
, length
, r
,
2044 unshare
&& !*unshare
? unshare
: nullptr);
2047 void BlueStore::SharedBlob::finish_write(uint64_t seq
)
2050 BufferCacheShard
*cache
= coll
->cache
;
2051 std::lock_guard
l(cache
->lock
);
2052 if (coll
->cache
!= cache
) {
2053 dout(20) << __func__
2054 << " raced with sb cache update, was " << cache
2055 << ", now " << coll
->cache
<< ", retrying"
2059 bc
._finish_write(cache
, seq
);
2067 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2069 template <int LogLevelV
= 30>
2070 void BlueStore::SharedBlobSet::dump(CephContext
*cct
)
2072 std::lock_guard
l(lock
);
2073 for (auto& i
: sb_map
) {
2074 ldout(cct
, LogLevelV
) << i
.first
<< " : " << *i
.second
<< dendl
;
2081 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2083 void BlueStore::Blob::dump(Formatter
* f
) const
2085 if (is_spanning()) {
2086 f
->dump_unsigned("spanning_id ", id
);
2090 f
->dump_object("shared", *shared_blob
);
2094 ostream
& operator<<(ostream
& out
, const BlueStore::Blob
& b
)
2096 out
<< "Blob(" << &b
;
2097 if (b
.is_spanning()) {
2098 out
<< " spanning " << b
.id
;
2100 out
<< " " << b
.get_blob() << " " << b
.get_blob_use_tracker();
2101 if (b
.shared_blob
) {
2102 out
<< " " << *b
.shared_blob
;
2104 out
<< " (shared_blob=NULL)";
2110 void BlueStore::Blob::discard_unallocated(Collection
*coll
)
2112 if (get_blob().is_shared()) {
2115 if (get_blob().is_compressed()) {
2116 bool discard
= false;
2117 bool all_invalid
= true;
2118 for (auto e
: get_blob().get_extents()) {
2119 if (!e
.is_valid()) {
2122 all_invalid
= false;
2125 ceph_assert(discard
== all_invalid
); // in case of compressed blob all
2126 // or none pextents are invalid.
2128 shared_blob
->bc
.discard(shared_blob
->get_cache(), 0,
2129 get_blob().get_logical_length());
2133 for (auto e
: get_blob().get_extents()) {
2134 if (!e
.is_valid()) {
2135 dout(20) << __func__
<< " 0x" << std::hex
<< pos
2137 << std::dec
<< dendl
;
2138 shared_blob
->bc
.discard(shared_blob
->get_cache(), pos
, e
.length
);
2142 if (get_blob().can_prune_tail()) {
2143 dirty_blob().prune_tail();
2144 used_in_blob
.prune_tail(get_blob().get_ondisk_length());
2145 dout(20) << __func__
<< " pruned tail, now " << get_blob() << dendl
;
2150 void BlueStore::Blob::get_ref(
2155 // Caller has to initialize Blob's logical length prior to increment
2156 // references. Otherwise one is neither unable to determine required
2157 // amount of counters in case of per-au tracking nor obtain min_release_size
2158 // for single counter mode.
2159 ceph_assert(get_blob().get_logical_length() != 0);
2160 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2161 << std::dec
<< " " << *this << dendl
;
2163 if (used_in_blob
.is_empty()) {
2164 uint32_t min_release_size
=
2165 get_blob().get_release_size(coll
->store
->min_alloc_size
);
2166 uint64_t l
= get_blob().get_logical_length();
2167 dout(20) << __func__
<< " init 0x" << std::hex
<< l
<< ", "
2168 << min_release_size
<< std::dec
<< dendl
;
2169 used_in_blob
.init(l
, min_release_size
);
2176 bool BlueStore::Blob::put_ref(
2182 PExtentVector logical
;
2184 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2185 << std::dec
<< " " << *this << dendl
;
2187 bool empty
= used_in_blob
.put(
2192 // nothing to release
2193 if (!empty
&& logical
.empty()) {
2197 bluestore_blob_t
& b
= dirty_blob();
2198 return b
.release_extents(empty
, logical
, r
);
2201 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size
,
2202 uint32_t target_blob_size
,
2204 uint32_t *length0
) {
2205 ceph_assert(min_alloc_size
);
2206 ceph_assert(target_blob_size
);
2207 if (!get_blob().is_mutable()) {
2211 uint32_t length
= *length0
;
2212 uint32_t end
= b_offset
+ length
;
2214 // Currently for the sake of simplicity we omit blob reuse if data is
2215 // unaligned with csum chunk. Later we can perform padding if needed.
2216 if (get_blob().has_csum() &&
2217 ((b_offset
% get_blob().get_csum_chunk_size()) != 0 ||
2218 (end
% get_blob().get_csum_chunk_size()) != 0)) {
2222 auto blen
= get_blob().get_logical_length();
2223 uint32_t new_blen
= blen
;
2225 // make sure target_blob_size isn't less than current blob len
2226 target_blob_size
= std::max(blen
, target_blob_size
);
2228 if (b_offset
>= blen
) {
2229 // new data totally stands out of the existing blob
2232 // new data overlaps with the existing blob
2233 new_blen
= std::max(blen
, end
);
2235 uint32_t overlap
= 0;
2236 if (new_blen
> blen
) {
2237 overlap
= blen
- b_offset
;
2242 if (!get_blob().is_unallocated(b_offset
, overlap
)) {
2243 // abort if any piece of the overlap has already been allocated
2248 if (new_blen
> blen
) {
2249 int64_t overflow
= int64_t(new_blen
) - target_blob_size
;
2250 // Unable to decrease the provided length to fit into max_blob_size
2251 if (overflow
>= length
) {
2255 // FIXME: in some cases we could reduce unused resolution
2256 if (get_blob().has_unused()) {
2261 new_blen
-= overflow
;
2266 if (new_blen
> blen
) {
2267 dirty_blob().add_tail(new_blen
);
2268 used_in_blob
.add_tail(new_blen
,
2269 get_blob().get_release_size(min_alloc_size
));
2275 void BlueStore::Blob::split(Collection
*coll
, uint32_t blob_offset
, Blob
*r
)
2277 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
2278 << " start " << *this << dendl
;
2279 ceph_assert(blob
.can_split());
2280 ceph_assert(used_in_blob
.can_split());
2281 bluestore_blob_t
&lb
= dirty_blob();
2282 bluestore_blob_t
&rb
= r
->dirty_blob();
2286 &(r
->used_in_blob
));
2288 lb
.split(blob_offset
, rb
);
2289 shared_blob
->bc
.split(shared_blob
->get_cache(), blob_offset
, r
->shared_blob
->bc
);
2291 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
2292 << " finish " << *this << dendl
;
2293 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
2294 << " and " << *r
<< dendl
;
2297 #ifndef CACHE_BLOB_BL
2298 void BlueStore::Blob::decode(
2300 bufferptr::const_iterator
& p
,
2303 bool include_ref_map
)
2305 denc(blob
, p
, struct_v
);
2306 if (blob
.is_shared()) {
2309 if (include_ref_map
) {
2311 used_in_blob
.decode(p
);
2313 used_in_blob
.clear();
2314 bluestore_extent_ref_map_t legacy_ref_map
;
2315 legacy_ref_map
.decode(p
);
2316 for (auto r
: legacy_ref_map
.ref_map
) {
2320 r
.second
.refs
* r
.second
.length
);
2329 void BlueStore::Extent::dump(Formatter
* f
) const
2331 f
->dump_unsigned("logical_offset", logical_offset
);
2332 f
->dump_unsigned("length", length
);
2333 f
->dump_unsigned("blob_offset", blob_offset
);
2334 f
->dump_object("blob", *blob
);
2337 ostream
& operator<<(ostream
& out
, const BlueStore::Extent
& e
)
2339 return out
<< std::hex
<< "0x" << e
.logical_offset
<< "~" << e
.length
2340 << ": 0x" << e
.blob_offset
<< "~" << e
.length
<< std::dec
2345 BlueStore::OldExtent
* BlueStore::OldExtent::create(CollectionRef c
,
2350 OldExtent
* oe
= new OldExtent(lo
, o
, l
, b
);
2351 b
->put_ref(c
.get(), o
, l
, &(oe
->r
));
2352 oe
->blob_empty
= !b
->is_referenced();
2359 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
2361 #define dout_context onode->c->store->cct
2363 BlueStore::ExtentMap::ExtentMap(Onode
*o
)
2366 o
->c
->store
->cct
->_conf
->bluestore_extent_map_inline_shard_prealloc_size
) {
2369 void BlueStore::ExtentMap::dump(Formatter
* f
) const
2371 f
->open_array_section("extents");
2373 for (auto& e
: extent_map
) {
2374 f
->dump_object("extent", e
);
2379 void BlueStore::ExtentMap::dup(BlueStore
* b
, TransContext
* txc
,
2380 CollectionRef
& c
, OnodeRef
& oldo
, OnodeRef
& newo
, uint64_t& srcoff
,
2381 uint64_t& length
, uint64_t& dstoff
) {
2383 auto cct
= onode
->c
->store
->cct
;
2385 cct
->_conf
->bluestore_debug_inject_bug21040
;
2386 vector
<BlobRef
> id_to_blob(oldo
->extent_map
.extent_map
.size());
2387 for (auto& e
: oldo
->extent_map
.extent_map
) {
2388 e
.blob
->last_encoded_id
= -1;
2392 uint64_t end
= srcoff
+ length
;
2393 uint32_t dirty_range_begin
= 0;
2394 uint32_t dirty_range_end
= 0;
2395 bool src_dirty
= false;
2396 for (auto ep
= oldo
->extent_map
.seek_lextent(srcoff
);
2397 ep
!= oldo
->extent_map
.extent_map
.end();
2400 if (e
.logical_offset
>= end
) {
2403 dout(20) << __func__
<< " src " << e
<< dendl
;
2405 bool blob_duped
= true;
2406 if (e
.blob
->last_encoded_id
>= 0) {
2407 cb
= id_to_blob
[e
.blob
->last_encoded_id
];
2411 const bluestore_blob_t
& blob
= e
.blob
->get_blob();
2412 // make sure it is shared
2413 if (!blob
.is_shared()) {
2414 c
->make_blob_shared(b
->_assign_blobid(txc
), e
.blob
);
2415 if (!inject_21040
&& !src_dirty
) {
2417 dirty_range_begin
= e
.logical_offset
;
2418 } else if (inject_21040
&&
2419 dirty_range_begin
== 0 && dirty_range_end
== 0) {
2420 dirty_range_begin
= e
.logical_offset
;
2422 ceph_assert(e
.logical_end() > 0);
2423 // -1 to exclude next potential shard
2424 dirty_range_end
= e
.logical_end() - 1;
2426 c
->load_shared_blob(e
.blob
->shared_blob
);
2429 e
.blob
->last_encoded_id
= n
;
2432 // bump the extent refs on the copied blob's extents
2433 for (auto p
: blob
.get_extents()) {
2435 e
.blob
->shared_blob
->get_ref(p
.offset
, p
.length
);
2438 txc
->write_shared_blob(e
.blob
->shared_blob
);
2439 dout(20) << __func__
<< " new " << *cb
<< dendl
;
2442 int skip_front
, skip_back
;
2443 if (e
.logical_offset
< srcoff
) {
2444 skip_front
= srcoff
- e
.logical_offset
;
2448 if (e
.logical_end() > end
) {
2449 skip_back
= e
.logical_end() - end
;
2454 Extent
* ne
= new Extent(e
.logical_offset
+ skip_front
+ dstoff
- srcoff
,
2455 e
.blob_offset
+ skip_front
, e
.length
- skip_front
- skip_back
, cb
);
2456 newo
->extent_map
.extent_map
.insert(*ne
);
2457 ne
->blob
->get_ref(c
.get(), ne
->blob_offset
, ne
->length
);
2458 // fixme: we may leave parts of new blob unreferenced that could
2459 // be freed (relative to the shared_blob).
2460 txc
->statfs_delta
.stored() += ne
->length
;
2461 if (e
.blob
->get_blob().is_compressed()) {
2462 txc
->statfs_delta
.compressed_original() += ne
->length
;
2464 txc
->statfs_delta
.compressed() +=
2465 cb
->get_blob().get_compressed_payload_length();
2468 dout(20) << __func__
<< " dst " << *ne
<< dendl
;
2471 if ((!inject_21040
&& src_dirty
) ||
2472 (inject_21040
&& dirty_range_end
> dirty_range_begin
)) {
2473 oldo
->extent_map
.dirty_range(dirty_range_begin
,
2474 dirty_range_end
- dirty_range_begin
);
2475 txc
->write_onode(oldo
);
2477 txc
->write_onode(newo
);
2479 if (dstoff
+ length
> newo
->onode
.size
) {
2480 newo
->onode
.size
= dstoff
+ length
;
2482 newo
->extent_map
.dirty_range(dstoff
, length
);
2484 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t
,
2487 auto cct
= onode
->c
->store
->cct
; //used by dout
2488 dout(20) << __func__
<< " " << onode
->oid
<< (force
? " force" : "") << dendl
;
2489 if (onode
->onode
.extent_map_shards
.empty()) {
2490 if (inline_bl
.length() == 0) {
2492 // we need to encode inline_bl to measure encoded length
2493 bool never_happen
= encode_some(0, OBJECT_MAX_SIZE
, inline_bl
, &n
);
2494 inline_bl
.reassign_to_mempool(mempool::mempool_bluestore_inline_bl
);
2495 ceph_assert(!never_happen
);
2496 size_t len
= inline_bl
.length();
2497 dout(20) << __func__
<< " inline shard " << len
<< " bytes from " << n
2498 << " extents" << dendl
;
2499 if (!force
&& len
> cct
->_conf
->bluestore_extent_map_shard_max_size
) {
2500 request_reshard(0, OBJECT_MAX_SIZE
);
2504 // will persist in the onode key.
2506 // pending shard update
2507 struct dirty_shard_t
{
2510 dirty_shard_t(Shard
*s
) : shard(s
) {}
2512 vector
<dirty_shard_t
> encoded_shards
;
2513 // allocate slots for all shards in a single call instead of
2514 // doing multiple allocations - one per each dirty shard
2515 encoded_shards
.reserve(shards
.size());
2517 auto p
= shards
.begin();
2519 while (p
!= shards
.end()) {
2520 ceph_assert(p
->shard_info
->offset
>= prev_p
->shard_info
->offset
);
2525 if (n
== shards
.end()) {
2526 endoff
= OBJECT_MAX_SIZE
;
2528 endoff
= n
->shard_info
->offset
;
2530 encoded_shards
.emplace_back(dirty_shard_t(&(*p
)));
2531 bufferlist
& bl
= encoded_shards
.back().bl
;
2532 if (encode_some(p
->shard_info
->offset
, endoff
- p
->shard_info
->offset
,
2535 derr
<< __func__
<< " encode_some needs reshard" << dendl
;
2536 ceph_assert(!force
);
2539 size_t len
= bl
.length();
2541 dout(20) << __func__
<< " shard 0x" << std::hex
2542 << p
->shard_info
->offset
<< std::dec
<< " is " << len
2543 << " bytes (was " << p
->shard_info
->bytes
<< ") from "
2544 << p
->extents
<< " extents" << dendl
;
2547 if (len
> cct
->_conf
->bluestore_extent_map_shard_max_size
) {
2548 // we are big; reshard ourselves
2549 request_reshard(p
->shard_info
->offset
, endoff
);
2551 // avoid resharding the trailing shard, even if it is small
2552 else if (n
!= shards
.end() &&
2553 len
< g_conf()->bluestore_extent_map_shard_min_size
) {
2554 ceph_assert(endoff
!= OBJECT_MAX_SIZE
);
2555 if (p
== shards
.begin()) {
2556 // we are the first shard, combine with next shard
2557 request_reshard(p
->shard_info
->offset
, endoff
+ 1);
2559 // combine either with the previous shard or the next,
2560 // whichever is smaller
2561 if (prev_p
->shard_info
->bytes
> n
->shard_info
->bytes
) {
2562 request_reshard(p
->shard_info
->offset
, endoff
+ 1);
2564 request_reshard(prev_p
->shard_info
->offset
, endoff
);
2573 if (needs_reshard()) {
2577 // schedule DB update for dirty shards
2579 for (auto& it
: encoded_shards
) {
2580 it
.shard
->dirty
= false;
2581 it
.shard
->shard_info
->bytes
= it
.bl
.length();
2582 generate_extent_shard_key_and_apply(
2584 it
.shard
->shard_info
->offset
,
2586 [&](const string
& final_key
) {
2587 t
->set(PREFIX_OBJ
, final_key
, it
.bl
);
2594 bid_t
BlueStore::ExtentMap::allocate_spanning_blob_id()
2596 if (spanning_blob_map
.empty())
2598 bid_t bid
= spanning_blob_map
.rbegin()->first
+ 1;
2599 // bid is valid and available.
2602 // Find next unused bid;
2603 bid
= rand() % (numeric_limits
<bid_t
>::max() + 1);
2604 const auto begin_bid
= bid
;
2606 if (!spanning_blob_map
.count(bid
))
2610 if (bid
< 0) bid
= 0;
2612 } while (bid
!= begin_bid
);
2613 auto cct
= onode
->c
->store
->cct
; // used by dout
2614 _dump_onode
<0>(cct
, *onode
);
2615 ceph_abort_msg("no available blob id");
2618 void BlueStore::ExtentMap::reshard(
2620 KeyValueDB::Transaction t
)
2622 auto cct
= onode
->c
->store
->cct
; // used by dout
2624 dout(10) << __func__
<< " 0x[" << std::hex
<< needs_reshard_begin
<< ","
2625 << needs_reshard_end
<< ")" << std::dec
2626 << " of " << onode
->onode
.extent_map_shards
.size()
2627 << " shards on " << onode
->oid
<< dendl
;
2628 for (auto& p
: spanning_blob_map
) {
2629 dout(20) << __func__
<< " spanning blob " << p
.first
<< " " << *p
.second
2632 // determine shard index range
2633 unsigned si_begin
= 0, si_end
= 0;
2634 if (!shards
.empty()) {
2635 while (si_begin
+ 1 < shards
.size() &&
2636 shards
[si_begin
+ 1].shard_info
->offset
<= needs_reshard_begin
) {
2639 needs_reshard_begin
= shards
[si_begin
].shard_info
->offset
;
2640 for (si_end
= si_begin
; si_end
< shards
.size(); ++si_end
) {
2641 if (shards
[si_end
].shard_info
->offset
>= needs_reshard_end
) {
2642 needs_reshard_end
= shards
[si_end
].shard_info
->offset
;
2646 if (si_end
== shards
.size()) {
2647 needs_reshard_end
= OBJECT_MAX_SIZE
;
2649 dout(20) << __func__
<< " shards [" << si_begin
<< "," << si_end
<< ")"
2650 << " over 0x[" << std::hex
<< needs_reshard_begin
<< ","
2651 << needs_reshard_end
<< ")" << std::dec
<< dendl
;
2654 fault_range(db
, needs_reshard_begin
, (needs_reshard_end
- needs_reshard_begin
));
2656 // we may need to fault in a larger interval later must have all
2657 // referring extents for spanning blobs loaded in order to have
2658 // accurate use_tracker values.
2659 uint32_t spanning_scan_begin
= needs_reshard_begin
;
2660 uint32_t spanning_scan_end
= needs_reshard_end
;
2664 for (unsigned i
= si_begin
; i
< si_end
; ++i
) {
2665 generate_extent_shard_key_and_apply(
2666 onode
->key
, shards
[i
].shard_info
->offset
, &key
,
2667 [&](const string
& final_key
) {
2668 t
->rmkey(PREFIX_OBJ
, final_key
);
2673 // calculate average extent size
2675 unsigned extents
= 0;
2676 if (onode
->onode
.extent_map_shards
.empty()) {
2677 bytes
= inline_bl
.length();
2678 extents
= extent_map
.size();
2680 for (unsigned i
= si_begin
; i
< si_end
; ++i
) {
2681 bytes
+= shards
[i
].shard_info
->bytes
;
2682 extents
+= shards
[i
].extents
;
2685 unsigned target
= cct
->_conf
->bluestore_extent_map_shard_target_size
;
2686 unsigned slop
= target
*
2687 cct
->_conf
->bluestore_extent_map_shard_target_size_slop
;
2688 unsigned extent_avg
= bytes
/ std::max(1u, extents
);
2689 dout(20) << __func__
<< " extent_avg " << extent_avg
<< ", target " << target
2690 << ", slop " << slop
<< dendl
;
2693 unsigned estimate
= 0;
2694 unsigned offset
= needs_reshard_begin
;
2695 vector
<bluestore_onode_t::shard_info
> new_shard_info
;
2696 unsigned max_blob_end
= 0;
2697 Extent
dummy(needs_reshard_begin
);
2698 for (auto e
= extent_map
.lower_bound(dummy
);
2699 e
!= extent_map
.end();
2701 if (e
->logical_offset
>= needs_reshard_end
) {
2704 dout(30) << " extent " << *e
<< dendl
;
2706 // disfavor shard boundaries that span a blob
2707 bool would_span
= (e
->logical_offset
< max_blob_end
) || e
->blob_offset
;
2709 estimate
+ extent_avg
> target
+ (would_span
? slop
: 0)) {
2711 if (offset
== needs_reshard_begin
) {
2712 new_shard_info
.emplace_back(bluestore_onode_t::shard_info());
2713 new_shard_info
.back().offset
= offset
;
2714 dout(20) << __func__
<< " new shard 0x" << std::hex
<< offset
2715 << std::dec
<< dendl
;
2717 offset
= e
->logical_offset
;
2718 new_shard_info
.emplace_back(bluestore_onode_t::shard_info());
2719 new_shard_info
.back().offset
= offset
;
2720 dout(20) << __func__
<< " new shard 0x" << std::hex
<< offset
2721 << std::dec
<< dendl
;
2724 estimate
+= extent_avg
;
2725 unsigned bs
= e
->blob_start();
2726 if (bs
< spanning_scan_begin
) {
2727 spanning_scan_begin
= bs
;
2729 uint32_t be
= e
->blob_end();
2730 if (be
> max_blob_end
) {
2733 if (be
> spanning_scan_end
) {
2734 spanning_scan_end
= be
;
2737 if (new_shard_info
.empty() && (si_begin
> 0 ||
2738 si_end
< shards
.size())) {
2739 // we resharded a partial range; we must produce at least one output
2741 new_shard_info
.emplace_back(bluestore_onode_t::shard_info());
2742 new_shard_info
.back().offset
= needs_reshard_begin
;
2743 dout(20) << __func__
<< " new shard 0x" << std::hex
<< needs_reshard_begin
2744 << std::dec
<< " (singleton degenerate case)" << dendl
;
2747 auto& sv
= onode
->onode
.extent_map_shards
;
2748 dout(20) << __func__
<< " new " << new_shard_info
<< dendl
;
2749 dout(20) << __func__
<< " old " << sv
<< dendl
;
2751 // no old shards to keep
2752 sv
.swap(new_shard_info
);
2753 init_shards(true, true);
2755 // splice in new shards
2756 sv
.erase(sv
.begin() + si_begin
, sv
.begin() + si_end
);
2757 shards
.erase(shards
.begin() + si_begin
, shards
.begin() + si_end
);
2759 sv
.begin() + si_begin
,
2760 new_shard_info
.begin(),
2761 new_shard_info
.end());
2762 shards
.insert(shards
.begin() + si_begin
, new_shard_info
.size(), Shard());
2763 si_end
= si_begin
+ new_shard_info
.size();
2765 ceph_assert(sv
.size() == shards
.size());
2767 // note that we need to update every shard_info of shards here,
2768 // as sv might have been totally re-allocated above
2769 for (unsigned i
= 0; i
< shards
.size(); i
++) {
2770 shards
[i
].shard_info
= &sv
[i
];
2773 // mark newly added shards as dirty
2774 for (unsigned i
= si_begin
; i
< si_end
; ++i
) {
2775 shards
[i
].loaded
= true;
2776 shards
[i
].dirty
= true;
2779 dout(20) << __func__
<< " fin " << sv
<< dendl
;
2783 // no more shards; unspan all previously spanning blobs
2784 auto p
= spanning_blob_map
.begin();
2785 while (p
!= spanning_blob_map
.end()) {
2787 dout(30) << __func__
<< " un-spanning " << *p
->second
<< dendl
;
2788 p
= spanning_blob_map
.erase(p
);
2791 // identify new spanning blobs
2792 dout(20) << __func__
<< " checking spanning blobs 0x[" << std::hex
2793 << spanning_scan_begin
<< "," << spanning_scan_end
<< ")" << dendl
;
2794 if (spanning_scan_begin
< needs_reshard_begin
) {
2795 fault_range(db
, spanning_scan_begin
,
2796 needs_reshard_begin
- spanning_scan_begin
);
2798 if (spanning_scan_end
> needs_reshard_end
) {
2799 fault_range(db
, needs_reshard_end
,
2800 spanning_scan_end
- needs_reshard_end
);
2802 auto sp
= sv
.begin() + si_begin
;
2803 auto esp
= sv
.end();
2804 unsigned shard_start
= sp
->offset
;
2808 shard_end
= OBJECT_MAX_SIZE
;
2810 shard_end
= sp
->offset
;
2812 Extent
dummy(needs_reshard_begin
);
2814 bool was_too_many_blobs_check
= false;
2815 auto too_many_blobs_threshold
=
2816 g_conf()->bluestore_debug_too_many_blobs_threshold
;
2817 auto& dumped_onodes
= onode
->c
->onode_map
.cache
->dumped_onodes
;
2818 decltype(onode
->c
->onode_map
.cache
->dumped_onodes
)::value_type
* oid_slot
= nullptr;
2819 decltype(onode
->c
->onode_map
.cache
->dumped_onodes
)::value_type
* oldest_slot
= nullptr;
2821 for (auto e
= extent_map
.lower_bound(dummy
); e
!= extent_map
.end(); ++e
) {
2822 if (e
->logical_offset
>= needs_reshard_end
) {
2825 dout(30) << " extent " << *e
<< dendl
;
2826 while (e
->logical_offset
>= shard_end
) {
2827 shard_start
= shard_end
;
2828 ceph_assert(sp
!= esp
);
2831 shard_end
= OBJECT_MAX_SIZE
;
2833 shard_end
= sp
->offset
;
2835 dout(30) << __func__
<< " shard 0x" << std::hex
<< shard_start
2836 << " to 0x" << shard_end
<< std::dec
<< dendl
;
2839 if (e
->blob_escapes_range(shard_start
, shard_end
- shard_start
)) {
2840 if (!e
->blob
->is_spanning()) {
2841 // We have two options: (1) split the blob into pieces at the
2842 // shard boundaries (and adjust extents accordingly), or (2)
2843 // mark it spanning. We prefer to cut the blob if we can. Note that
2844 // we may have to split it multiple times--potentially at every
2846 bool must_span
= false;
2847 BlobRef b
= e
->blob
;
2848 if (b
->can_split()) {
2849 uint32_t bstart
= e
->blob_start();
2850 uint32_t bend
= e
->blob_end();
2851 for (const auto& sh
: shards
) {
2852 if (bstart
< sh
.shard_info
->offset
&&
2853 bend
> sh
.shard_info
->offset
) {
2854 uint32_t blob_offset
= sh
.shard_info
->offset
- bstart
;
2855 if (b
->can_split_at(blob_offset
)) {
2856 dout(20) << __func__
<< " splitting blob, bstart 0x"
2857 << std::hex
<< bstart
<< " blob_offset 0x"
2858 << blob_offset
<< std::dec
<< " " << *b
<< dendl
;
2859 b
= split_blob(b
, blob_offset
, sh
.shard_info
->offset
);
2860 // switch b to the new right-hand side, in case it
2861 // *also* has to get split.
2862 bstart
+= blob_offset
;
2863 onode
->c
->store
->logger
->inc(l_bluestore_blob_split
);
2874 auto bid
= allocate_spanning_blob_id();
2876 spanning_blob_map
[b
->id
] = b
;
2877 dout(20) << __func__
<< " adding spanning " << *b
<< dendl
;
2878 if (!was_too_many_blobs_check
&&
2879 too_many_blobs_threshold
&&
2880 spanning_blob_map
.size() >= size_t(too_many_blobs_threshold
)) {
2882 was_too_many_blobs_check
= true;
2883 for (size_t i
= 0; i
< dumped_onodes
.size(); ++i
) {
2884 if (dumped_onodes
[i
].first
== onode
->oid
) {
2885 oid_slot
= &dumped_onodes
[i
];
2888 if (!oldest_slot
|| (oldest_slot
&&
2889 dumped_onodes
[i
].second
< oldest_slot
->second
)) {
2890 oldest_slot
= &dumped_onodes
[i
];
2897 if (e
->blob
->is_spanning()) {
2898 spanning_blob_map
.erase(e
->blob
->id
);
2900 dout(30) << __func__
<< " un-spanning " << *e
->blob
<< dendl
;
2904 bool do_dump
= (!oid_slot
&& was_too_many_blobs_check
) ||
2906 (mono_clock::now() - oid_slot
->second
>= make_timespan(5 * 60)));
2909 << " spanning blob count exceeds threshold, "
2910 << spanning_blob_map
.size() << " spanning blobs"
2912 _dump_onode
<0>(cct
, *onode
);
2914 oid_slot
->second
= mono_clock::now();
2916 ceph_assert(oldest_slot
);
2917 oldest_slot
->first
= onode
->oid
;
2918 oldest_slot
->second
= mono_clock::now();
2923 clear_needs_reshard();
2926 bool BlueStore::ExtentMap::encode_some(
2932 Extent
dummy(offset
);
2933 auto start
= extent_map
.lower_bound(dummy
);
2934 uint32_t end
= offset
+ length
;
2936 __u8 struct_v
= 2; // Version 2 differs from v1 in blob's ref_map
2937 // serialization only. Hence there is no specific
2938 // handling at ExtentMap level.
2942 bool must_reshard
= false;
2943 for (auto p
= start
;
2944 p
!= extent_map
.end() && p
->logical_offset
< end
;
2946 ceph_assert(p
->logical_offset
>= offset
);
2947 p
->blob
->last_encoded_id
= -1;
2948 if (!p
->blob
->is_spanning() && p
->blob_escapes_range(offset
, length
)) {
2949 dout(30) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2950 << std::dec
<< " hit new spanning blob " << *p
<< dendl
;
2951 request_reshard(p
->blob_start(), p
->blob_end());
2952 must_reshard
= true;
2954 if (!must_reshard
) {
2955 denc_varint(0, bound
); // blobid
2956 denc_varint(0, bound
); // logical_offset
2957 denc_varint(0, bound
); // len
2958 denc_varint(0, bound
); // blob_offset
2960 p
->blob
->bound_encode(
2963 p
->blob
->shared_blob
->get_sbid(),
2971 denc(struct_v
, bound
);
2972 denc_varint(0, bound
); // number of extents
2975 auto app
= bl
.get_contiguous_appender(bound
);
2976 denc(struct_v
, app
);
2977 denc_varint(n
, app
);
2984 uint64_t prev_len
= 0;
2985 for (auto p
= start
;
2986 p
!= extent_map
.end() && p
->logical_offset
< end
;
2989 bool include_blob
= false;
2990 if (p
->blob
->is_spanning()) {
2991 blobid
= p
->blob
->id
<< BLOBID_SHIFT_BITS
;
2992 blobid
|= BLOBID_FLAG_SPANNING
;
2993 } else if (p
->blob
->last_encoded_id
< 0) {
2994 p
->blob
->last_encoded_id
= n
+ 1; // so it is always non-zero
2995 include_blob
= true;
2996 blobid
= 0; // the decoder will infer the id from n
2998 blobid
= p
->blob
->last_encoded_id
<< BLOBID_SHIFT_BITS
;
3000 if (p
->logical_offset
== pos
) {
3001 blobid
|= BLOBID_FLAG_CONTIGUOUS
;
3003 if (p
->blob_offset
== 0) {
3004 blobid
|= BLOBID_FLAG_ZEROOFFSET
;
3006 if (p
->length
== prev_len
) {
3007 blobid
|= BLOBID_FLAG_SAMELENGTH
;
3009 prev_len
= p
->length
;
3011 denc_varint(blobid
, app
);
3012 if ((blobid
& BLOBID_FLAG_CONTIGUOUS
) == 0) {
3013 denc_varint_lowz(p
->logical_offset
- pos
, app
);
3015 if ((blobid
& BLOBID_FLAG_ZEROOFFSET
) == 0) {
3016 denc_varint_lowz(p
->blob_offset
, app
);
3018 if ((blobid
& BLOBID_FLAG_SAMELENGTH
) == 0) {
3019 denc_varint_lowz(p
->length
, app
);
3021 pos
= p
->logical_end();
3023 p
->blob
->encode(app
, struct_v
, p
->blob
->shared_blob
->get_sbid(), false);
3027 /*derr << __func__ << bl << dendl;
3028 derr << __func__ << ":";
3035 unsigned BlueStore::ExtentMap::decode_some(bufferlist
& bl
)
3038 derr << __func__ << ":";
3043 ceph_assert(bl
.get_num_buffers() <= 1);
3044 auto p
= bl
.front().begin_deep();
3047 // Version 2 differs from v1 in blob's ref_map
3048 // serialization only. Hence there is no specific
3049 // handling at ExtentMap level below.
3050 ceph_assert(struct_v
== 1 || struct_v
== 2);
3053 denc_varint(num
, p
);
3054 vector
<BlobRef
> blobs(num
);
3056 uint64_t prev_len
= 0;
3060 Extent
*le
= new Extent();
3062 denc_varint(blobid
, p
);
3063 if ((blobid
& BLOBID_FLAG_CONTIGUOUS
) == 0) {
3065 denc_varint_lowz(gap
, p
);
3068 le
->logical_offset
= pos
;
3069 if ((blobid
& BLOBID_FLAG_ZEROOFFSET
) == 0) {
3070 denc_varint_lowz(le
->blob_offset
, p
);
3072 le
->blob_offset
= 0;
3074 if ((blobid
& BLOBID_FLAG_SAMELENGTH
) == 0) {
3075 denc_varint_lowz(prev_len
, p
);
3077 le
->length
= prev_len
;
3079 if (blobid
& BLOBID_FLAG_SPANNING
) {
3080 dout(30) << __func__
<< " getting spanning blob "
3081 << (blobid
>> BLOBID_SHIFT_BITS
) << dendl
;
3082 le
->assign_blob(get_spanning_blob(blobid
>> BLOBID_SHIFT_BITS
));
3084 blobid
>>= BLOBID_SHIFT_BITS
;
3086 le
->assign_blob(blobs
[blobid
- 1]);
3087 ceph_assert(le
->blob
);
3089 Blob
*b
= new Blob();
3091 b
->decode(onode
->c
, p
, struct_v
, &sbid
, false);
3093 onode
->c
->open_shared_blob(sbid
, b
);
3096 // we build ref_map dynamically for non-spanning blobs
3104 extent_map
.insert(*le
);
3107 ceph_assert(n
== num
);
3111 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p
)
3113 // Version 2 differs from v1 in blob's ref_map
3114 // serialization only. Hence there is no specific
3115 // handling at ExtentMap level.
3119 denc_varint((uint32_t)0, p
);
3120 size_t key_size
= 0;
3121 denc_varint((uint32_t)0, key_size
);
3122 p
+= spanning_blob_map
.size() * key_size
;
3123 for (const auto& i
: spanning_blob_map
) {
3124 i
.second
->bound_encode(p
, struct_v
, i
.second
->shared_blob
->get_sbid(), true);
3128 void BlueStore::ExtentMap::encode_spanning_blobs(
3129 bufferlist::contiguous_appender
& p
)
3131 // Version 2 differs from v1 in blob's ref_map
3132 // serialization only. Hence there is no specific
3133 // handling at ExtentMap level.
3137 denc_varint(spanning_blob_map
.size(), p
);
3138 for (auto& i
: spanning_blob_map
) {
3139 denc_varint(i
.second
->id
, p
);
3140 i
.second
->encode(p
, struct_v
, i
.second
->shared_blob
->get_sbid(), true);
3144 void BlueStore::ExtentMap::decode_spanning_blobs(
3145 bufferptr::const_iterator
& p
)
3149 // Version 2 differs from v1 in blob's ref_map
3150 // serialization only. Hence there is no specific
3151 // handling at ExtentMap level.
3152 ceph_assert(struct_v
== 1 || struct_v
== 2);
3157 BlobRef
b(new Blob());
3158 denc_varint(b
->id
, p
);
3159 spanning_blob_map
[b
->id
] = b
;
3161 b
->decode(onode
->c
, p
, struct_v
, &sbid
, true);
3162 onode
->c
->open_shared_blob(sbid
, b
);
3166 void BlueStore::ExtentMap::init_shards(bool loaded
, bool dirty
)
3168 shards
.resize(onode
->onode
.extent_map_shards
.size());
3170 for (auto &s
: onode
->onode
.extent_map_shards
) {
3171 shards
[i
].shard_info
= &s
;
3172 shards
[i
].loaded
= loaded
;
3173 shards
[i
].dirty
= dirty
;
3178 void BlueStore::ExtentMap::fault_range(
3183 dout(30) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
3184 << std::dec
<< dendl
;
3185 auto start
= seek_shard(offset
);
3186 auto last
= seek_shard(offset
+ length
);
3191 ceph_assert(last
>= start
);
3193 while (start
<= last
) {
3194 ceph_assert((size_t)start
< shards
.size());
3195 auto p
= &shards
[start
];
3197 dout(30) << __func__
<< " opening shard 0x" << std::hex
3198 << p
->shard_info
->offset
<< std::dec
<< dendl
;
3200 generate_extent_shard_key_and_apply(
3201 onode
->key
, p
->shard_info
->offset
, &key
,
3202 [&](const string
& final_key
) {
3203 int r
= db
->get(PREFIX_OBJ
, final_key
, &v
);
3205 derr
<< __func__
<< " missing shard 0x" << std::hex
3206 << p
->shard_info
->offset
<< std::dec
<< " for " << onode
->oid
3208 ceph_assert(r
>= 0);
3212 p
->extents
= decode_some(v
);
3214 dout(20) << __func__
<< " open shard 0x" << std::hex
3215 << p
->shard_info
->offset
3216 << " for range 0x" << offset
<< "~" << length
<< std::dec
3217 << " (" << v
.length() << " bytes)" << dendl
;
3218 ceph_assert(p
->dirty
== false);
3219 ceph_assert(v
.length() == p
->shard_info
->bytes
);
3220 onode
->c
->store
->logger
->inc(l_bluestore_onode_shard_misses
);
3222 onode
->c
->store
->logger
->inc(l_bluestore_onode_shard_hits
);
3228 void BlueStore::ExtentMap::dirty_range(
3232 dout(30) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
3233 << std::dec
<< dendl
;
3234 if (shards
.empty()) {
3235 dout(20) << __func__
<< " mark inline shard dirty" << dendl
;
3239 auto start
= seek_shard(offset
);
3243 auto last
= seek_shard(offset
+ length
- 1);
3247 ceph_assert(last
>= start
);
3248 while (start
<= last
) {
3249 ceph_assert((size_t)start
< shards
.size());
3250 auto p
= &shards
[start
];
3252 derr
<< __func__
<< "on write 0x" << std::hex
<< offset
3253 << "~" << length
<< " shard 0x" << p
->shard_info
->offset
3254 << std::dec
<< " is not loaded, can't mark dirty" << dendl
;
3255 ceph_abort_msg("can't mark unloaded shard dirty");
3258 dout(20) << __func__
<< " mark shard 0x" << std::hex
3259 << p
->shard_info
->offset
<< std::dec
<< " dirty" << dendl
;
3266 BlueStore::extent_map_t::iterator
BlueStore::ExtentMap::find(
3269 Extent
dummy(offset
);
3270 return extent_map
.find(dummy
);
3273 BlueStore::extent_map_t::iterator
BlueStore::ExtentMap::seek_lextent(
3276 Extent
dummy(offset
);
3277 auto fp
= extent_map
.lower_bound(dummy
);
3278 if (fp
!= extent_map
.begin()) {
3280 if (fp
->logical_end() <= offset
) {
3287 BlueStore::extent_map_t::const_iterator
BlueStore::ExtentMap::seek_lextent(
3288 uint64_t offset
) const
3290 Extent
dummy(offset
);
3291 auto fp
= extent_map
.lower_bound(dummy
);
3292 if (fp
!= extent_map
.begin()) {
3294 if (fp
->logical_end() <= offset
) {
3301 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset
, uint64_t length
)
3303 auto fp
= seek_lextent(offset
);
3304 if (fp
== extent_map
.end() || fp
->logical_offset
>= offset
+ length
) {
3310 int BlueStore::ExtentMap::compress_extent_map(
3314 if (extent_map
.empty())
3317 auto p
= seek_lextent(offset
);
3318 if (p
!= extent_map
.begin()) {
3319 --p
; // start to the left of offset
3321 // the caller should have just written to this region
3322 ceph_assert(p
!= extent_map
.end());
3324 // identify the *next* shard
3325 auto pshard
= shards
.begin();
3326 while (pshard
!= shards
.end() &&
3327 p
->logical_offset
>= pshard
->shard_info
->offset
) {
3331 if (pshard
!= shards
.end()) {
3332 shard_end
= pshard
->shard_info
->offset
;
3334 shard_end
= OBJECT_MAX_SIZE
;
3338 for (++n
; n
!= extent_map
.end(); p
= n
++) {
3339 if (n
->logical_offset
> offset
+ length
) {
3340 break; // stop after end
3342 while (n
!= extent_map
.end() &&
3343 p
->logical_end() == n
->logical_offset
&&
3344 p
->blob
== n
->blob
&&
3345 p
->blob_offset
+ p
->length
== n
->blob_offset
&&
3346 n
->logical_offset
< shard_end
) {
3347 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
3348 << " next shard 0x" << shard_end
<< std::dec
3349 << " merging " << *p
<< " and " << *n
<< dendl
;
3350 p
->length
+= n
->length
;
3354 if (n
== extent_map
.end()) {
3357 if (n
->logical_offset
>= shard_end
) {
3358 ceph_assert(pshard
!= shards
.end());
3360 if (pshard
!= shards
.end()) {
3361 shard_end
= pshard
->shard_info
->offset
;
3363 shard_end
= OBJECT_MAX_SIZE
;
3368 onode
->c
->store
->logger
->inc(l_bluestore_extent_compress
, removed
);
3373 void BlueStore::ExtentMap::punch_hole(
3377 old_extent_map_t
*old_extents
)
3379 auto p
= seek_lextent(offset
);
3380 uint64_t end
= offset
+ length
;
3381 while (p
!= extent_map
.end()) {
3382 if (p
->logical_offset
>= end
) {
3385 if (p
->logical_offset
< offset
) {
3386 if (p
->logical_end() > end
) {
3387 // split and deref middle
3388 uint64_t front
= offset
- p
->logical_offset
;
3389 OldExtent
* oe
= OldExtent::create(c
, offset
, p
->blob_offset
+ front
,
3391 old_extents
->push_back(*oe
);
3393 p
->blob_offset
+ front
+ length
,
3394 p
->length
- front
- length
,
3400 ceph_assert(p
->logical_end() > offset
); // else seek_lextent bug
3401 uint64_t keep
= offset
- p
->logical_offset
;
3402 OldExtent
* oe
= OldExtent::create(c
, offset
, p
->blob_offset
+ keep
,
3403 p
->length
- keep
, p
->blob
);
3404 old_extents
->push_back(*oe
);
3410 if (p
->logical_offset
+ p
->length
<= end
) {
3411 // deref whole lextent
3412 OldExtent
* oe
= OldExtent::create(c
, p
->logical_offset
, p
->blob_offset
,
3413 p
->length
, p
->blob
);
3414 old_extents
->push_back(*oe
);
3419 uint64_t keep
= p
->logical_end() - end
;
3420 OldExtent
* oe
= OldExtent::create(c
, p
->logical_offset
, p
->blob_offset
,
3421 p
->length
- keep
, p
->blob
);
3422 old_extents
->push_back(*oe
);
3424 add(end
, p
->blob_offset
+ p
->length
- keep
, keep
, p
->blob
);
3430 BlueStore::Extent
*BlueStore::ExtentMap::set_lextent(
3432 uint64_t logical_offset
,
3433 uint64_t blob_offset
, uint64_t length
, BlobRef b
,
3434 old_extent_map_t
*old_extents
)
3436 // We need to have completely initialized Blob to increment its ref counters.
3437 ceph_assert(b
->get_blob().get_logical_length() != 0);
3439 // Do get_ref prior to punch_hole to prevent from putting reused blob into
3440 // old_extents list if we overwre the blob totally
3441 // This might happen during WAL overwrite.
3442 b
->get_ref(onode
->c
, blob_offset
, length
);
3445 punch_hole(c
, logical_offset
, length
, old_extents
);
3448 Extent
*le
= new Extent(logical_offset
, blob_offset
, length
, b
);
3449 extent_map
.insert(*le
);
3450 if (spans_shard(logical_offset
, length
)) {
3451 request_reshard(logical_offset
, logical_offset
+ length
);
3456 BlueStore::BlobRef
BlueStore::ExtentMap::split_blob(
3458 uint32_t blob_offset
,
3461 uint32_t end_pos
= pos
+ lb
->get_blob().get_logical_length() - blob_offset
;
3462 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< " end 0x" << end_pos
3463 << " blob_offset 0x" << blob_offset
<< std::dec
<< " " << *lb
3465 BlobRef rb
= onode
->c
->new_blob();
3466 lb
->split(onode
->c
, blob_offset
, rb
.get());
3468 for (auto ep
= seek_lextent(pos
);
3469 ep
!= extent_map
.end() && ep
->logical_offset
< end_pos
;
3471 if (ep
->blob
!= lb
) {
3474 if (ep
->logical_offset
< pos
) {
3476 size_t left
= pos
- ep
->logical_offset
;
3477 Extent
*ne
= new Extent(pos
, 0, ep
->length
- left
, rb
);
3478 extent_map
.insert(*ne
);
3480 dout(30) << __func__
<< " split " << *ep
<< dendl
;
3481 dout(30) << __func__
<< " to " << *ne
<< dendl
;
3484 ceph_assert(ep
->blob_offset
>= blob_offset
);
3487 ep
->blob_offset
-= blob_offset
;
3488 dout(30) << __func__
<< " adjusted " << *ep
<< dendl
;
3497 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
3500 // A tricky thing about Onode's ref counter is that we do an additional
3501 // increment when newly pinned instance is detected. And -1 on unpin.
3502 // This prevents from a conflict with a delete call (when nref == 0).
3503 // The latter might happen while the thread is in unpin() function
3504 // (and e.g. waiting for lock acquisition) since nref is already
3505 // decremented. And another 'putting' thread on the instance will release it.
3507 void BlueStore::Onode::get() {
3508 if (++nref
>= 2 && !pinned
) {
3509 OnodeCacheShard
* ocs
= c
->get_onode_cache();
3511 // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
3512 while (ocs
!= c
->get_onode_cache()) {
3514 ocs
= c
->get_onode_cache();
3517 bool was_pinned
= pinned
;
3519 // additional increment for newly pinned instance
3520 bool r
= !was_pinned
&& pinned
;
3530 void BlueStore::Onode::put() {
3533 OnodeCacheShard
* ocs
= c
->get_onode_cache();
3535 // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
3536 while (ocs
!= c
->get_onode_cache()) {
3538 ocs
= c
->get_onode_cache();
3541 bool need_unpin
= pinned
;
3542 pinned
= pinned
&& nref
> 2; // intentionally use > not >= as we have
3543 // +1 due to pinned state
3544 need_unpin
= need_unpin
&& !pinned
;
3545 if (cached
&& need_unpin
) {
3549 ocs
->_unpin_and_rm(this);
3550 // remove will also decrement nref and delete Onode
3551 c
->onode_map
._remove(oid
);
3554 // additional decrement for newly unpinned instance
3555 // should be the last action since Onode can be released
3556 // at any point after this decrement
3567 BlueStore::Onode
* BlueStore::Onode::decode(
3569 const ghobject_t
& oid
,
3571 const bufferlist
& v
)
3573 Onode
* on
= new Onode(c
.get(), oid
, key
);
3575 auto p
= v
.front().begin_deep();
3576 on
->onode
.decode(p
);
3577 for (auto& i
: on
->onode
.attrs
) {
3578 i
.second
.reassign_to_mempool(mempool::mempool_bluestore_cache_meta
);
3581 // initialize extent_map
3582 on
->extent_map
.decode_spanning_blobs(p
);
3583 if (on
->onode
.extent_map_shards
.empty()) {
3584 denc(on
->extent_map
.inline_bl
, p
);
3585 on
->extent_map
.decode_some(on
->extent_map
.inline_bl
);
3586 on
->extent_map
.inline_bl
.reassign_to_mempool(
3587 mempool::mempool_bluestore_cache_data
);
3590 on
->extent_map
.init_shards(false, false);
3595 void BlueStore::Onode::flush()
3597 if (flushing_count
.load()) {
3598 ldout(c
->store
->cct
, 20) << __func__
<< " cnt:" << flushing_count
<< dendl
;
3600 std::unique_lock
l(flush_lock
);
3601 while (flushing_count
.load()) {
3606 ldout(c
->store
->cct
, 20) << __func__
<< " done" << dendl
;
3609 void BlueStore::Onode::dump(Formatter
* f
) const
3615 const std::string
& BlueStore::Onode::calc_omap_prefix(uint8_t flags
)
3617 if (bluestore_onode_t::is_pgmeta_omap(flags
)) {
3618 return PREFIX_PGMETA_OMAP
;
3620 if (bluestore_onode_t::is_perpg_omap(flags
)) {
3621 return PREFIX_PERPG_OMAP
;
3623 if (bluestore_onode_t::is_perpool_omap(flags
)) {
3624 return PREFIX_PERPOOL_OMAP
;
3630 void BlueStore::Onode::calc_omap_header(
3635 if (!bluestore_onode_t::is_pgmeta_omap(flags
)) {
3636 if (bluestore_onode_t::is_perpg_omap(flags
)) {
3637 _key_encode_u64(o
->c
->pool(), out
);
3638 _key_encode_u32(o
->oid
.hobj
.get_bitwise_key_u32(), out
);
3639 } else if (bluestore_onode_t::is_perpool_omap(flags
)) {
3640 _key_encode_u64(o
->c
->pool(), out
);
3643 _key_encode_u64(o
->onode
.nid
, out
);
3644 out
->push_back('-');
3647 void BlueStore::Onode::calc_omap_key(uint8_t flags
,
3649 const std::string
& key
,
3652 if (!bluestore_onode_t::is_pgmeta_omap(flags
)) {
3653 if (bluestore_onode_t::is_perpg_omap(flags
)) {
3654 _key_encode_u64(o
->c
->pool(), out
);
3655 _key_encode_u32(o
->oid
.hobj
.get_bitwise_key_u32(), out
);
3656 } else if (bluestore_onode_t::is_perpool_omap(flags
)) {
3657 _key_encode_u64(o
->c
->pool(), out
);
3660 _key_encode_u64(o
->onode
.nid
, out
);
3661 out
->push_back('.');
3665 void BlueStore::Onode::rewrite_omap_key(const string
& old
, string
*out
)
3667 if (!onode
.is_pgmeta_omap()) {
3668 if (onode
.is_perpg_omap()) {
3669 _key_encode_u64(c
->pool(), out
);
3670 _key_encode_u32(oid
.hobj
.get_bitwise_key_u32(), out
);
3671 } else if (onode
.is_perpool_omap()) {
3672 _key_encode_u64(c
->pool(), out
);
3675 _key_encode_u64(onode
.nid
, out
);
3676 out
->append(old
.c_str() + out
->length(), old
.size() - out
->length());
3679 void BlueStore::Onode::calc_omap_tail(
3684 if (!bluestore_onode_t::is_pgmeta_omap(flags
)) {
3685 if (bluestore_onode_t::is_perpg_omap(flags
)) {
3686 _key_encode_u64(o
->c
->pool(), out
);
3687 _key_encode_u32(o
->oid
.hobj
.get_bitwise_key_u32(), out
);
3688 } else if (bluestore_onode_t::is_perpool_omap(flags
)) {
3689 _key_encode_u64(o
->c
->pool(), out
);
3692 _key_encode_u64(o
->onode
.nid
, out
);
3693 out
->push_back('~');
3696 void BlueStore::Onode::decode_omap_key(const string
& key
, string
*user_key
)
3698 size_t pos
= sizeof(uint64_t) + 1;
3699 if (!onode
.is_pgmeta_omap()) {
3700 if (onode
.is_perpg_omap()) {
3701 pos
+= sizeof(uint64_t) + sizeof(uint32_t);
3702 } else if (onode
.is_perpool_omap()) {
3703 pos
+= sizeof(uint64_t);
3706 *user_key
= key
.substr(pos
);
3709 // =======================================================
3712 /// Checks for writes to the same pextent within a blob
3713 bool BlueStore::WriteContext::has_conflict(
3717 uint64_t min_alloc_size
)
3719 ceph_assert((loffs
% min_alloc_size
) == 0);
3720 ceph_assert((loffs_end
% min_alloc_size
) == 0);
3721 for (auto w
: writes
) {
3723 auto loffs2
= p2align(w
.logical_offset
, min_alloc_size
);
3724 auto loffs2_end
= p2roundup(w
.logical_offset
+ w
.length0
, min_alloc_size
);
3725 if ((loffs
<= loffs2
&& loffs_end
> loffs2
) ||
3726 (loffs
>= loffs2
&& loffs
< loffs2_end
)) {
3734 // =======================================================
3738 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
3740 #define dout_context cct
3742 void BlueStore::DeferredBatch::prepare_write(
3744 uint64_t seq
, uint64_t offset
, uint64_t length
,
3745 bufferlist::const_iterator
& blp
)
3747 _discard(cct
, offset
, length
);
3748 auto i
= iomap
.insert(make_pair(offset
, deferred_io()));
3749 ceph_assert(i
.second
); // this should be a new insertion
3750 i
.first
->second
.seq
= seq
;
3751 blp
.copy(length
, i
.first
->second
.bl
);
3752 i
.first
->second
.bl
.reassign_to_mempool(
3753 mempool::mempool_bluestore_writing_deferred
);
3754 dout(20) << __func__
<< " seq " << seq
3755 << " 0x" << std::hex
<< offset
<< "~" << length
3756 << " crc " << i
.first
->second
.bl
.crc32c(-1)
3757 << std::dec
<< dendl
;
3758 seq_bytes
[seq
] += length
;
3759 #ifdef DEBUG_DEFERRED
3764 void BlueStore::DeferredBatch::_discard(
3765 CephContext
*cct
, uint64_t offset
, uint64_t length
)
3767 generic_dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
3768 << std::dec
<< dendl
;
3769 auto p
= iomap
.lower_bound(offset
);
3770 if (p
!= iomap
.begin()) {
3772 auto end
= p
->first
+ p
->second
.bl
.length();
3775 head
.substr_of(p
->second
.bl
, 0, offset
- p
->first
);
3776 dout(20) << __func__
<< " keep head " << p
->second
.seq
3777 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3778 << " -> 0x" << head
.length() << std::dec
<< dendl
;
3779 auto i
= seq_bytes
.find(p
->second
.seq
);
3780 ceph_assert(i
!= seq_bytes
.end());
3781 if (end
> offset
+ length
) {
3783 tail
.substr_of(p
->second
.bl
, offset
+ length
- p
->first
,
3784 end
- (offset
+ length
));
3785 dout(20) << __func__
<< " keep tail " << p
->second
.seq
3786 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3787 << " -> 0x" << tail
.length() << std::dec
<< dendl
;
3788 auto &n
= iomap
[offset
+ length
];
3790 n
.seq
= p
->second
.seq
;
3791 i
->second
-= length
;
3793 i
->second
-= end
- offset
;
3795 ceph_assert(i
->second
>= 0);
3796 p
->second
.bl
.swap(head
);
3800 while (p
!= iomap
.end()) {
3801 if (p
->first
>= offset
+ length
) {
3804 auto i
= seq_bytes
.find(p
->second
.seq
);
3805 ceph_assert(i
!= seq_bytes
.end());
3806 auto end
= p
->first
+ p
->second
.bl
.length();
3807 if (end
> offset
+ length
) {
3808 unsigned drop_front
= offset
+ length
- p
->first
;
3809 unsigned keep_tail
= end
- (offset
+ length
);
3810 dout(20) << __func__
<< " truncate front " << p
->second
.seq
3811 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3812 << " drop_front 0x" << drop_front
<< " keep_tail 0x" << keep_tail
3813 << " to 0x" << (offset
+ length
) << "~" << keep_tail
3814 << std::dec
<< dendl
;
3815 auto &s
= iomap
[offset
+ length
];
3816 s
.seq
= p
->second
.seq
;
3817 s
.bl
.substr_of(p
->second
.bl
, drop_front
, keep_tail
);
3818 i
->second
-= drop_front
;
3820 dout(20) << __func__
<< " drop " << p
->second
.seq
3821 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3822 << std::dec
<< dendl
;
3823 i
->second
-= p
->second
.bl
.length();
3825 ceph_assert(i
->second
>= 0);
3830 void BlueStore::DeferredBatch::_audit(CephContext
*cct
)
3832 map
<uint64_t,int> sb
;
3833 for (auto p
: seq_bytes
) {
3834 sb
[p
.first
] = 0; // make sure we have the same set of keys
3837 for (auto& p
: iomap
) {
3838 ceph_assert(p
.first
>= pos
);
3839 sb
[p
.second
.seq
] += p
.second
.bl
.length();
3840 pos
= p
.first
+ p
.second
.bl
.length();
3842 ceph_assert(sb
== seq_bytes
);
3849 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
// BlueStore::Collection constructor: binds the collection to its owning
// store, its onode/buffer cache shards, and the collection id, and starts
// with no commit queue.
// NOTE(review): this span is a lossy extraction — original lines 3853-3856
// (the member initializers between CollectionImpl(...) and commit_queue,
// presumably binding store_, oc and bc to members) were dropped and are not
// visible here; confirm against the full file before relying on this span.
3851 BlueStore::Collection::Collection(BlueStore
*store_
, OnodeCacheShard
*oc
, BufferCacheShard
*bc
, coll_t cid
)
3852 : CollectionImpl(store_
->cct
, cid
),
3857 commit_queue(nullptr)
3861 bool BlueStore::Collection::flush_commit(Context
*c
)
3863 return osr
->flush_commit(c
);
3866 void BlueStore::Collection::flush()
3871 void BlueStore::Collection::flush_all_but_last()
3873 osr
->flush_all_but_last();
3876 void BlueStore::Collection::open_shared_blob(uint64_t sbid
, BlobRef b
)
3878 ceph_assert(!b
->shared_blob
);
3879 const bluestore_blob_t
& blob
= b
->get_blob();
3880 if (!blob
.is_shared()) {
3881 b
->shared_blob
= new SharedBlob(this);
3885 b
->shared_blob
= shared_blob_set
.lookup(sbid
);
3886 if (b
->shared_blob
) {
3887 ldout(store
->cct
, 10) << __func__
<< " sbid 0x" << std::hex
<< sbid
3888 << std::dec
<< " had " << *b
->shared_blob
<< dendl
;
3890 b
->shared_blob
= new SharedBlob(sbid
, this);
3891 shared_blob_set
.add(this, b
->shared_blob
.get());
3892 ldout(store
->cct
, 10) << __func__
<< " sbid 0x" << std::hex
<< sbid
3893 << std::dec
<< " opened " << *b
->shared_blob
3898 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb
)
3900 if (!sb
->is_loaded()) {
3904 auto sbid
= sb
->get_sbid();
3905 get_shared_blob_key(sbid
, &key
);
3906 int r
= store
->db
->get(PREFIX_SHARED_BLOB
, key
, &v
);
3908 lderr(store
->cct
) << __func__
<< " sbid 0x" << std::hex
<< sbid
3909 << std::dec
<< " not found at key "
3910 << pretty_binary_string(key
) << dendl
;
3911 ceph_abort_msg("uh oh, missing shared_blob");
3915 sb
->persistent
= new bluestore_shared_blob_t(sbid
);
3916 auto p
= v
.cbegin();
3917 decode(*(sb
->persistent
), p
);
3918 ldout(store
->cct
, 10) << __func__
<< " sbid 0x" << std::hex
<< sbid
3919 << std::dec
<< " loaded shared_blob " << *sb
<< dendl
;
3923 void BlueStore::Collection::make_blob_shared(uint64_t sbid
, BlobRef b
)
3925 ldout(store
->cct
, 10) << __func__
<< " " << *b
<< dendl
;
3926 ceph_assert(!b
->shared_blob
->is_loaded());
3929 bluestore_blob_t
& blob
= b
->dirty_blob();
3930 blob
.set_flag(bluestore_blob_t::FLAG_SHARED
);
3932 // update shared blob
3933 b
->shared_blob
->loaded
= true;
3934 b
->shared_blob
->persistent
= new bluestore_shared_blob_t(sbid
);
3935 shared_blob_set
.add(this, b
->shared_blob
.get());
3936 for (auto p
: blob
.get_extents()) {
3938 b
->shared_blob
->get_ref(
3943 ldout(store
->cct
, 20) << __func__
<< " now " << *b
<< dendl
;
3946 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob
*sb
)
3948 ldout(store
->cct
, 10) << __func__
<< " " << *sb
<< dendl
;
3949 ceph_assert(sb
->is_loaded());
3951 uint64_t sbid
= sb
->get_sbid();
3952 shared_blob_set
.remove(sb
);
3954 delete sb
->persistent
;
3955 sb
->sbid_unloaded
= 0;
3956 ldout(store
->cct
, 20) << __func__
<< " now " << *sb
<< dendl
;
3960 BlueStore::OnodeRef
BlueStore::Collection::get_onode(
3961 const ghobject_t
& oid
,
3965 ceph_assert(create
? ceph_mutex_is_wlocked(lock
) : ceph_mutex_is_locked(lock
));
3968 if (cid
.is_pg(&pgid
)) {
3969 if (!oid
.match(cnode
.bits
, pgid
.ps())) {
3970 lderr(store
->cct
) << __func__
<< " oid " << oid
<< " not part of "
3971 << pgid
<< " bits " << cnode
.bits
<< dendl
;
3976 OnodeRef o
= onode_map
.lookup(oid
);
3981 get_object_key(store
->cct
, oid
, &key
);
3983 ldout(store
->cct
, 20) << __func__
<< " oid " << oid
<< " key "
3984 << pretty_binary_string(key
) << dendl
;
3990 r
= store
->db
->get(PREFIX_OBJ
, key
.c_str(), key
.size(), &v
);
3991 ldout(store
->cct
, 20) << " r " << r
<< " v.len " << v
.length() << dendl
;
3993 if (v
.length() == 0) {
3994 ceph_assert(r
== -ENOENT
);
3998 // new object, new onode
3999 on
= new Onode(this, oid
, key
);
4002 ceph_assert(r
>= 0);
4003 on
= Onode::decode(this, oid
, key
, v
);
4006 return onode_map
.add(oid
, o
);
4009 void BlueStore::Collection::split_cache(
4012 ldout(store
->cct
, 10) << __func__
<< " to " << dest
<< dendl
;
4014 auto *ocache
= get_onode_cache();
4015 auto *ocache_dest
= dest
->get_onode_cache();
4017 // lock cache shards
4018 std::lock(ocache
->lock
, ocache_dest
->lock
, cache
->lock
, dest
->cache
->lock
);
4019 std::lock_guard
l(ocache
->lock
, std::adopt_lock
);
4020 std::lock_guard
l2(ocache_dest
->lock
, std::adopt_lock
);
4021 std::lock_guard
l3(cache
->lock
, std::adopt_lock
);
4022 std::lock_guard
l4(dest
->cache
->lock
, std::adopt_lock
);
4024 int destbits
= dest
->cnode
.bits
;
4026 bool is_pg
= dest
->cid
.is_pg(&destpg
);
4029 auto p
= onode_map
.onode_map
.begin();
4030 while (p
!= onode_map
.onode_map
.end()) {
4031 OnodeRef o
= p
->second
;
4032 if (!p
->second
->oid
.match(destbits
, destpg
.pgid
.ps())) {
4033 // onode does not belong to this child
4034 ldout(store
->cct
, 20) << __func__
<< " not moving " << o
<< " " << o
->oid
4038 ldout(store
->cct
, 20) << __func__
<< " moving " << o
<< " " << o
->oid
4041 // ensuring that nref is always >= 2 and hence onode is pinned and
4042 // physically out of cache during the transition
4044 ceph_assert(o
->pinned
);
4046 p
= onode_map
.onode_map
.erase(p
);
4047 dest
->onode_map
.onode_map
[o
->oid
] = o
;
4049 get_onode_cache()->move_pinned(dest
->get_onode_cache(), o
.get());
4053 // move over shared blobs and buffers. cover shared blobs from
4054 // both extent map and spanning blob map (the full extent map
4055 // may not be faulted in)
4056 vector
<SharedBlob
*> sbvec
;
4057 for (auto& e
: o
->extent_map
.extent_map
) {
4058 sbvec
.push_back(e
.blob
->shared_blob
.get());
4060 for (auto& b
: o
->extent_map
.spanning_blob_map
) {
4061 sbvec
.push_back(b
.second
->shared_blob
.get());
4063 for (auto sb
: sbvec
) {
4064 if (sb
->coll
== dest
) {
4065 ldout(store
->cct
, 20) << __func__
<< " already moved " << *sb
4069 ldout(store
->cct
, 20) << __func__
<< " moving " << *sb
<< dendl
;
4070 if (sb
->get_sbid()) {
4071 ldout(store
->cct
, 20) << __func__
4072 << " moving registration " << *sb
<< dendl
;
4073 shared_blob_set
.remove(sb
);
4074 dest
->shared_blob_set
.add(dest
, sb
);
4077 if (dest
->cache
!= cache
) {
4078 for (auto& i
: sb
->bc
.buffer_map
) {
4079 if (!i
.second
->is_writing()) {
4080 ldout(store
->cct
, 20) << __func__
<< " moving " << *i
.second
4082 dest
->cache
->_move(cache
, i
.second
.get());
4089 dest
->cache
->_trim();
4092 // =======================================================
4097 #define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
4099 #define dout_context store->cct
4101 void *BlueStore::MempoolThread::entry()
4103 std::unique_lock l
{lock
};
4105 uint32_t prev_config_change
= store
->config_changed
.load();
4106 uint64_t base
= store
->osd_memory_base
;
4107 double fragmentation
= store
->osd_memory_expected_fragmentation
;
4108 uint64_t target
= store
->osd_memory_target
;
4109 uint64_t min
= store
->osd_memory_cache_min
;
4112 // When setting the maximum amount of memory to use for cache, first
4113 // assume some base amount of memory for the OSD and then fudge in
4114 // some overhead for fragmentation that scales with cache usage.
4115 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
4116 if (ltarget
> base
+ min
) {
4117 max
= ltarget
- base
;
4120 binned_kv_cache
= store
->db
->get_priority_cache();
4121 binned_kv_onode_cache
= store
->db
->get_priority_cache(PREFIX_OBJ
);
4122 if (store
->cache_autotune
&& binned_kv_cache
!= nullptr) {
4123 pcm
= std::make_shared
<PriorityCache::Manager
>(
4124 store
->cct
, min
, max
, target
, true, "bluestore-pricache");
4125 pcm
->insert("kv", binned_kv_cache
, true);
4126 pcm
->insert("meta", meta_cache
, true);
4127 pcm
->insert("data", data_cache
, true);
4128 if (binned_kv_onode_cache
!= nullptr) {
4129 pcm
->insert("kv_onode", binned_kv_onode_cache
, true);
4133 utime_t next_balance
= ceph_clock_now();
4134 utime_t next_resize
= ceph_clock_now();
4135 utime_t next_deferred_force_submit
= ceph_clock_now();
4136 utime_t alloc_stats_dump_clock
= ceph_clock_now();
4138 bool interval_stats_trim
= false;
4140 // Update pcm cache settings if related configuration was changed
4141 uint32_t cur_config_change
= store
->config_changed
.load();
4142 if (cur_config_change
!= prev_config_change
) {
4143 _update_cache_settings();
4144 prev_config_change
= cur_config_change
;
4147 // Before we trim, check and see if it's time to rebalance/resize.
4148 double autotune_interval
= store
->cache_autotune_interval
;
4149 double resize_interval
= store
->osd_memory_cache_resize_interval
;
4150 double max_defer_interval
= store
->max_defer_interval
;
4152 double alloc_stats_dump_interval
=
4153 store
->cct
->_conf
->bluestore_alloc_stats_dump_interval
;
4155 if (alloc_stats_dump_interval
> 0 &&
4156 alloc_stats_dump_clock
+ alloc_stats_dump_interval
< ceph_clock_now()) {
4157 store
->_record_allocation_stats();
4158 alloc_stats_dump_clock
= ceph_clock_now();
4160 if (autotune_interval
> 0 && next_balance
< ceph_clock_now()) {
4161 _adjust_cache_settings();
4163 // Log events at 5 instead of 20 when balance happens.
4164 interval_stats_trim
= true;
4166 if (pcm
!= nullptr) {
4170 next_balance
= ceph_clock_now();
4171 next_balance
+= autotune_interval
;
4173 if (resize_interval
> 0 && next_resize
< ceph_clock_now()) {
4174 if (ceph_using_tcmalloc() && pcm
!= nullptr) {
4177 next_resize
= ceph_clock_now();
4178 next_resize
+= resize_interval
;
4181 if (max_defer_interval
> 0 &&
4182 next_deferred_force_submit
< ceph_clock_now()) {
4183 if (store
->get_deferred_last_submitted() + max_defer_interval
<
4185 store
->deferred_try_submit();
4187 next_deferred_force_submit
= ceph_clock_now();
4188 next_deferred_force_submit
+= max_defer_interval
/3;
4191 // Now Resize the shards
4192 _resize_shards(interval_stats_trim
);
4193 interval_stats_trim
= false;
4195 store
->_update_cache_logger();
4196 auto wait
= ceph::make_timespan(
4197 store
->cct
->_conf
->bluestore_cache_trim_interval
);
4198 cond
.wait_for(l
, wait
);
4201 store
->_record_allocation_stats();
4207 void BlueStore::MempoolThread::_adjust_cache_settings()
4209 if (binned_kv_cache
!= nullptr) {
4210 binned_kv_cache
->set_cache_ratio(store
->cache_kv_ratio
);
4212 if (binned_kv_onode_cache
!= nullptr) {
4213 binned_kv_onode_cache
->set_cache_ratio(store
->cache_kv_onode_ratio
);
4215 meta_cache
->set_cache_ratio(store
->cache_meta_ratio
);
4216 data_cache
->set_cache_ratio(store
->cache_data_ratio
);
4219 void BlueStore::MempoolThread::_resize_shards(bool interval_stats
)
4221 size_t onode_shards
= store
->onode_cache_shards
.size();
4222 size_t buffer_shards
= store
->buffer_cache_shards
.size();
4223 int64_t kv_used
= store
->db
->get_cache_usage();
4224 int64_t kv_onode_used
= store
->db
->get_cache_usage(PREFIX_OBJ
);
4225 int64_t meta_used
= meta_cache
->_get_used_bytes();
4226 int64_t data_used
= data_cache
->_get_used_bytes();
4228 uint64_t cache_size
= store
->cache_size
;
4230 static_cast<int64_t>(store
->cache_kv_ratio
* cache_size
);
4231 int64_t kv_onode_alloc
=
4232 static_cast<int64_t>(store
->cache_kv_onode_ratio
* cache_size
);
4233 int64_t meta_alloc
=
4234 static_cast<int64_t>(store
->cache_meta_ratio
* cache_size
);
4235 int64_t data_alloc
=
4236 static_cast<int64_t>(store
->cache_data_ratio
* cache_size
);
4238 if (pcm
!= nullptr && binned_kv_cache
!= nullptr) {
4239 cache_size
= pcm
->get_tuned_mem();
4240 kv_alloc
= binned_kv_cache
->get_committed_size();
4241 meta_alloc
= meta_cache
->get_committed_size();
4242 data_alloc
= data_cache
->get_committed_size();
4243 if (binned_kv_onode_cache
!= nullptr) {
4244 kv_onode_alloc
= binned_kv_onode_cache
->get_committed_size();
4248 if (interval_stats
) {
4249 dout(5) << __func__
<< " cache_size: " << cache_size
4250 << " kv_alloc: " << kv_alloc
4251 << " kv_used: " << kv_used
4252 << " kv_onode_alloc: " << kv_onode_alloc
4253 << " kv_onode_used: " << kv_onode_used
4254 << " meta_alloc: " << meta_alloc
4255 << " meta_used: " << meta_used
4256 << " data_alloc: " << data_alloc
4257 << " data_used: " << data_used
<< dendl
;
4259 dout(20) << __func__
<< " cache_size: " << cache_size
4260 << " kv_alloc: " << kv_alloc
4261 << " kv_used: " << kv_used
4262 << " kv_onode_alloc: " << kv_onode_alloc
4263 << " kv_onode_used: " << kv_onode_used
4264 << " meta_alloc: " << meta_alloc
4265 << " meta_used: " << meta_used
4266 << " data_alloc: " << data_alloc
4267 << " data_used: " << data_used
<< dendl
;
4270 uint64_t max_shard_onodes
= static_cast<uint64_t>(
4271 (meta_alloc
/ (double) onode_shards
) / meta_cache
->get_bytes_per_onode());
4272 uint64_t max_shard_buffer
= static_cast<uint64_t>(data_alloc
/ buffer_shards
);
4274 dout(30) << __func__
<< " max_shard_onodes: " << max_shard_onodes
4275 << " max_shard_buffer: " << max_shard_buffer
<< dendl
;
4277 for (auto i
: store
->onode_cache_shards
) {
4278 i
->set_max(max_shard_onodes
);
4280 for (auto i
: store
->buffer_cache_shards
) {
4281 i
->set_max(max_shard_buffer
);
4285 void BlueStore::MempoolThread::_update_cache_settings()
4287 // Nothing to do if pcm is not used.
4288 if (pcm
== nullptr) {
4292 uint64_t target
= store
->osd_memory_target
;
4293 uint64_t base
= store
->osd_memory_base
;
4294 uint64_t min
= store
->osd_memory_cache_min
;
4296 double fragmentation
= store
->osd_memory_expected_fragmentation
;
4298 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
4299 if (ltarget
> base
+ min
) {
4300 max
= ltarget
- base
;
4303 // set pcm cache levels
4304 pcm
->set_target_memory(target
);
4305 pcm
->set_min_memory(min
);
4306 pcm
->set_max_memory(max
);
4308 dout(5) << __func__
<< " updated pcm target: " << target
4309 << " pcm min: " << min
4310 << " pcm max: " << max
4314 // =======================================================
4319 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
4321 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
4322 CollectionRef c
, OnodeRef o
, KeyValueDB::Iterator it
)
4323 : c(c
), o(o
), it(it
)
4325 std::shared_lock
l(c
->lock
);
4326 if (o
->onode
.has_omap()) {
4327 o
->get_omap_key(string(), &head
);
4328 o
->get_omap_tail(&tail
);
4329 it
->lower_bound(head
);
4333 string
BlueStore::OmapIteratorImpl::_stringify() const
4336 s
<< " omap_iterator(cid = " << c
->cid
4337 <<", oid = " << o
->oid
<< ")";
4341 int BlueStore::OmapIteratorImpl::seek_to_first()
4343 std::shared_lock
l(c
->lock
);
4344 auto start1
= mono_clock::now();
4345 if (o
->onode
.has_omap()) {
4346 it
->lower_bound(head
);
4348 it
= KeyValueDB::Iterator();
4350 c
->store
->log_latency(
4352 l_bluestore_omap_seek_to_first_lat
,
4353 mono_clock::now() - start1
,
4354 c
->store
->cct
->_conf
->bluestore_log_omap_iterator_age
);
4359 int BlueStore::OmapIteratorImpl::upper_bound(const string
& after
)
4361 std::shared_lock
l(c
->lock
);
4362 auto start1
= mono_clock::now();
4363 if (o
->onode
.has_omap()) {
4365 o
->get_omap_key(after
, &key
);
4366 ldout(c
->store
->cct
,20) << __func__
<< " after " << after
<< " key "
4367 << pretty_binary_string(key
) << dendl
;
4368 it
->upper_bound(key
);
4370 it
= KeyValueDB::Iterator();
4372 c
->store
->log_latency_fn(
4374 l_bluestore_omap_upper_bound_lat
,
4375 mono_clock::now() - start1
,
4376 c
->store
->cct
->_conf
->bluestore_log_omap_iterator_age
,
4377 [&] (const ceph::timespan
& lat
) {
4378 return ", after = " + after
+
4385 int BlueStore::OmapIteratorImpl::lower_bound(const string
& to
)
4387 std::shared_lock
l(c
->lock
);
4388 auto start1
= mono_clock::now();
4389 if (o
->onode
.has_omap()) {
4391 o
->get_omap_key(to
, &key
);
4392 ldout(c
->store
->cct
,20) << __func__
<< " to " << to
<< " key "
4393 << pretty_binary_string(key
) << dendl
;
4394 it
->lower_bound(key
);
4396 it
= KeyValueDB::Iterator();
4398 c
->store
->log_latency_fn(
4400 l_bluestore_omap_lower_bound_lat
,
4401 mono_clock::now() - start1
,
4402 c
->store
->cct
->_conf
->bluestore_log_omap_iterator_age
,
4403 [&] (const ceph::timespan
& lat
) {
4404 return ", to = " + to
+
4411 bool BlueStore::OmapIteratorImpl::valid()
4413 std::shared_lock
l(c
->lock
);
4414 bool r
= o
->onode
.has_omap() && it
&& it
->valid() &&
4415 it
->raw_key().second
< tail
;
4416 if (it
&& it
->valid()) {
4417 ldout(c
->store
->cct
,20) << __func__
<< " is at "
4418 << pretty_binary_string(it
->raw_key().second
)
4424 int BlueStore::OmapIteratorImpl::next()
4427 std::shared_lock
l(c
->lock
);
4428 auto start1
= mono_clock::now();
4429 if (o
->onode
.has_omap()) {
4433 c
->store
->log_latency(
4435 l_bluestore_omap_next_lat
,
4436 mono_clock::now() - start1
,
4437 c
->store
->cct
->_conf
->bluestore_log_omap_iterator_age
);
4442 string
BlueStore::OmapIteratorImpl::key()
4444 std::shared_lock
l(c
->lock
);
4445 ceph_assert(it
->valid());
4446 string db_key
= it
->raw_key().second
;
4448 o
->decode_omap_key(db_key
, &user_key
);
4453 bufferlist
BlueStore::OmapIteratorImpl::value()
4455 std::shared_lock
l(c
->lock
);
4456 ceph_assert(it
->valid());
4461 // =====================================
4464 #define dout_prefix *_dout << "bluestore(" << path << ") "
4466 #define dout_context cct
4469 static void aio_cb(void *priv
, void *priv2
)
4471 BlueStore
*store
= static_cast<BlueStore
*>(priv
);
4472 BlueStore::AioContext
*c
= static_cast<BlueStore::AioContext
*>(priv2
);
4473 c
->aio_finish(store
);
4476 static void discard_cb(void *priv
, void *priv2
)
4478 BlueStore
*store
= static_cast<BlueStore
*>(priv
);
4479 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
4480 store
->handle_discard(*tmp
);
4483 void BlueStore::handle_discard(interval_set
<uint64_t>& to_release
)
4485 dout(10) << __func__
<< dendl
;
4486 ceph_assert(shared_alloc
.a
);
4487 shared_alloc
.a
->release(to_release
);
4490 BlueStore::BlueStore(CephContext
*cct
, const string
& path
)
4491 : BlueStore(cct
, path
, 0) {}
// Primary BlueStore constructor: seeds the worker threads, records the
// (explicit) minimum allocation size and its log2, registers this store as
// a config observer, and sets up a single cache shard by default.
// NOTE(review): this span is a lossy extraction — the original parameter
// line 4494 (presumably `const std::string& path,`, which the visible
// ObjectStore(cct, path) initializer requires) and initializer line 4497
// were dropped and are not visible here; confirm against the full file.
4493 BlueStore::BlueStore(CephContext
*cct
,
4495 uint64_t _min_alloc_size
)
4496 : ObjectStore(cct
, path
),
4498 finisher(cct
, "commit_finisher", "cfin"),
4499 kv_sync_thread(this),
4500 kv_finalize_thread(this),
4501 zoned_cleaner_thread(this),
4502 min_alloc_size(_min_alloc_size
),
4503 min_alloc_size_order(ctz(_min_alloc_size
)),
4504 mempool_thread(this)
4507 cct
->_conf
.add_observer(this);
4508 set_cache_shards(1);
4511 BlueStore::~BlueStore()
4513 cct
->_conf
.remove_observer(this);
4515 ceph_assert(!mounted
);
4516 ceph_assert(db
== NULL
);
4517 ceph_assert(bluefs
== NULL
);
4518 ceph_assert(fsid_fd
< 0);
4519 ceph_assert(path_fd
< 0);
4520 for (auto i
: onode_cache_shards
) {
4523 for (auto i
: buffer_cache_shards
) {
4526 onode_cache_shards
.clear();
4527 buffer_cache_shards
.clear();
4530 const char **BlueStore::get_tracked_conf_keys() const
4532 static const char* KEYS
[] = {
4533 "bluestore_csum_type",
4534 "bluestore_compression_mode",
4535 "bluestore_compression_algorithm",
4536 "bluestore_compression_min_blob_size",
4537 "bluestore_compression_min_blob_size_ssd",
4538 "bluestore_compression_min_blob_size_hdd",
4539 "bluestore_compression_max_blob_size",
4540 "bluestore_compression_max_blob_size_ssd",
4541 "bluestore_compression_max_blob_size_hdd",
4542 "bluestore_compression_required_ratio",
4543 "bluestore_max_alloc_size",
4544 "bluestore_prefer_deferred_size",
4545 "bluestore_prefer_deferred_size_hdd",
4546 "bluestore_prefer_deferred_size_ssd",
4547 "bluestore_deferred_batch_ops",
4548 "bluestore_deferred_batch_ops_hdd",
4549 "bluestore_deferred_batch_ops_ssd",
4550 "bluestore_throttle_bytes",
4551 "bluestore_throttle_deferred_bytes",
4552 "bluestore_throttle_cost_per_io_hdd",
4553 "bluestore_throttle_cost_per_io_ssd",
4554 "bluestore_throttle_cost_per_io",
4555 "bluestore_max_blob_size",
4556 "bluestore_max_blob_size_ssd",
4557 "bluestore_max_blob_size_hdd",
4558 "osd_memory_target",
4559 "osd_memory_target_cgroup_limit_ratio",
4561 "osd_memory_cache_min",
4562 "osd_memory_expected_fragmentation",
4563 "bluestore_cache_autotune",
4564 "bluestore_cache_autotune_interval",
4565 "bluestore_warn_on_legacy_statfs",
4566 "bluestore_warn_on_no_per_pool_omap",
4567 "bluestore_max_defer_interval",
4573 void BlueStore::handle_conf_change(const ConfigProxy
& conf
,
4574 const std::set
<std::string
> &changed
)
4576 if (changed
.count("bluestore_warn_on_legacy_statfs")) {
4577 _check_legacy_statfs_alert();
4579 if (changed
.count("bluestore_warn_on_no_per_pool_omap") ||
4580 changed
.count("bluestore_warn_on_no_per_pg_omap")) {
4581 _check_no_per_pg_or_pool_omap_alert();
4584 if (changed
.count("bluestore_csum_type")) {
4587 if (changed
.count("bluestore_compression_mode") ||
4588 changed
.count("bluestore_compression_algorithm") ||
4589 changed
.count("bluestore_compression_min_blob_size") ||
4590 changed
.count("bluestore_compression_max_blob_size")) {
4595 if (changed
.count("bluestore_max_blob_size") ||
4596 changed
.count("bluestore_max_blob_size_ssd") ||
4597 changed
.count("bluestore_max_blob_size_hdd")) {
4599 // only after startup
4603 if (changed
.count("bluestore_prefer_deferred_size") ||
4604 changed
.count("bluestore_prefer_deferred_size_hdd") ||
4605 changed
.count("bluestore_prefer_deferred_size_ssd") ||
4606 changed
.count("bluestore_max_alloc_size") ||
4607 changed
.count("bluestore_deferred_batch_ops") ||
4608 changed
.count("bluestore_deferred_batch_ops_hdd") ||
4609 changed
.count("bluestore_deferred_batch_ops_ssd")) {
4611 // only after startup
4615 if (changed
.count("bluestore_throttle_cost_per_io") ||
4616 changed
.count("bluestore_throttle_cost_per_io_hdd") ||
4617 changed
.count("bluestore_throttle_cost_per_io_ssd")) {
4619 _set_throttle_params();
4622 if (changed
.count("bluestore_throttle_bytes") ||
4623 changed
.count("bluestore_throttle_deferred_bytes") ||
4624 changed
.count("bluestore_throttle_trace_rate")) {
4625 throttle
.reset_throttle(conf
);
4627 if (changed
.count("bluestore_max_defer_interval")) {
4629 _set_max_defer_interval();
4632 if (changed
.count("osd_memory_target") ||
4633 changed
.count("osd_memory_base") ||
4634 changed
.count("osd_memory_cache_min") ||
4635 changed
.count("osd_memory_expected_fragmentation")) {
4636 _update_osd_memory_options();
4640 void BlueStore::_set_compression()
4642 auto m
= Compressor::get_comp_mode_type(cct
->_conf
->bluestore_compression_mode
);
4644 _clear_compression_alert();
4647 derr
<< __func__
<< " unrecognized value '"
4648 << cct
->_conf
->bluestore_compression_mode
4649 << "' for bluestore_compression_mode, reverting to 'none'"
4651 comp_mode
= Compressor::COMP_NONE
;
4652 string
s("unknown mode: ");
4653 s
+= cct
->_conf
->bluestore_compression_mode
;
4654 _set_compression_alert(true, s
.c_str());
4657 compressor
= nullptr;
4659 if (cct
->_conf
->bluestore_compression_min_blob_size
) {
4660 comp_min_blob_size
= cct
->_conf
->bluestore_compression_min_blob_size
;
4663 if (_use_rotational_settings()) {
4664 comp_min_blob_size
= cct
->_conf
->bluestore_compression_min_blob_size_hdd
;
4666 comp_min_blob_size
= cct
->_conf
->bluestore_compression_min_blob_size_ssd
;
4670 if (cct
->_conf
->bluestore_compression_max_blob_size
) {
4671 comp_max_blob_size
= cct
->_conf
->bluestore_compression_max_blob_size
;
4674 if (_use_rotational_settings()) {
4675 comp_max_blob_size
= cct
->_conf
->bluestore_compression_max_blob_size_hdd
;
4677 comp_max_blob_size
= cct
->_conf
->bluestore_compression_max_blob_size_ssd
;
4681 auto& alg_name
= cct
->_conf
->bluestore_compression_algorithm
;
4682 if (!alg_name
.empty()) {
4683 compressor
= Compressor::create(cct
, alg_name
);
4685 derr
<< __func__
<< " unable to initialize " << alg_name
.c_str() << " compressor"
4687 _set_compression_alert(false, alg_name
.c_str());
4691 dout(10) << __func__
<< " mode " << Compressor::get_comp_mode_name(comp_mode
)
4692 << " alg " << (compressor
? compressor
->get_type_name() : "(none)")
4693 << " min_blob " << comp_min_blob_size
4694 << " max_blob " << comp_max_blob_size
4698 void BlueStore::_set_csum()
4700 csum_type
= Checksummer::CSUM_NONE
;
4701 int t
= Checksummer::get_csum_string_type(cct
->_conf
->bluestore_csum_type
);
4702 if (t
> Checksummer::CSUM_NONE
)
4705 dout(10) << __func__
<< " csum_type "
4706 << Checksummer::get_csum_type_string(csum_type
)
4710 void BlueStore::_set_throttle_params()
4712 if (cct
->_conf
->bluestore_throttle_cost_per_io
) {
4713 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io
;
4716 if (_use_rotational_settings()) {
4717 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io_hdd
;
4719 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io_ssd
;
4723 dout(10) << __func__
<< " throttle_cost_per_io " << throttle_cost_per_io
4726 void BlueStore::_set_blob_size()
4728 if (cct
->_conf
->bluestore_max_blob_size
) {
4729 max_blob_size
= cct
->_conf
->bluestore_max_blob_size
;
4732 if (_use_rotational_settings()) {
4733 max_blob_size
= cct
->_conf
->bluestore_max_blob_size_hdd
;
4735 max_blob_size
= cct
->_conf
->bluestore_max_blob_size_ssd
;
4738 dout(10) << __func__
<< " max_blob_size 0x" << std::hex
<< max_blob_size
4739 << std::dec
<< dendl
;
4742 void BlueStore::_update_osd_memory_options()
4744 osd_memory_target
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_target");
4745 osd_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
4746 osd_memory_expected_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
4747 osd_memory_cache_min
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_cache_min");
4749 dout(10) << __func__
4750 << " osd_memory_target " << osd_memory_target
4751 << " osd_memory_base " << osd_memory_base
4752 << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
4753 << " osd_memory_cache_min " << osd_memory_cache_min
4757 int BlueStore::_set_cache_sizes()
4760 cache_autotune
= cct
->_conf
.get_val
<bool>("bluestore_cache_autotune");
4761 cache_autotune_interval
=
4762 cct
->_conf
.get_val
<double>("bluestore_cache_autotune_interval");
4763 osd_memory_target
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_target");
4764 osd_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
4765 osd_memory_expected_fragmentation
=
4766 cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
4767 osd_memory_cache_min
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_cache_min");
4768 osd_memory_cache_resize_interval
=
4769 cct
->_conf
.get_val
<double>("osd_memory_cache_resize_interval");
4771 if (cct
->_conf
->bluestore_cache_size
) {
4772 cache_size
= cct
->_conf
->bluestore_cache_size
;
4774 // choose global cache size based on backend type
4775 if (_use_rotational_settings()) {
4776 cache_size
= cct
->_conf
->bluestore_cache_size_hdd
;
4778 cache_size
= cct
->_conf
->bluestore_cache_size_ssd
;
4782 cache_meta_ratio
= cct
->_conf
.get_val
<double>("bluestore_cache_meta_ratio");
4783 if (cache_meta_ratio
< 0 || cache_meta_ratio
> 1.0) {
4784 derr
<< __func__
<< " bluestore_cache_meta_ratio (" << cache_meta_ratio
4785 << ") must be in range [0,1.0]" << dendl
;
4789 cache_kv_ratio
= cct
->_conf
.get_val
<double>("bluestore_cache_kv_ratio");
4790 if (cache_kv_ratio
< 0 || cache_kv_ratio
> 1.0) {
4791 derr
<< __func__
<< " bluestore_cache_kv_ratio (" << cache_kv_ratio
4792 << ") must be in range [0,1.0]" << dendl
;
4796 cache_kv_onode_ratio
= cct
->_conf
.get_val
<double>("bluestore_cache_kv_onode_ratio");
4797 if (cache_kv_onode_ratio
< 0 || cache_kv_onode_ratio
> 1.0) {
4798 derr
<< __func__
<< " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
4799 << ") must be in range [0,1.0]" << dendl
;
4803 if (cache_meta_ratio
+ cache_kv_ratio
> 1.0) {
4804 derr
<< __func__
<< " bluestore_cache_meta_ratio (" << cache_meta_ratio
4805 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
4806 << ") = " << cache_meta_ratio
+ cache_kv_ratio
<< "; must be <= 1.0"
4811 cache_data_ratio
= (double)1.0 -
4812 (double)cache_meta_ratio
-
4813 (double)cache_kv_ratio
-
4814 (double)cache_kv_onode_ratio
;
4815 if (cache_data_ratio
< 0) {
4816 // deal with floating point imprecision
4817 cache_data_ratio
= 0;
4820 dout(1) << __func__
<< " cache_size " << cache_size
4821 << " meta " << cache_meta_ratio
4822 << " kv " << cache_kv_ratio
4823 << " data " << cache_data_ratio
4828 int BlueStore::write_meta(const std::string
& key
, const std::string
& value
)
4830 bluestore_bdev_label_t label
;
4831 string p
= path
+ "/block";
4832 int r
= _read_bdev_label(cct
, p
, &label
);
4834 return ObjectStore::write_meta(key
, value
);
4836 label
.meta
[key
] = value
;
4837 r
= _write_bdev_label(cct
, p
, label
);
4838 ceph_assert(r
== 0);
4839 return ObjectStore::write_meta(key
, value
);
4842 int BlueStore::read_meta(const std::string
& key
, std::string
*value
)
4844 bluestore_bdev_label_t label
;
4845 string p
= path
+ "/block";
4846 int r
= _read_bdev_label(cct
, p
, &label
);
4848 return ObjectStore::read_meta(key
, value
);
4850 auto i
= label
.meta
.find(key
);
4851 if (i
== label
.meta
.end()) {
4852 return ObjectStore::read_meta(key
, value
);
4858 void BlueStore::_init_logger()
4860 PerfCountersBuilder
b(cct
, "bluestore",
4861 l_bluestore_first
, l_bluestore_last
);
4862 b
.add_time_avg(l_bluestore_kv_flush_lat
, "kv_flush_lat",
4863 "Average kv_thread flush latency",
4864 "fl_l", PerfCountersBuilder::PRIO_INTERESTING
);
4865 b
.add_time_avg(l_bluestore_kv_commit_lat
, "kv_commit_lat",
4866 "Average kv_thread commit latency");
4867 b
.add_time_avg(l_bluestore_kv_sync_lat
, "kv_sync_lat",
4868 "Average kv_sync thread latency",
4869 "ks_l", PerfCountersBuilder::PRIO_INTERESTING
);
4870 b
.add_time_avg(l_bluestore_kv_final_lat
, "kv_final_lat",
4871 "Average kv_finalize thread latency",
4872 "kf_l", PerfCountersBuilder::PRIO_INTERESTING
);
4873 b
.add_time_avg(l_bluestore_state_prepare_lat
, "state_prepare_lat",
4874 "Average prepare state latency");
4875 b
.add_time_avg(l_bluestore_state_aio_wait_lat
, "state_aio_wait_lat",
4876 "Average aio_wait state latency",
4877 "io_l", PerfCountersBuilder::PRIO_INTERESTING
);
4878 b
.add_time_avg(l_bluestore_state_io_done_lat
, "state_io_done_lat",
4879 "Average io_done state latency");
4880 b
.add_time_avg(l_bluestore_state_kv_queued_lat
, "state_kv_queued_lat",
4881 "Average kv_queued state latency");
4882 b
.add_time_avg(l_bluestore_state_kv_committing_lat
, "state_kv_commiting_lat",
4883 "Average kv_commiting state latency");
4884 b
.add_time_avg(l_bluestore_state_kv_done_lat
, "state_kv_done_lat",
4885 "Average kv_done state latency");
4886 b
.add_time_avg(l_bluestore_state_deferred_queued_lat
, "state_deferred_queued_lat",
4887 "Average deferred_queued state latency");
4888 b
.add_time_avg(l_bluestore_state_deferred_aio_wait_lat
, "state_deferred_aio_wait_lat",
4889 "Average aio_wait state latency");
4890 b
.add_time_avg(l_bluestore_state_deferred_cleanup_lat
, "state_deferred_cleanup_lat",
4891 "Average cleanup state latency");
4892 b
.add_time_avg(l_bluestore_state_finishing_lat
, "state_finishing_lat",
4893 "Average finishing state latency");
4894 b
.add_time_avg(l_bluestore_state_done_lat
, "state_done_lat",
4895 "Average done state latency");
4896 b
.add_time_avg(l_bluestore_throttle_lat
, "throttle_lat",
4897 "Average submit throttle latency",
4898 "th_l", PerfCountersBuilder::PRIO_CRITICAL
);
4899 b
.add_time_avg(l_bluestore_submit_lat
, "submit_lat",
4900 "Average submit latency",
4901 "s_l", PerfCountersBuilder::PRIO_CRITICAL
);
4902 b
.add_time_avg(l_bluestore_commit_lat
, "commit_lat",
4903 "Average commit latency",
4904 "c_l", PerfCountersBuilder::PRIO_CRITICAL
);
4905 b
.add_time_avg(l_bluestore_read_lat
, "read_lat",
4906 "Average read latency",
4907 "r_l", PerfCountersBuilder::PRIO_CRITICAL
);
4908 b
.add_time_avg(l_bluestore_read_onode_meta_lat
, "read_onode_meta_lat",
4909 "Average read onode metadata latency");
4910 b
.add_time_avg(l_bluestore_read_wait_aio_lat
, "read_wait_aio_lat",
4911 "Average read latency");
4912 b
.add_time_avg(l_bluestore_compress_lat
, "compress_lat",
4913 "Average compress latency");
4914 b
.add_time_avg(l_bluestore_decompress_lat
, "decompress_lat",
4915 "Average decompress latency");
4916 b
.add_time_avg(l_bluestore_csum_lat
, "csum_lat",
4917 "Average checksum latency");
4918 b
.add_u64_counter(l_bluestore_compress_success_count
, "compress_success_count",
4919 "Sum for beneficial compress ops");
4920 b
.add_u64_counter(l_bluestore_compress_rejected_count
, "compress_rejected_count",
4921 "Sum for compress ops rejected due to low net gain of space");
4922 b
.add_u64_counter(l_bluestore_write_pad_bytes
, "write_pad_bytes",
4923 "Sum for write-op padded bytes", NULL
, 0, unit_t(UNIT_BYTES
));
4924 b
.add_u64_counter(l_bluestore_deferred_write_ops
, "deferred_write_ops",
4925 "Sum for deferred write op");
4926 b
.add_u64_counter(l_bluestore_deferred_write_bytes
, "deferred_write_bytes",
4927 "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES
));
4928 b
.add_u64_counter(l_bluestore_write_penalty_read_ops
, "write_penalty_read_ops",
4929 "Sum for write penalty read ops");
4930 b
.add_u64(l_bluestore_allocated
, "bluestore_allocated",
4931 "Sum for allocated bytes");
4932 b
.add_u64(l_bluestore_stored
, "bluestore_stored",
4933 "Sum for stored bytes");
4934 b
.add_u64(l_bluestore_compressed
, "bluestore_compressed",
4935 "Sum for stored compressed bytes",
4936 "c", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
4937 b
.add_u64(l_bluestore_compressed_allocated
, "bluestore_compressed_allocated",
4938 "Sum for bytes allocated for compressed data",
4939 "c_a", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
4940 b
.add_u64(l_bluestore_compressed_original
, "bluestore_compressed_original",
4941 "Sum for original bytes that were compressed",
4942 "c_o", PerfCountersBuilder::PRIO_USEFUL
, unit_t(UNIT_BYTES
));
4943 b
.add_u64(l_bluestore_onodes
, "bluestore_onodes",
4944 "Number of onodes in cache");
4945 b
.add_u64(l_bluestore_pinned_onodes
, "bluestore_pinned_onodes",
4946 "Number of pinned onodes in cache");
4947 b
.add_u64_counter(l_bluestore_onode_hits
, "bluestore_onode_hits",
4948 "Sum for onode-lookups hit in the cache");
4949 b
.add_u64_counter(l_bluestore_onode_misses
, "bluestore_onode_misses",
4950 "Sum for onode-lookups missed in the cache");
4951 b
.add_u64_counter(l_bluestore_onode_shard_hits
, "bluestore_onode_shard_hits",
4952 "Sum for onode-shard lookups hit in the cache");
4953 b
.add_u64_counter(l_bluestore_onode_shard_misses
,
4954 "bluestore_onode_shard_misses",
4955 "Sum for onode-shard lookups missed in the cache");
4956 b
.add_u64(l_bluestore_extents
, "bluestore_extents",
4957 "Number of extents in cache");
4958 b
.add_u64(l_bluestore_blobs
, "bluestore_blobs",
4959 "Number of blobs in cache");
4960 b
.add_u64(l_bluestore_buffers
, "bluestore_buffers",
4961 "Number of buffers in cache");
4962 b
.add_u64(l_bluestore_buffer_bytes
, "bluestore_buffer_bytes",
4963 "Number of buffer bytes in cache", NULL
, 0, unit_t(UNIT_BYTES
));
4964 b
.add_u64_counter(l_bluestore_buffer_hit_bytes
, "bluestore_buffer_hit_bytes",
4965 "Sum for bytes of read hit in the cache", NULL
, 0, unit_t(UNIT_BYTES
));
4966 b
.add_u64_counter(l_bluestore_buffer_miss_bytes
, "bluestore_buffer_miss_bytes",
4967 "Sum for bytes of read missed in the cache", NULL
, 0, unit_t(UNIT_BYTES
));
4969 b
.add_u64_counter(l_bluestore_write_big
, "bluestore_write_big",
4970 "Large aligned writes into fresh blobs");
4971 b
.add_u64_counter(l_bluestore_write_big_bytes
, "bluestore_write_big_bytes",
4972 "Large aligned writes into fresh blobs (bytes)", NULL
, 0, unit_t(UNIT_BYTES
));
4973 b
.add_u64_counter(l_bluestore_write_big_blobs
, "bluestore_write_big_blobs",
4974 "Large aligned writes into fresh blobs (blobs)");
4975 b
.add_u64_counter(l_bluestore_write_big_deferred
,
4976 "bluestore_write_big_deferred",
4977 "Big overwrites using deferred");
4978 b
.add_u64_counter(l_bluestore_write_small
, "bluestore_write_small",
4979 "Small writes into existing or sparse small blobs");
4980 b
.add_u64_counter(l_bluestore_write_small_bytes
, "bluestore_write_small_bytes",
4981 "Small writes into existing or sparse small blobs (bytes)", NULL
, 0, unit_t(UNIT_BYTES
));
4982 b
.add_u64_counter(l_bluestore_write_small_unused
,
4983 "bluestore_write_small_unused",
4984 "Small writes into unused portion of existing blob");
4985 b
.add_u64_counter(l_bluestore_write_deferred
,
4986 "bluestore_write_deferred",
4987 "Total deferred writes submitted");
4988 b
.add_u64_counter(l_bluestore_write_deferred_bytes
,
4989 "bluestore_write_deferred_bytes",
4990 "Total bytes submitted as deferred writes");
4991 b
.add_u64_counter(l_bluestore_write_small_pre_read
,
4992 "bluestore_write_small_pre_read",
4993 "Small writes that required we read some data (possibly "
4994 "cached) to fill out the block");
4995 b
.add_u64_counter(l_bluestore_write_new
, "bluestore_write_new",
4996 "Write into new blob");
4998 b
.add_u64_counter(l_bluestore_txc
, "bluestore_txc", "Transactions committed");
4999 b
.add_u64_counter(l_bluestore_onode_reshard
, "bluestore_onode_reshard",
5000 "Onode extent map reshard events");
5001 b
.add_u64_counter(l_bluestore_blob_split
, "bluestore_blob_split",
5002 "Sum for blob splitting due to resharding");
5003 b
.add_u64_counter(l_bluestore_extent_compress
, "bluestore_extent_compress",
5004 "Sum for extents that have been removed due to compression");
5005 b
.add_u64_counter(l_bluestore_gc_merged
, "bluestore_gc_merged",
5006 "Sum for extents that have been merged due to garbage "
5008 b
.add_u64_counter(l_bluestore_read_eio
, "bluestore_read_eio",
5009 "Read EIO errors propagated to high level callers");
5010 b
.add_u64_counter(l_bluestore_reads_with_retries
, "bluestore_reads_with_retries",
5011 "Read operations that required at least one retry due to failed checksum validation");
5012 b
.add_u64(l_bluestore_fragmentation
, "bluestore_fragmentation_micros",
5013 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
5014 b
.add_time_avg(l_bluestore_omap_seek_to_first_lat
, "omap_seek_to_first_lat",
5015 "Average omap iterator seek_to_first call latency");
5016 b
.add_time_avg(l_bluestore_omap_upper_bound_lat
, "omap_upper_bound_lat",
5017 "Average omap iterator upper_bound call latency");
5018 b
.add_time_avg(l_bluestore_omap_lower_bound_lat
, "omap_lower_bound_lat",
5019 "Average omap iterator lower_bound call latency");
5020 b
.add_time_avg(l_bluestore_omap_next_lat
, "omap_next_lat",
5021 "Average omap iterator next call latency");
5022 b
.add_time_avg(l_bluestore_omap_get_keys_lat
, "omap_get_keys_lat",
5023 "Average omap get_keys call latency");
5024 b
.add_time_avg(l_bluestore_omap_get_values_lat
, "omap_get_values_lat",
5025 "Average omap get_values call latency");
5026 b
.add_time_avg(l_bluestore_clist_lat
, "clist_lat",
5027 "Average collection listing latency");
5028 b
.add_time_avg(l_bluestore_remove_lat
, "remove_lat",
5029 "Average removal latency");
5031 logger
= b
.create_perf_counters();
5032 cct
->get_perfcounters_collection()->add(logger
);
5035 int BlueStore::_reload_logger()
5037 struct store_statfs_t store_statfs
;
5038 int r
= statfs(&store_statfs
);
5040 logger
->set(l_bluestore_allocated
, store_statfs
.allocated
);
5041 logger
->set(l_bluestore_stored
, store_statfs
.data_stored
);
5042 logger
->set(l_bluestore_compressed
, store_statfs
.data_compressed
);
5043 logger
->set(l_bluestore_compressed_allocated
, store_statfs
.data_compressed_allocated
);
5044 logger
->set(l_bluestore_compressed_original
, store_statfs
.data_compressed_original
);
5049 void BlueStore::_shutdown_logger()
5051 cct
->get_perfcounters_collection()->remove(logger
);
5055 int BlueStore::get_block_device_fsid(CephContext
* cct
, const string
& path
,
5058 bluestore_bdev_label_t label
;
5059 int r
= _read_bdev_label(cct
, path
, &label
);
5062 *fsid
= label
.osd_uuid
;
5066 int BlueStore::_open_path()
5069 ceph_assert(path_fd
< 0);
5070 path_fd
= TEMP_FAILURE_RETRY(::open(path
.c_str(), O_DIRECTORY
|O_CLOEXEC
));
5073 derr
<< __func__
<< " unable to open " << path
<< ": " << cpp_strerror(r
)
5080 void BlueStore::_close_path()
5082 VOID_TEMP_FAILURE_RETRY(::close(path_fd
));
5086 int BlueStore::_write_bdev_label(CephContext
*cct
,
5087 string path
, bluestore_bdev_label_t label
)
5089 dout(10) << __func__
<< " path " << path
<< " label " << label
<< dendl
;
5092 uint32_t crc
= bl
.crc32c(-1);
5094 ceph_assert(bl
.length() <= BDEV_LABEL_BLOCK_SIZE
);
5095 bufferptr
z(BDEV_LABEL_BLOCK_SIZE
- bl
.length());
5097 bl
.append(std::move(z
));
5099 int fd
= TEMP_FAILURE_RETRY(::open(path
.c_str(), O_WRONLY
|O_CLOEXEC
));
5102 derr
<< __func__
<< " failed to open " << path
<< ": " << cpp_strerror(fd
)
5106 int r
= bl
.write_fd(fd
);
5108 derr
<< __func__
<< " failed to write to " << path
5109 << ": " << cpp_strerror(r
) << dendl
;
5114 derr
<< __func__
<< " failed to fsync " << path
5115 << ": " << cpp_strerror(r
) << dendl
;
5118 VOID_TEMP_FAILURE_RETRY(::close(fd
));
5122 int BlueStore::_read_bdev_label(CephContext
* cct
, string path
,
5123 bluestore_bdev_label_t
*label
)
5125 dout(10) << __func__
<< dendl
;
5126 int fd
= TEMP_FAILURE_RETRY(::open(path
.c_str(), O_RDONLY
|O_CLOEXEC
));
5129 derr
<< __func__
<< " failed to open " << path
<< ": " << cpp_strerror(fd
)
5134 int r
= bl
.read_fd(fd
, BDEV_LABEL_BLOCK_SIZE
);
5135 VOID_TEMP_FAILURE_RETRY(::close(fd
));
5137 derr
<< __func__
<< " failed to read from " << path
5138 << ": " << cpp_strerror(r
) << dendl
;
5142 uint32_t crc
, expected_crc
;
5143 auto p
= bl
.cbegin();
5147 t
.substr_of(bl
, 0, p
.get_off());
5149 decode(expected_crc
, p
);
5151 catch (ceph::buffer::error
& e
) {
5152 dout(2) << __func__
<< " unable to decode label at offset " << p
.get_off()
5157 if (crc
!= expected_crc
) {
5158 derr
<< __func__
<< " bad crc on label, expected " << expected_crc
5159 << " != actual " << crc
<< dendl
;
5162 dout(10) << __func__
<< " got " << *label
<< dendl
;
5166 int BlueStore::_check_or_set_bdev_label(
5167 string path
, uint64_t size
, string desc
, bool create
)
5169 bluestore_bdev_label_t label
;
5171 label
.osd_uuid
= fsid
;
5173 label
.btime
= ceph_clock_now();
5174 label
.description
= desc
;
5175 int r
= _write_bdev_label(cct
, path
, label
);
5179 int r
= _read_bdev_label(cct
, path
, &label
);
5182 if (cct
->_conf
->bluestore_debug_permit_any_bdev_label
) {
5183 dout(20) << __func__
<< " bdev " << path
<< " fsid " << label
.osd_uuid
5184 << " and fsid " << fsid
<< " check bypassed" << dendl
;
5185 } else if (label
.osd_uuid
!= fsid
) {
5186 derr
<< __func__
<< " bdev " << path
<< " fsid " << label
.osd_uuid
5187 << " does not match our fsid " << fsid
<< dendl
;
5194 void BlueStore::_set_alloc_sizes(void)
5196 max_alloc_size
= cct
->_conf
->bluestore_max_alloc_size
;
5198 if (cct
->_conf
->bluestore_prefer_deferred_size
) {
5199 prefer_deferred_size
= cct
->_conf
->bluestore_prefer_deferred_size
;
5202 if (_use_rotational_settings()) {
5203 prefer_deferred_size
= cct
->_conf
->bluestore_prefer_deferred_size_hdd
;
5205 prefer_deferred_size
= cct
->_conf
->bluestore_prefer_deferred_size_ssd
;
5209 if (cct
->_conf
->bluestore_deferred_batch_ops
) {
5210 deferred_batch_ops
= cct
->_conf
->bluestore_deferred_batch_ops
;
5213 if (_use_rotational_settings()) {
5214 deferred_batch_ops
= cct
->_conf
->bluestore_deferred_batch_ops_hdd
;
5216 deferred_batch_ops
= cct
->_conf
->bluestore_deferred_batch_ops_ssd
;
5220 dout(10) << __func__
<< " min_alloc_size 0x" << std::hex
<< min_alloc_size
5221 << std::dec
<< " order " << (int)min_alloc_size_order
5222 << " max_alloc_size 0x" << std::hex
<< max_alloc_size
5223 << " prefer_deferred_size 0x" << prefer_deferred_size
5225 << " deferred_batch_ops " << deferred_batch_ops
5229 int BlueStore::_open_bdev(bool create
)
5231 ceph_assert(bdev
== NULL
);
5232 string p
= path
+ "/block";
5233 bdev
= BlockDevice::create(cct
, p
, aio_cb
, static_cast<void*>(this), discard_cb
, static_cast<void*>(this));
5234 int r
= bdev
->open(p
);
5238 if (create
&& cct
->_conf
->bdev_enable_discard
) {
5239 bdev
->discard(0, bdev
->get_size());
5242 if (bdev
->supported_bdev_label()) {
5243 r
= _check_or_set_bdev_label(p
, bdev
->get_size(), "main", create
);
5248 // initialize global block parameters
5249 block_size
= bdev
->get_block_size();
5250 block_mask
= ~(block_size
- 1);
5251 block_size_order
= ctz(block_size
);
5252 ceph_assert(block_size
== 1u << block_size_order
);
5253 _set_max_defer_interval();
5254 // and set cache_size based on device type
5255 r
= _set_cache_sizes();
5260 if (bdev
->is_smr()) {
5261 freelist_type
= "zoned";
5273 void BlueStore::_validate_bdev()
5276 uint64_t dev_size
= bdev
->get_size();
5277 ceph_assert(dev_size
> _get_ondisk_reserved());
5280 void BlueStore::_close_bdev()
5288 int BlueStore::_open_fm(KeyValueDB::Transaction t
, bool read_only
)
5292 ceph_assert(fm
== NULL
);
5293 fm
= FreelistManager::create(cct
, freelist_type
, PREFIX_ALLOC
);
5296 // create mode. initialize freespace
5297 dout(20) << __func__
<< " initializing freespace" << dendl
;
5300 bl
.append(freelist_type
);
5301 t
->set(PREFIX_SUPER
, "freelist_type", bl
);
5303 // being able to allocate in units less than bdev block size
5304 // seems to be a bad idea.
5305 ceph_assert( cct
->_conf
->bdev_block_size
<= (int64_t)min_alloc_size
);
5307 uint64_t alloc_size
= min_alloc_size
;
5308 if (bdev
->is_smr()) {
5309 alloc_size
= _zoned_piggyback_device_parameters_onto(alloc_size
);
5312 fm
->create(bdev
->get_size(), alloc_size
, t
);
5314 // allocate superblock reserved space. note that we do not mark
5315 // bluefs space as allocated in the freelist; we instead rely on
5316 // bluefs doing that itself.
5317 auto reserved
= _get_ondisk_reserved();
5318 fm
->allocate(0, reserved
, t
);
5320 if (cct
->_conf
->bluestore_debug_prefill
> 0) {
5321 uint64_t end
= bdev
->get_size() - reserved
;
5322 dout(1) << __func__
<< " pre-fragmenting freespace, using "
5323 << cct
->_conf
->bluestore_debug_prefill
<< " with max free extent "
5324 << cct
->_conf
->bluestore_debug_prefragment_max
<< dendl
;
5325 uint64_t start
= p2roundup(reserved
, min_alloc_size
);
5326 uint64_t max_b
= cct
->_conf
->bluestore_debug_prefragment_max
/ min_alloc_size
;
5327 float r
= cct
->_conf
->bluestore_debug_prefill
;
5331 while (!stop
&& start
< end
) {
5332 uint64_t l
= (rand() % max_b
+ 1) * min_alloc_size
;
5333 if (start
+ l
> end
) {
5335 l
= p2align(l
, min_alloc_size
);
5337 ceph_assert(start
+ l
<= end
);
5339 uint64_t u
= 1 + (uint64_t)(r
* (double)l
);
5340 u
= p2roundup(u
, min_alloc_size
);
5341 if (start
+ l
+ u
> end
) {
5342 u
= end
- (start
+ l
);
5343 // trim to align so we don't overflow again
5344 u
= p2align(u
, min_alloc_size
);
5347 ceph_assert(start
+ l
+ u
<= end
);
5349 dout(20) << __func__
<< " free 0x" << std::hex
<< start
<< "~" << l
5350 << " use 0x" << u
<< std::dec
<< dendl
;
5353 // break if u has been trimmed to nothing
5357 fm
->allocate(start
+ l
, u
, t
);
5361 r
= _write_out_fm_meta(0);
5362 ceph_assert(r
== 0);
5364 r
= fm
->init(db
, read_only
,
5365 [&](const std::string
& key
, std::string
* result
) {
5366 return read_meta(key
, result
);
5369 derr
<< __func__
<< " freelist init failed: " << cpp_strerror(r
) << dendl
;
5375 // if space size tracked by free list manager is that higher than actual
5376 // dev size one can hit out-of-space allocation which will result
5377 // in data loss and/or assertions
5378 // Probably user altered the device size somehow.
5379 // The only fix for now is to redeploy OSD.
5380 if (fm
->get_size() >= bdev
->get_size() + min_alloc_size
) {
5382 ss
<< "slow device size mismatch detected, "
5383 << " fm size(" << fm
->get_size()
5384 << ") > slow device size(" << bdev
->get_size()
5385 << "), Please stop using this OSD as it might cause data loss.";
5386 _set_disk_size_mismatch_alert(ss
.str());
5391 void BlueStore::_close_fm()
5393 dout(10) << __func__
<< dendl
;
5400 int BlueStore::_write_out_fm_meta(uint64_t target_size
)
5403 string p
= path
+ "/block";
5405 std::vector
<std::pair
<string
, string
>> fm_meta
;
5406 fm
->get_meta(target_size
, &fm_meta
);
5408 for (auto& m
: fm_meta
) {
5409 r
= write_meta(m
.first
, m
.second
);
5410 ceph_assert(r
== 0);
5415 int BlueStore::_create_alloc()
5417 ceph_assert(shared_alloc
.a
== NULL
);
5418 ceph_assert(bdev
->get_size());
5420 uint64_t alloc_size
= min_alloc_size
;
5421 if (bdev
->is_smr()) {
5422 int r
= _zoned_check_config_settings();
5425 alloc_size
= _zoned_piggyback_device_parameters_onto(alloc_size
);
5428 shared_alloc
.set(Allocator::create(cct
, cct
->_conf
->bluestore_allocator
,
5430 alloc_size
, "block"));
5432 if (!shared_alloc
.a
) {
5433 lderr(cct
) << __func__
<< "Failed to create allocator:: "
5434 << cct
->_conf
->bluestore_allocator
5441 int BlueStore::_init_alloc()
5443 int r
= _create_alloc();
5447 ceph_assert(shared_alloc
.a
!= NULL
);
5449 if (bdev
->is_smr()) {
5450 shared_alloc
.a
->zoned_set_zone_states(fm
->get_zone_states(db
));
5453 uint64_t num
= 0, bytes
= 0;
5455 dout(1) << __func__
<< " opening allocation metadata" << dendl
;
5456 // initialize from freelist
5457 fm
->enumerate_reset();
5458 uint64_t offset
, length
;
5459 while (fm
->enumerate_next(db
, &offset
, &length
)) {
5460 shared_alloc
.a
->init_add_free(offset
, length
);
5464 fm
->enumerate_reset();
5467 << " loaded " << byte_u_t(bytes
) << " in " << num
<< " extents"
5469 << ", allocator type " << shared_alloc
.a
->get_type()
5470 << ", capacity 0x" << shared_alloc
.a
->get_capacity()
5471 << ", block size 0x" << shared_alloc
.a
->get_block_size()
5472 << ", free 0x" << shared_alloc
.a
->get_free()
5473 << ", fragmentation " << shared_alloc
.a
->get_fragmentation()
5474 << std::dec
<< dendl
;
5479 void BlueStore::_close_alloc()
5482 bdev
->discard_drain();
5484 ceph_assert(shared_alloc
.a
);
5485 shared_alloc
.a
->shutdown();
5486 delete shared_alloc
.a
;
5487 shared_alloc
.reset();
5490 int BlueStore::_open_fsid(bool create
)
5492 ceph_assert(fsid_fd
< 0);
5493 int flags
= O_RDWR
|O_CLOEXEC
;
5496 fsid_fd
= ::openat(path_fd
, "fsid", flags
, 0644);
5499 derr
<< __func__
<< " " << cpp_strerror(err
) << dendl
;
5505 int BlueStore::_read_fsid(uuid_d
*uuid
)
5508 memset(fsid_str
, 0, sizeof(fsid_str
));
5509 int ret
= safe_read(fsid_fd
, fsid_str
, sizeof(fsid_str
));
5511 derr
<< __func__
<< " failed: " << cpp_strerror(ret
) << dendl
;
5518 if (!uuid
->parse(fsid_str
)) {
5519 derr
<< __func__
<< " unparsable uuid " << fsid_str
<< dendl
;
5525 int BlueStore::_write_fsid()
5527 int r
= ::ftruncate(fsid_fd
, 0);
5530 derr
<< __func__
<< " fsid truncate failed: " << cpp_strerror(r
) << dendl
;
5533 string str
= stringify(fsid
) + "\n";
5534 r
= safe_write(fsid_fd
, str
.c_str(), str
.length());
5536 derr
<< __func__
<< " fsid write failed: " << cpp_strerror(r
) << dendl
;
5539 r
= ::fsync(fsid_fd
);
5542 derr
<< __func__
<< " fsid fsync failed: " << cpp_strerror(r
) << dendl
;
5548 void BlueStore::_close_fsid()
5550 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd
));
5554 int BlueStore::_lock_fsid()
5557 memset(&l
, 0, sizeof(l
));
5559 l
.l_whence
= SEEK_SET
;
5560 int r
= ::fcntl(fsid_fd
, F_SETLK
, &l
);
5563 derr
<< __func__
<< " failed to lock " << path
<< "/fsid"
5564 << " (is another ceph-osd still running?)"
5565 << cpp_strerror(err
) << dendl
;
5571 bool BlueStore::is_rotational()
5574 return bdev
->is_rotational();
5577 bool rotational
= true;
5578 int r
= _open_path();
5581 r
= _open_fsid(false);
5584 r
= _read_fsid(&fsid
);
5590 r
= _open_bdev(false);
5593 rotational
= bdev
->is_rotational();
5603 bool BlueStore::is_journal_rotational()
5606 dout(5) << __func__
<< " bluefs disabled, default to store media type"
5608 return is_rotational();
5610 dout(10) << __func__
<< " " << (int)bluefs
->wal_is_rotational() << dendl
;
5611 return bluefs
->wal_is_rotational();
5614 bool BlueStore::_use_rotational_settings()
5616 if (cct
->_conf
->bluestore_debug_enforce_settings
== "hdd") {
5619 if (cct
->_conf
->bluestore_debug_enforce_settings
== "ssd") {
5622 return bdev
->is_rotational();
5625 bool BlueStore::test_mount_in_use()
5627 // most error conditions mean the mount is not in use (e.g., because
5628 // it doesn't exist). only if we fail to lock do we conclude it is
5631 int r
= _open_path();
5634 r
= _open_fsid(false);
5639 ret
= true; // if we can't lock, it is in use
5646 int BlueStore::_minimal_open_bluefs(bool create
)
5649 bluefs
= new BlueFS(cct
);
5654 bfn
= path
+ "/block.db";
5655 if (::stat(bfn
.c_str(), &st
) == 0) {
5656 r
= bluefs
->add_block_device(
5657 BlueFS::BDEV_DB
, bfn
,
5658 create
&& cct
->_conf
->bdev_enable_discard
,
5661 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
5662 << cpp_strerror(r
) << dendl
;
5666 if (bluefs
->bdev_support_label(BlueFS::BDEV_DB
)) {
5667 r
= _check_or_set_bdev_label(
5669 bluefs
->get_block_device_size(BlueFS::BDEV_DB
),
5670 "bluefs db", create
);
5673 << " check block device(" << bfn
<< ") label returned: "
5674 << cpp_strerror(r
) << dendl
;
5678 bluefs_layout
.shared_bdev
= BlueFS::BDEV_SLOW
;
5679 bluefs_layout
.dedicated_db
= true;
5682 if (::lstat(bfn
.c_str(), &st
) == -1) {
5684 bluefs_layout
.shared_bdev
= BlueFS::BDEV_DB
;
5686 derr
<< __func__
<< " " << bfn
<< " symlink exists but target unusable: "
5687 << cpp_strerror(r
) << dendl
;
5693 bfn
= path
+ "/block";
5695 r
= bluefs
->add_block_device(bluefs_layout
.shared_bdev
, bfn
, false,
5696 0, // no need to provide valid 'reserved' for shared dev
5699 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
5700 << cpp_strerror(r
) << dendl
;
5704 bfn
= path
+ "/block.wal";
5705 if (::stat(bfn
.c_str(), &st
) == 0) {
5706 r
= bluefs
->add_block_device(BlueFS::BDEV_WAL
, bfn
,
5707 create
&& cct
->_conf
->bdev_enable_discard
,
5708 BDEV_LABEL_BLOCK_SIZE
);
5710 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
5711 << cpp_strerror(r
) << dendl
;
5715 if (bluefs
->bdev_support_label(BlueFS::BDEV_WAL
)) {
5716 r
= _check_or_set_bdev_label(
5718 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
),
5719 "bluefs wal", create
);
5721 derr
<< __func__
<< " check block device(" << bfn
5722 << ") label returned: " << cpp_strerror(r
) << dendl
;
5727 bluefs_layout
.dedicated_wal
= true;
5730 if (::lstat(bfn
.c_str(), &st
) != -1) {
5732 derr
<< __func__
<< " " << bfn
<< " symlink exists but target unusable: "
5733 << cpp_strerror(r
) << dendl
;
5740 ceph_assert(bluefs
);
5746 int BlueStore::_open_bluefs(bool create
, bool read_only
)
5748 int r
= _minimal_open_bluefs(create
);
5752 BlueFSVolumeSelector
* vselector
= nullptr;
5753 if (bluefs_layout
.shared_bdev
== BlueFS::BDEV_SLOW
) {
5755 string options
= cct
->_conf
->bluestore_rocksdb_options
;
5756 string options_annex
= cct
->_conf
->bluestore_rocksdb_options_annex
;
5757 if (!options_annex
.empty()) {
5758 if (!options
.empty() &&
5759 *options
.rbegin() != ',') {
5762 options
+= options_annex
;
5765 rocksdb::Options rocks_opts
;
5766 r
= RocksDBStore::ParseOptionsFromStringStatic(
5774 if (cct
->_conf
->bluestore_volume_selection_policy
== "fit_to_fast") {
5775 vselector
= new FitToFastVolumeSelector(
5776 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
5777 bluefs
->get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
5778 bluefs
->get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100);
5780 double reserved_factor
= cct
->_conf
->bluestore_volume_selection_reserved_factor
;
5782 new RocksDBBlueFSVolumeSelector(
5783 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
) * 95 / 100,
5784 bluefs
->get_block_device_size(BlueFS::BDEV_DB
) * 95 / 100,
5785 bluefs
->get_block_device_size(BlueFS::BDEV_SLOW
) * 95 / 100,
5786 1024 * 1024 * 1024, //FIXME: set expected l0 size here
5787 rocks_opts
.max_bytes_for_level_base
,
5788 rocks_opts
.max_bytes_for_level_multiplier
,
5790 cct
->_conf
->bluestore_volume_selection_reserved
,
5791 cct
->_conf
->bluestore_volume_selection_policy
== "use_some_extra");
5795 bluefs
->mkfs(fsid
, bluefs_layout
);
5797 bluefs
->set_volume_selector(vselector
);
5798 r
= bluefs
->mount();
5800 derr
<< __func__
<< " failed bluefs mount: " << cpp_strerror(r
) << dendl
;
5802 ceph_assert_always(bluefs
->maybe_verify_layout(bluefs_layout
) == 0);
5806 void BlueStore::_close_bluefs(bool cold_close
)
5808 bluefs
->umount(cold_close
);
5809 _minimal_close_bluefs();
5812 void BlueStore::_minimal_close_bluefs()
5818 int BlueStore::_is_bluefs(bool create
, bool* ret
)
5821 *ret
= cct
->_conf
->bluestore_bluefs
;
5824 int r
= read_meta("bluefs", &s
);
5826 derr
<< __func__
<< " unable to read 'bluefs' meta" << dendl
;
5831 } else if (s
== "0") {
5834 derr
<< __func__
<< " bluefs = " << s
<< " : not 0 or 1, aborting"
5843 * opens both DB and dependant super_meta, FreelistManager and allocator
5844 * in the proper order
5846 int BlueStore::_open_db_and_around(bool read_only
, bool to_repair
)
5848 dout(0) << __func__
<< " read-only:" << read_only
5849 << " repair:" << to_repair
<< dendl
;
5852 int r
= read_meta("type", &type
);
5854 derr
<< __func__
<< " failed to load os-type: " << cpp_strerror(r
)
5859 if (type
!= "bluestore") {
5860 derr
<< __func__
<< " expected bluestore, but type is " << type
<< dendl
;
5865 int r
= _open_path();
5868 r
= _open_fsid(false);
5872 r
= _read_fsid(&fsid
);
5880 r
= _open_bdev(false);
5884 // open in read-only first to read FM list and init allocator
5885 // as they might be needed for some BlueFS procedures
5886 r
= _open_db(false, false, true);
5890 r
= _open_super_meta();
5895 r
= _open_fm(nullptr, true);
5903 // Re-open in the proper mode(s).
5905 // Can't simply bypass second open for read-only mode as we need to
5906 // load allocated extents from bluefs into allocator.
5907 // And now it's time to do that
5911 r
= _open_db(false, to_repair
, read_only
);
5922 _close_db(read_only
);
5932 void BlueStore::_close_db_and_around(bool read_only
)
5934 _close_db(read_only
);
5942 int BlueStore::open_db_environment(KeyValueDB
**pdb
, bool to_repair
)
5945 int r
= _open_db_and_around(false, to_repair
);
5954 int BlueStore::close_db_environment()
5956 _close_db_and_around(false);
5960 int BlueStore::_prepare_db_environment(bool create
, bool read_only
,
5961 std::string
* _fn
, std::string
* _kv_backend
)
5965 std::string
& fn
=*_fn
;
5966 std::string
& kv_backend
=*_kv_backend
;
5968 std::shared_ptr
<Int64ArrayMergeOperator
> merge_op(new Int64ArrayMergeOperator
);
5971 kv_backend
= cct
->_conf
->bluestore_kvbackend
;
5973 r
= read_meta("kv_backend", &kv_backend
);
5975 derr
<< __func__
<< " unable to read 'kv_backend' meta" << dendl
;
5979 dout(10) << __func__
<< " kv_backend = " << kv_backend
<< dendl
;
5982 r
= _is_bluefs(create
, &do_bluefs
);
5986 dout(10) << __func__
<< " do_bluefs = " << do_bluefs
<< dendl
;
5988 map
<string
,string
> kv_options
;
5989 // force separate wal dir for all new deployments.
5990 kv_options
["separate_wal_dir"] = 1;
5991 rocksdb::Env
*env
= NULL
;
5993 dout(10) << __func__
<< " initializing bluefs" << dendl
;
5994 if (kv_backend
!= "rocksdb") {
5995 derr
<< " backend must be rocksdb to use bluefs" << dendl
;
5999 r
= _open_bluefs(create
, read_only
);
6004 if (cct
->_conf
->bluestore_bluefs_env_mirror
) {
6005 rocksdb::Env
* a
= new BlueRocksEnv(bluefs
);
6006 rocksdb::Env
* b
= rocksdb::Env::Default();
6008 string cmd
= "rm -rf " + path
+ "/db " +
6009 path
+ "/db.slow " +
6011 int r
= system(cmd
.c_str());
6014 env
= new rocksdb::EnvMirror(b
, a
, false, true);
6016 env
= new BlueRocksEnv(bluefs
);
6018 // simplify the dir names, too, as "seen" by rocksdb
6021 BlueFSVolumeSelector::paths paths
;
6022 bluefs
->get_vselector_paths(fn
, paths
);
6025 ostringstream db_paths
;
6027 for (auto& p
: paths
) {
6032 db_paths
<< p
.first
<< "," << p
.second
;
6035 kv_options
["db_paths"] = db_paths
.str();
6036 dout(1) << __func__
<< " set db_paths to " << db_paths
.str() << dendl
;
6040 for (auto& p
: paths
) {
6041 env
->CreateDir(p
.first
);
6043 // Selectors don't provide wal path so far hence create explicitly
6044 env
->CreateDir(fn
+ ".wal");
6046 std::vector
<std::string
> res
;
6047 // check for dir presence
6048 auto r
= env
->GetChildren(fn
+".wal", &res
);
6049 if (r
.IsNotFound()) {
6050 kv_options
.erase("separate_wal_dir");
6054 string walfn
= path
+ "/db.wal";
6057 int r
= ::mkdir(fn
.c_str(), 0755);
6060 if (r
< 0 && r
!= -EEXIST
) {
6061 derr
<< __func__
<< " failed to create " << fn
<< ": " << cpp_strerror(r
)
6067 r
= ::mkdir(walfn
.c_str(), 0755);
6070 if (r
< 0 && r
!= -EEXIST
) {
6071 derr
<< __func__
<< " failed to create " << walfn
6072 << ": " << cpp_strerror(r
)
6078 r
= ::stat(walfn
.c_str(), &st
);
6079 if (r
< 0 && errno
== ENOENT
) {
6080 kv_options
.erase("separate_wal_dir");
6086 db
= KeyValueDB::create(cct
,
6090 static_cast<void*>(env
));
6092 derr
<< __func__
<< " error creating db" << dendl
;
6094 _close_bluefs(read_only
);
6096 // delete env manually here since we can't depend on db to do this
6103 FreelistManager::setup_merge_operators(db
, freelist_type
);
6104 db
->set_merge_operator(PREFIX_STAT
, merge_op
);
6105 db
->set_cache_size(cache_kv_ratio
* cache_size
);
6109 int BlueStore::_open_db(bool create
, bool to_repair_db
, bool read_only
)
6112 ceph_assert(!(create
&& read_only
));
6114 string options_annex
;
6118 std::string sharding_def
;
6119 r
= _prepare_db_environment(create
, read_only
, &kv_dir_fn
, &kv_backend
);
6121 derr
<< __func__
<< " failed to prepare db environment: " << err
.str() << dendl
;
6124 if (kv_backend
== "rocksdb") {
6125 options
= cct
->_conf
->bluestore_rocksdb_options
;
6126 options_annex
= cct
->_conf
->bluestore_rocksdb_options_annex
;
6127 if (!options_annex
.empty()) {
6128 if (!options
.empty() &&
6129 *options
.rbegin() != ',') {
6132 options
+= options_annex
;
6135 if (cct
->_conf
.get_val
<bool>("bluestore_rocksdb_cf")) {
6136 sharding_def
= cct
->_conf
.get_val
<std::string
>("bluestore_rocksdb_cfs");
6144 r
= db
->create_and_open(err
, sharding_def
);
6146 // we pass in cf list here, but it is only used if the db already has
6147 // column families created.
6149 db
->open_read_only(err
, sharding_def
) :
6150 db
->open(err
, sharding_def
);
6153 derr
<< __func__
<< " erroring opening db: " << err
.str() << dendl
;
6154 _close_db(read_only
);
6157 dout(1) << __func__
<< " opened " << kv_backend
6158 << " path " << kv_dir_fn
<< " options " << options
<< dendl
;
6162 void BlueStore::_close_db(bool cold_close
)
6168 _close_bluefs(cold_close
);
6172 void BlueStore::_dump_alloc_on_failure()
6174 auto dump_interval
=
6175 cct
->_conf
->bluestore_bluefs_alloc_failure_dump_interval
;
6176 if (dump_interval
> 0 &&
6177 next_dump_on_bluefs_alloc_failure
<= ceph_clock_now()) {
6178 shared_alloc
.a
->dump();
6179 next_dump_on_bluefs_alloc_failure
= ceph_clock_now();
6180 next_dump_on_bluefs_alloc_failure
+= dump_interval
;
6184 int BlueStore::_open_collections()
6186 dout(10) << __func__
<< dendl
;
6187 collections_had_errors
= false;
6188 ceph_assert(coll_map
.empty());
6189 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_COLL
);
6190 for (it
->upper_bound(string());
6194 if (cid
.parse(it
->key())) {
6195 auto c
= ceph::make_ref
<Collection
>(
6197 onode_cache_shards
[cid
.hash_to_shard(onode_cache_shards
.size())],
6198 buffer_cache_shards
[cid
.hash_to_shard(buffer_cache_shards
.size())],
6200 bufferlist bl
= it
->value();
6201 auto p
= bl
.cbegin();
6203 decode(c
->cnode
, p
);
6204 } catch (ceph::buffer::error
& e
) {
6205 derr
<< __func__
<< " failed to decode cnode, key:"
6206 << pretty_binary_string(it
->key()) << dendl
;
6209 dout(20) << __func__
<< " opened " << cid
<< " " << c
6210 << " " << c
->cnode
<< dendl
;
6211 _osr_attach(c
.get());
6215 derr
<< __func__
<< " unrecognized collection " << it
->key() << dendl
;
6216 collections_had_errors
= true;
6222 void BlueStore::_fsck_collections(int64_t* errors
)
6224 if (collections_had_errors
) {
6225 dout(10) << __func__
<< dendl
;
6226 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_COLL
, KeyValueDB::ITERATOR_NOCACHE
);
6227 for (it
->upper_bound(string());
6231 if (!cid
.parse(it
->key())) {
6232 derr
<< __func__
<< " unrecognized collection " << it
->key() << dendl
;
6241 void BlueStore::_set_per_pool_omap()
6243 per_pool_omap
= OMAP_BULK
;
6245 db
->get(PREFIX_SUPER
, "per_pool_omap", &bl
);
6247 auto s
= bl
.to_str();
6248 if (s
== stringify(OMAP_PER_POOL
)) {
6249 per_pool_omap
= OMAP_PER_POOL
;
6251 ceph_assert(s
== stringify(OMAP_PER_PG
));
6252 per_pool_omap
= OMAP_PER_PG
;
6254 dout(10) << __func__
<< " per_pool_omap = " << per_pool_omap
<< dendl
;
6256 dout(10) << __func__
<< " per_pool_omap not present" << dendl
;
6258 _check_no_per_pg_or_pool_omap_alert();
6261 void BlueStore::_open_statfs()
6267 int r
= db
->get(PREFIX_STAT
, BLUESTORE_GLOBAL_STATFS_KEY
, &bl
);
6269 per_pool_stat_collection
= false;
6270 if (size_t(bl
.length()) >= sizeof(vstatfs
.values
)) {
6271 auto it
= bl
.cbegin();
6273 dout(10) << __func__
<< " store_statfs is found" << dendl
;
6275 dout(10) << __func__
<< " store_statfs is corrupt, using empty" << dendl
;
6277 _check_legacy_statfs_alert();
6279 per_pool_stat_collection
= true;
6280 dout(10) << __func__
<< " per-pool statfs is enabled" << dendl
;
6281 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_STAT
, KeyValueDB::ITERATOR_NOCACHE
);
6282 for (it
->upper_bound(string());
6287 int r
= get_key_pool_stat(it
->key(), &pool_id
);
6288 ceph_assert(r
== 0);
6292 auto p
= bl
.cbegin();
6293 auto& st
= osd_pools
[pool_id
];
6298 dout(30) << __func__
<< " pool " << pool_id
6299 << " statfs " << st
<< dendl
;
6300 } catch (ceph::buffer::error
& e
) {
6301 derr
<< __func__
<< " failed to decode pool stats, key:"
6302 << pretty_binary_string(it
->key()) << dendl
;
6306 dout(30) << __func__
<< " statfs " << vstatfs
<< dendl
;
6310 int BlueStore::_setup_block_symlink_or_file(
6316 dout(20) << __func__
<< " name " << name
<< " path " << epath
6317 << " size " << size
<< " create=" << (int)create
<< dendl
;
6319 int flags
= O_RDWR
|O_CLOEXEC
;
6322 if (epath
.length()) {
6323 r
= ::symlinkat(epath
.c_str(), path_fd
, name
.c_str());
6326 derr
<< __func__
<< " failed to create " << name
<< " symlink to "
6327 << epath
<< ": " << cpp_strerror(r
) << dendl
;
6331 if (!epath
.compare(0, strlen(SPDK_PREFIX
), SPDK_PREFIX
)) {
6332 int fd
= ::openat(path_fd
, epath
.c_str(), flags
, 0644);
6335 derr
<< __func__
<< " failed to open " << epath
<< " file: "
6336 << cpp_strerror(r
) << dendl
;
6339 // write the Transport ID of the NVMe device
6340 // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6341 // where "0000:02:00.0" is the selector of a PCI device, see
6342 // the first column of "lspci -mm -n -D"
6343 string trid
{"trtype:PCIe "};
6345 trid
+= epath
.substr(strlen(SPDK_PREFIX
));
6346 r
= ::write(fd
, trid
.c_str(), trid
.size());
6347 ceph_assert(r
== static_cast<int>(trid
.size()));
6348 dout(1) << __func__
<< " created " << name
<< " symlink to "
6350 VOID_TEMP_FAILURE_RETRY(::close(fd
));
6354 int fd
= ::openat(path_fd
, name
.c_str(), flags
, 0644);
6356 // block file is present
6358 int r
= ::fstat(fd
, &st
);
6360 S_ISREG(st
.st_mode
) && // if it is a regular file
6361 st
.st_size
== 0) { // and is 0 bytes
6362 r
= ::ftruncate(fd
, size
);
6365 derr
<< __func__
<< " failed to resize " << name
<< " file to "
6366 << size
<< ": " << cpp_strerror(r
) << dendl
;
6367 VOID_TEMP_FAILURE_RETRY(::close(fd
));
6371 if (cct
->_conf
->bluestore_block_preallocate_file
) {
6372 r
= ::ceph_posix_fallocate(fd
, 0, size
);
6374 derr
<< __func__
<< " failed to prefallocate " << name
<< " file to "
6375 << size
<< ": " << cpp_strerror(r
) << dendl
;
6376 VOID_TEMP_FAILURE_RETRY(::close(fd
));
6380 dout(1) << __func__
<< " resized " << name
<< " file to "
6381 << byte_u_t(size
) << dendl
;
6383 VOID_TEMP_FAILURE_RETRY(::close(fd
));
6387 derr
<< __func__
<< " failed to open " << name
<< " file: "
6388 << cpp_strerror(r
) << dendl
;
6396 int BlueStore::mkfs()
6398 dout(1) << __func__
<< " path " << path
<< dendl
;
6402 if (cct
->_conf
->osd_max_object_size
> OBJECT_MAX_SIZE
) {
6403 derr
<< __func__
<< " osd_max_object_size "
6404 << cct
->_conf
->osd_max_object_size
<< " > bluestore max "
6405 << OBJECT_MAX_SIZE
<< dendl
;
6411 r
= read_meta("mkfs_done", &done
);
6413 dout(1) << __func__
<< " already created" << dendl
;
6414 if (cct
->_conf
->bluestore_fsck_on_mkfs
) {
6415 r
= fsck(cct
->_conf
->bluestore_fsck_on_mkfs_deep
);
6417 derr
<< __func__
<< " fsck found fatal error: " << cpp_strerror(r
)
6422 derr
<< __func__
<< " fsck found " << r
<< " errors" << dendl
;
6426 return r
; // idempotent
6432 r
= read_meta("type", &type
);
6434 if (type
!= "bluestore") {
6435 derr
<< __func__
<< " expected bluestore, but type is " << type
<< dendl
;
6439 r
= write_meta("type", "bluestore");
6445 freelist_type
= "bitmap";
6451 r
= _open_fsid(true);
6457 goto out_close_fsid
;
6459 r
= _read_fsid(&old_fsid
);
6460 if (r
< 0 || old_fsid
.is_zero()) {
6461 if (fsid
.is_zero()) {
6462 fsid
.generate_random();
6463 dout(1) << __func__
<< " generated fsid " << fsid
<< dendl
;
6465 dout(1) << __func__
<< " using provided fsid " << fsid
<< dendl
;
6467 // we'll write it later.
6469 if (!fsid
.is_zero() && fsid
!= old_fsid
) {
6470 derr
<< __func__
<< " on-disk fsid " << old_fsid
6471 << " != provided " << fsid
<< dendl
;
6473 goto out_close_fsid
;
6478 r
= _setup_block_symlink_or_file("block", cct
->_conf
->bluestore_block_path
,
6479 cct
->_conf
->bluestore_block_size
,
6480 cct
->_conf
->bluestore_block_create
);
6482 goto out_close_fsid
;
6483 if (cct
->_conf
->bluestore_bluefs
) {
6484 r
= _setup_block_symlink_or_file("block.wal", cct
->_conf
->bluestore_block_wal_path
,
6485 cct
->_conf
->bluestore_block_wal_size
,
6486 cct
->_conf
->bluestore_block_wal_create
);
6488 goto out_close_fsid
;
6489 r
= _setup_block_symlink_or_file("block.db", cct
->_conf
->bluestore_block_db_path
,
6490 cct
->_conf
->bluestore_block_db_size
,
6491 cct
->_conf
->bluestore_block_db_create
);
6493 goto out_close_fsid
;
6496 r
= _open_bdev(true);
6498 goto out_close_fsid
;
6500 // choose min_alloc_size
6501 if (cct
->_conf
->bluestore_min_alloc_size
) {
6502 min_alloc_size
= cct
->_conf
->bluestore_min_alloc_size
;
6505 if (_use_rotational_settings()) {
6506 min_alloc_size
= cct
->_conf
->bluestore_min_alloc_size_hdd
;
6508 min_alloc_size
= cct
->_conf
->bluestore_min_alloc_size_ssd
;
6513 // make sure min_alloc_size is power of 2 aligned.
6514 if (!isp2(min_alloc_size
)) {
6515 derr
<< __func__
<< " min_alloc_size 0x"
6516 << std::hex
<< min_alloc_size
<< std::dec
6517 << " is not power of 2 aligned!"
6520 goto out_close_bdev
;
6523 r
= _create_alloc();
6525 goto out_close_bdev
;
6528 reserved
= _get_ondisk_reserved();
6529 shared_alloc
.a
->init_add_free(reserved
,
6530 p2align(bdev
->get_size(), min_alloc_size
) - reserved
);
6534 goto out_close_alloc
;
6537 KeyValueDB::Transaction t
= db
->get_transaction();
6538 r
= _open_fm(t
, true);
6543 encode((uint64_t)0, bl
);
6544 t
->set(PREFIX_SUPER
, "nid_max", bl
);
6545 t
->set(PREFIX_SUPER
, "blobid_max", bl
);
6550 encode((uint64_t)min_alloc_size
, bl
);
6551 t
->set(PREFIX_SUPER
, "min_alloc_size", bl
);
6555 bl
.append(stringify(OMAP_PER_PG
));
6556 t
->set(PREFIX_SUPER
, "per_pool_omap", bl
);
6558 ondisk_format
= latest_ondisk_format
;
6559 _prepare_ondisk_format_super(t
);
6560 db
->submit_transaction_sync(t
);
6563 r
= write_meta("kv_backend", cct
->_conf
->bluestore_kvbackend
);
6567 r
= write_meta("bluefs", stringify(bluefs
? 1 : 0));
6571 if (fsid
!= old_fsid
) {
6574 derr
<< __func__
<< " error writing fsid: " << cpp_strerror(r
) << dendl
;
6593 cct
->_conf
->bluestore_fsck_on_mkfs
) {
6594 int rc
= fsck(cct
->_conf
->bluestore_fsck_on_mkfs_deep
);
6598 derr
<< __func__
<< " fsck found " << rc
<< " errors" << dendl
;
6604 // indicate success by writing the 'mkfs_done' file
6605 r
= write_meta("mkfs_done", "yes");
6609 derr
<< __func__
<< " failed, " << cpp_strerror(r
) << dendl
;
6611 dout(0) << __func__
<< " success" << dendl
;
6616 int BlueStore::add_new_bluefs_device(int id
, const string
& dev_path
)
6618 dout(10) << __func__
<< " path " << dev_path
<< " id:" << id
<< dendl
;
6620 ceph_assert(path_fd
< 0);
6622 ceph_assert(id
== BlueFS::BDEV_NEWWAL
|| id
== BlueFS::BDEV_NEWDB
);
6624 if (!cct
->_conf
->bluestore_bluefs
) {
6625 derr
<< __func__
<< " bluefs isn't configured, can't add new device " << dendl
;
6629 r
= _open_db_and_around(true);
6631 if (id
== BlueFS::BDEV_NEWWAL
) {
6632 string p
= path
+ "/block.wal";
6633 r
= _setup_block_symlink_or_file("block.wal", dev_path
,
6634 cct
->_conf
->bluestore_block_wal_size
,
6636 ceph_assert(r
== 0);
6638 r
= bluefs
->add_block_device(BlueFS::BDEV_NEWWAL
, p
,
6639 cct
->_conf
->bdev_enable_discard
,
6640 BDEV_LABEL_BLOCK_SIZE
);
6641 ceph_assert(r
== 0);
6643 if (bluefs
->bdev_support_label(BlueFS::BDEV_NEWWAL
)) {
6644 r
= _check_or_set_bdev_label(
6646 bluefs
->get_block_device_size(BlueFS::BDEV_NEWWAL
),
6649 ceph_assert(r
== 0);
6652 bluefs_layout
.dedicated_wal
= true;
6653 } else if (id
== BlueFS::BDEV_NEWDB
) {
6654 string p
= path
+ "/block.db";
6655 r
= _setup_block_symlink_or_file("block.db", dev_path
,
6656 cct
->_conf
->bluestore_block_db_size
,
6658 ceph_assert(r
== 0);
6660 r
= bluefs
->add_block_device(BlueFS::BDEV_NEWDB
, p
,
6661 cct
->_conf
->bdev_enable_discard
,
6663 ceph_assert(r
== 0);
6665 if (bluefs
->bdev_support_label(BlueFS::BDEV_NEWDB
)) {
6666 r
= _check_or_set_bdev_label(
6668 bluefs
->get_block_device_size(BlueFS::BDEV_NEWDB
),
6671 ceph_assert(r
== 0);
6673 bluefs_layout
.shared_bdev
= BlueFS::BDEV_SLOW
;
6674 bluefs_layout
.dedicated_db
= true;
6680 r
= bluefs
->prepare_new_device(id
, bluefs_layout
);
6681 ceph_assert(r
== 0);
6684 derr
<< __func__
<< " failed, " << cpp_strerror(r
) << dendl
;
6686 dout(0) << __func__
<< " success" << dendl
;
6689 _close_db_and_around(true);
6693 int BlueStore::migrate_to_existing_bluefs_device(const set
<int>& devs_source
,
6696 dout(10) << __func__
<< " id:" << id
<< dendl
;
6697 ceph_assert(path_fd
< 0);
6699 ceph_assert(id
== BlueFS::BDEV_SLOW
|| id
== BlueFS::BDEV_DB
);
6701 if (!cct
->_conf
->bluestore_bluefs
) {
6702 derr
<< __func__
<< " bluefs isn't configured, can't add new device " << dendl
;
6706 int r
= _open_db_and_around(true);
6708 uint64_t used_space
= 0;
6709 for(auto src_id
: devs_source
) {
6710 used_space
+= bluefs
->get_used(src_id
);
6712 uint64_t target_free
= bluefs
->get_free(id
);
6713 if (target_free
< used_space
) {
6715 << " can't migrate, free space at target: " << target_free
6716 << " is less than required space: " << used_space
6721 if (devs_source
.count(BlueFS::BDEV_DB
)) {
6722 bluefs_layout
.shared_bdev
= BlueFS::BDEV_DB
;
6723 bluefs_layout
.dedicated_db
= false;
6725 if (devs_source
.count(BlueFS::BDEV_WAL
)) {
6726 bluefs_layout
.dedicated_wal
= false;
6728 r
= bluefs
->device_migrate_to_existing(cct
, devs_source
, id
, bluefs_layout
);
6730 derr
<< __func__
<< " failed during BlueFS migration, " << cpp_strerror(r
) << dendl
;
6734 if (devs_source
.count(BlueFS::BDEV_DB
)) {
6735 r
= unlink(string(path
+ "/block.db").c_str());
6736 ceph_assert(r
== 0);
6738 if (devs_source
.count(BlueFS::BDEV_WAL
)) {
6739 r
= unlink(string(path
+ "/block.wal").c_str());
6740 ceph_assert(r
== 0);
6744 _close_db_and_around(true);
6748 int BlueStore::migrate_to_new_bluefs_device(const set
<int>& devs_source
,
6750 const string
& dev_path
)
6752 dout(10) << __func__
<< " path " << dev_path
<< " id:" << id
<< dendl
;
6754 ceph_assert(path_fd
< 0);
6756 ceph_assert(id
== BlueFS::BDEV_NEWWAL
|| id
== BlueFS::BDEV_NEWDB
);
6758 if (!cct
->_conf
->bluestore_bluefs
) {
6759 derr
<< __func__
<< " bluefs isn't configured, can't add new device " << dendl
;
6763 r
= _open_db_and_around(true);
6767 if (devs_source
.count(BlueFS::BDEV_DB
) &&
6768 bluefs_layout
.shared_bdev
!= BlueFS::BDEV_DB
) {
6769 link_db
= path
+ "/block.db";
6770 bluefs_layout
.shared_bdev
= BlueFS::BDEV_DB
;
6771 bluefs_layout
.dedicated_db
= false;
6773 if (devs_source
.count(BlueFS::BDEV_WAL
)) {
6774 link_wal
= path
+ "/block.wal";
6775 bluefs_layout
.dedicated_wal
= false;
6780 if (id
== BlueFS::BDEV_NEWWAL
) {
6781 target_name
= "block.wal";
6782 target_size
= cct
->_conf
->bluestore_block_wal_size
;
6783 bluefs_layout
.dedicated_wal
= true;
6785 r
= bluefs
->add_block_device(BlueFS::BDEV_NEWWAL
, dev_path
,
6786 cct
->_conf
->bdev_enable_discard
,
6787 BDEV_LABEL_BLOCK_SIZE
);
6788 ceph_assert(r
== 0);
6790 if (bluefs
->bdev_support_label(BlueFS::BDEV_NEWWAL
)) {
6791 r
= _check_or_set_bdev_label(
6793 bluefs
->get_block_device_size(BlueFS::BDEV_NEWWAL
),
6796 ceph_assert(r
== 0);
6798 } else if (id
== BlueFS::BDEV_NEWDB
) {
6799 target_name
= "block.db";
6800 target_size
= cct
->_conf
->bluestore_block_db_size
;
6801 bluefs_layout
.shared_bdev
= BlueFS::BDEV_SLOW
;
6802 bluefs_layout
.dedicated_db
= true;
6804 r
= bluefs
->add_block_device(BlueFS::BDEV_NEWDB
, dev_path
,
6805 cct
->_conf
->bdev_enable_discard
,
6807 ceph_assert(r
== 0);
6809 if (bluefs
->bdev_support_label(BlueFS::BDEV_NEWDB
)) {
6810 r
= _check_or_set_bdev_label(
6812 bluefs
->get_block_device_size(BlueFS::BDEV_NEWDB
),
6815 ceph_assert(r
== 0);
6822 r
= bluefs
->device_migrate_to_new(cct
, devs_source
, id
, bluefs_layout
);
6825 derr
<< __func__
<< " failed during BlueFS migration, " << cpp_strerror(r
) << dendl
;
6829 if (!link_db
.empty()) {
6830 r
= unlink(link_db
.c_str());
6831 ceph_assert(r
== 0);
6833 if (!link_wal
.empty()) {
6834 r
= unlink(link_wal
.c_str());
6835 ceph_assert(r
== 0);
6837 r
= _setup_block_symlink_or_file(
6842 ceph_assert(r
== 0);
6843 dout(0) << __func__
<< " success" << dendl
;
6846 _close_db_and_around(true);
6851 string
BlueStore::get_device_path(unsigned id
)
6854 if (id
< BlueFS::MAX_BDEV
) {
6856 case BlueFS::BDEV_WAL
:
6857 res
= path
+ "/block.wal";
6859 case BlueFS::BDEV_DB
:
6860 if (id
== bluefs_layout
.shared_bdev
) {
6861 res
= path
+ "/block";
6863 res
= path
+ "/block.db";
6866 case BlueFS::BDEV_SLOW
:
6867 res
= path
+ "/block";
6874 int BlueStore::_set_bdev_label_size(const string
& path
, uint64_t size
)
6876 bluestore_bdev_label_t label
;
6877 int r
= _read_bdev_label(cct
, path
, &label
);
6879 derr
<< "unable to read label for " << path
<< ": "
6880 << cpp_strerror(r
) << dendl
;
6883 r
= _write_bdev_label(cct
, path
, label
);
6885 derr
<< "unable to write label for " << path
<< ": "
6886 << cpp_strerror(r
) << dendl
;
6892 int BlueStore::expand_devices(ostream
& out
)
6894 int r
= _open_db_and_around(true);
6895 ceph_assert(r
== 0);
6896 bluefs
->dump_block_extents(out
);
6897 out
<< "Expanding DB/WAL..." << std::endl
;
6898 for (auto devid
: { BlueFS::BDEV_WAL
, BlueFS::BDEV_DB
}) {
6899 if (devid
== bluefs_layout
.shared_bdev
) {
6902 uint64_t size
= bluefs
->get_block_device_size(devid
);
6909 <<" : expanding " << " to 0x" << size
<< std::dec
<< std::endl
;
6910 string p
= get_device_path(devid
);
6911 const char* path
= p
.c_str();
6912 if (path
== nullptr) {
6914 <<": can't find device path " << dendl
;
6917 if (bluefs
->bdev_support_label(devid
)) {
6918 if (_set_bdev_label_size(p
, size
) >= 0) {
6920 << " : size label updated to " << size
6925 uint64_t size0
= fm
->get_size();
6926 uint64_t size
= bdev
->get_size();
6928 out
<< bluefs_layout
.shared_bdev
6929 << " : expanding " << " from 0x" << std::hex
6930 << size0
<< " to 0x" << size
<< std::dec
<< std::endl
;
6931 _write_out_fm_meta(size
);
6932 if (bdev
->supported_bdev_label()) {
6933 if (_set_bdev_label_size(path
, size
) >= 0) {
6934 out
<< bluefs_layout
.shared_bdev
6935 << " : size label updated to " << size
6939 _close_db_and_around(true);
6941 // mount in read/write to sync expansion changes
6943 ceph_assert(r
== 0);
6946 _close_db_and_around(true);
6951 int BlueStore::dump_bluefs_sizes(ostream
& out
)
6953 int r
= _open_db_and_around(true);
6954 ceph_assert(r
== 0);
6955 bluefs
->dump_block_extents(out
);
6956 _close_db_and_around(true);
6960 void BlueStore::set_cache_shards(unsigned num
)
6962 dout(10) << __func__
<< " " << num
<< dendl
;
6963 size_t oold
= onode_cache_shards
.size();
6964 size_t bold
= buffer_cache_shards
.size();
6965 ceph_assert(num
>= oold
&& num
>= bold
);
6966 onode_cache_shards
.resize(num
);
6967 buffer_cache_shards
.resize(num
);
6968 for (unsigned i
= oold
; i
< num
; ++i
) {
6969 onode_cache_shards
[i
] =
6970 OnodeCacheShard::create(cct
, cct
->_conf
->bluestore_cache_type
,
6973 for (unsigned i
= bold
; i
< num
; ++i
) {
6974 buffer_cache_shards
[i
] =
6975 BufferCacheShard::create(cct
, cct
->_conf
->bluestore_cache_type
,
6980 int BlueStore::_mount()
6982 dout(1) << __func__
<< " path " << path
<< dendl
;
6985 if (cct
->_conf
->bluestore_fsck_on_mount
) {
6986 int rc
= fsck(cct
->_conf
->bluestore_fsck_on_mount_deep
);
6990 derr
<< __func__
<< " fsck found " << rc
<< " errors" << dendl
;
6995 if (cct
->_conf
->osd_max_object_size
> OBJECT_MAX_SIZE
) {
6996 derr
<< __func__
<< " osd_max_object_size "
6997 << cct
->_conf
->osd_max_object_size
<< " > bluestore max "
6998 << OBJECT_MAX_SIZE
<< dendl
;
7002 int r
= _open_db_and_around(false);
7007 r
= _upgrade_super();
7012 r
= _open_collections();
7016 r
= _reload_logger();
7022 if (bdev
->is_smr()) {
7023 _zoned_cleaner_start();
7026 r
= _deferred_replay();
7030 mempool_thread
.init();
7032 if ((!per_pool_stat_collection
|| per_pool_omap
!= OMAP_PER_PG
) &&
7033 cct
->_conf
->bluestore_fsck_quick_fix_on_mount
== true) {
7035 auto was_per_pool_omap
= per_pool_omap
;
7037 dout(1) << __func__
<< " quick-fix on mount" << dendl
;
7038 _fsck_on_open(FSCK_SHALLOW
, true);
7041 //FIXME minor: replace with actual open/close?
7043 _check_legacy_statfs_alert();
7045 //set again as hopefully it has been fixed
7046 if (was_per_pool_omap
!= OMAP_PER_PG
) {
7047 _set_per_pool_omap();
7055 if (bdev
->is_smr()) {
7056 _zoned_cleaner_stop();
7062 _close_db_and_around(false);
7066 int BlueStore::umount()
7068 ceph_assert(_kv_only
|| mounted
);
7069 dout(1) << __func__
<< dendl
;
7075 mempool_thread
.shutdown();
7076 if (bdev
->is_smr()) {
7077 dout(20) << __func__
<< " stopping zone cleaner thread" << dendl
;
7078 _zoned_cleaner_stop();
7080 dout(20) << __func__
<< " stopping kv thread" << dendl
;
7083 dout(20) << __func__
<< " closing" << dendl
;
7086 _close_db_and_around(false);
7088 if (cct
->_conf
->bluestore_fsck_on_umount
) {
7089 int rc
= fsck(cct
->_conf
->bluestore_fsck_on_umount_deep
);
7093 derr
<< __func__
<< " fsck found " << rc
<< " errors" << dendl
;
7100 int BlueStore::cold_open()
7102 return _open_db_and_around(true);
7105 int BlueStore::cold_close()
7107 _close_db_and_around(true);
7111 // derr wrapper to limit enormous output and avoid log flooding.
7112 // Of limited use where such output is expected for now
7113 #define fsck_derr(err_cnt, threshold) \
7114 if (err_cnt <= threshold) { \
7115 bool need_skip_print = err_cnt == threshold; \
7118 #define fsck_dendl \
7120 if (need_skip_print) \
7121 derr << "more error lines skipped..." << dendl; \
7124 int _fsck_sum_extents(
7125 const PExtentVector
& extents
,
7127 store_statfs_t
& expected_statfs
)
7129 for (auto e
: extents
) {
7132 expected_statfs
.allocated
+= e
.length
;
7134 expected_statfs
.data_compressed_allocated
+= e
.length
;
7140 int BlueStore::_fsck_check_extents(
7142 const ghobject_t
& oid
,
7143 const PExtentVector
& extents
,
7145 mempool_dynamic_bitset
&used_blocks
,
7146 uint64_t granularity
,
7147 BlueStoreRepairer
* repairer
,
7148 store_statfs_t
& expected_statfs
,
7151 dout(30) << __func__
<< " oid " << oid
<< " extents " << extents
<< dendl
;
7153 for (auto e
: extents
) {
7156 expected_statfs
.allocated
+= e
.length
;
7158 expected_statfs
.data_compressed_allocated
+= e
.length
;
7160 if (depth
!= FSCK_SHALLOW
) {
7161 bool already
= false;
7162 apply_for_bitset_range(
7163 e
.offset
, e
.length
, granularity
, used_blocks
,
7164 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
7167 repairer
->note_misreference(
7168 pos
* min_alloc_size
, min_alloc_size
, !already
);
7171 derr
<< "fsck error: " << oid
<< " extent " << e
7172 << " or a subset is already allocated (misreferenced)" << dendl
;
7181 repairer
->set_space_used(e
.offset
, e
.length
, cid
, oid
);
7184 if (e
.end() > bdev
->get_size()) {
7185 derr
<< "fsck error: " << oid
<< " extent " << e
7186 << " past end of block device" << dendl
;
7194 void BlueStore::_fsck_check_pool_statfs(
7195 BlueStore::per_pool_statfs
& expected_pool_statfs
,
7198 BlueStoreRepairer
* repairer
)
7200 auto it
= db
->get_iterator(PREFIX_STAT
, KeyValueDB::ITERATOR_NOCACHE
);
7202 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
7203 string key
= it
->key();
7204 if (key
== BLUESTORE_GLOBAL_STATFS_KEY
) {
7207 repairer
->remove_key(db
, PREFIX_STAT
, BLUESTORE_GLOBAL_STATFS_KEY
);
7208 derr
<< "fsck error: " << "legacy statfs record found, removing"
7214 if (get_key_pool_stat(key
, &pool_id
) < 0) {
7215 derr
<< "fsck error: bad key " << key
7216 << "in statfs namespece" << dendl
;
7218 repairer
->remove_key(db
, PREFIX_STAT
, key
);
7224 volatile_statfs vstatfs
;
7225 bufferlist bl
= it
->value();
7226 auto blp
= bl
.cbegin();
7228 vstatfs
.decode(blp
);
7229 } catch (ceph::buffer::error
& e
) {
7230 derr
<< "fsck error: failed to decode Pool StatFS record"
7231 << pretty_binary_string(key
) << dendl
;
7233 dout(20) << __func__
<< " undecodable Pool StatFS record, key:'"
7234 << pretty_binary_string(key
)
7235 << "', removing" << dendl
;
7236 repairer
->remove_key(db
, PREFIX_STAT
, key
);
7241 auto stat_it
= expected_pool_statfs
.find(pool_id
);
7242 if (stat_it
== expected_pool_statfs
.end()) {
7243 if (vstatfs
.is_empty()) {
7244 // we don't consider that as an error since empty pool statfs
7245 // are left in DB for now
7246 dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
7247 << std::hex
<< pool_id
<< std::dec
<< dendl
;
7249 // but we need to increment error count in case of repair
7250 // to have proper counters at the end
7251 // (as repairer increments recovery counter anyway).
7255 derr
<< "fsck error: found stray Pool StatFS record for pool id 0x"
7256 << std::hex
<< pool_id
<< std::dec
<< dendl
;
7260 repairer
->remove_key(db
, PREFIX_STAT
, key
);
7264 store_statfs_t statfs
;
7265 vstatfs
.publish(&statfs
);
7266 if (!(stat_it
->second
== statfs
)) {
7267 derr
<< "fsck error: actual " << statfs
7268 << " != expected " << stat_it
->second
7270 << std::hex
<< pool_id
<< std::dec
<< dendl
;
7272 repairer
->fix_statfs(db
, key
, stat_it
->second
);
7276 expected_pool_statfs
.erase(stat_it
);
7279 for (auto& s
: expected_pool_statfs
) {
7280 if (s
.second
.is_zero()) {
7281 // we might lack empty statfs recs in DB
7284 derr
<< "fsck error: missing Pool StatFS record for pool "
7285 << std::hex
<< s
.first
<< std::dec
<< dendl
;
7288 get_pool_stat_key(s
.first
, &key
);
7289 repairer
->fix_statfs(db
, key
, s
.second
);
7293 if (!per_pool_stat_collection
&&
7295 // by virtue of running this method, we correct the top-level
7296 // error of having global stats
7297 repairer
->inc_repaired();
7301 BlueStore::OnodeRef
BlueStore::fsck_check_objects_shallow(
7302 BlueStore::FSCKDepth depth
,
7304 BlueStore::CollectionRef c
,
7305 const ghobject_t
& oid
,
7307 const bufferlist
& value
,
7308 mempool::bluestore_fsck::list
<string
>* expecting_shards
,
7309 map
<BlobRef
, bluestore_blob_t::unused_t
>* referenced
,
7310 const BlueStore::FSCK_ObjectCtx
& ctx
)
7312 auto& errors
= ctx
.errors
;
7313 auto& num_objects
= ctx
.num_objects
;
7314 auto& num_extents
= ctx
.num_extents
;
7315 auto& num_blobs
= ctx
.num_blobs
;
7316 auto& num_sharded_objects
= ctx
.num_sharded_objects
;
7317 auto& num_spanning_blobs
= ctx
.num_spanning_blobs
;
7318 auto used_blocks
= ctx
.used_blocks
;
7319 auto sb_info_lock
= ctx
.sb_info_lock
;
7320 auto& sb_info
= ctx
.sb_info
;
7321 auto repairer
= ctx
.repairer
;
7323 store_statfs_t
* res_statfs
= (per_pool_stat_collection
|| repairer
) ?
7324 &ctx
.expected_pool_statfs
[pool_id
] :
7325 &ctx
.expected_store_statfs
;
7327 dout(10) << __func__
<< " " << oid
<< dendl
;
7329 o
.reset(Onode::decode(c
, oid
, key
, value
));
7332 num_spanning_blobs
+= o
->extent_map
.spanning_blob_map
.size();
7334 o
->extent_map
.fault_range(db
, 0, OBJECT_MAX_SIZE
);
7335 _dump_onode
<30>(cct
, *o
);
7337 if (!o
->extent_map
.shards
.empty()) {
7338 ++num_sharded_objects
;
7339 if (depth
!= FSCK_SHALLOW
) {
7340 ceph_assert(expecting_shards
);
7341 for (auto& s
: o
->extent_map
.shards
) {
7342 dout(20) << __func__
<< " shard " << *s
.shard_info
<< dendl
;
7343 expecting_shards
->push_back(string());
7344 get_extent_shard_key(o
->key
, s
.shard_info
->offset
,
7345 &expecting_shards
->back());
7346 if (s
.shard_info
->offset
>= o
->onode
.size
) {
7347 derr
<< "fsck error: " << oid
<< " shard 0x" << std::hex
7348 << s
.shard_info
->offset
<< " past EOF at 0x" << o
->onode
.size
7349 << std::dec
<< dendl
;
7358 mempool::bluestore_fsck::map
<BlobRef
,
7359 bluestore_blob_use_tracker_t
> ref_map
;
7360 for (auto& l
: o
->extent_map
.extent_map
) {
7361 dout(20) << __func__
<< " " << l
<< dendl
;
7362 if (l
.logical_offset
< pos
) {
7363 derr
<< "fsck error: " << oid
<< " lextent at 0x"
7364 << std::hex
<< l
.logical_offset
7365 << " overlaps with the previous, which ends at 0x" << pos
7366 << std::dec
<< dendl
;
7369 if (depth
!= FSCK_SHALLOW
&&
7370 o
->extent_map
.spans_shard(l
.logical_offset
, l
.length
)) {
7371 derr
<< "fsck error: " << oid
<< " lextent at 0x"
7372 << std::hex
<< l
.logical_offset
<< "~" << l
.length
7373 << " spans a shard boundary"
7374 << std::dec
<< dendl
;
7377 pos
= l
.logical_offset
+ l
.length
;
7378 res_statfs
->data_stored
+= l
.length
;
7379 ceph_assert(l
.blob
);
7380 const bluestore_blob_t
& blob
= l
.blob
->get_blob();
7382 auto& ref
= ref_map
[l
.blob
];
7383 if (ref
.is_empty()) {
7384 uint32_t min_release_size
= blob
.get_release_size(min_alloc_size
);
7385 uint32_t l
= blob
.get_logical_length();
7386 ref
.init(l
, min_release_size
);
7392 if (depth
!= FSCK_SHALLOW
&&
7393 blob
.has_unused()) {
7394 ceph_assert(referenced
);
7395 auto p
= referenced
->find(l
.blob
);
7396 bluestore_blob_t::unused_t
* pu
;
7397 if (p
== referenced
->end()) {
7398 pu
= &(*referenced
)[l
.blob
];
7403 uint64_t blob_len
= blob
.get_logical_length();
7404 ceph_assert((blob_len
% (sizeof(*pu
) * 8)) == 0);
7405 ceph_assert(l
.blob_offset
+ l
.length
<= blob_len
);
7406 uint64_t chunk_size
= blob_len
/ (sizeof(*pu
) * 8);
7407 uint64_t start
= l
.blob_offset
/ chunk_size
;
7409 round_up_to(l
.blob_offset
+ l
.length
, chunk_size
) / chunk_size
;
7410 for (auto i
= start
; i
< end
; ++i
) {
7414 } //for (auto& l : o->extent_map.extent_map)
7416 for (auto& i
: ref_map
) {
7418 const bluestore_blob_t
& blob
= i
.first
->get_blob();
7420 depth
== FSCK_SHALLOW
? true :
7421 i
.first
->get_blob_use_tracker().equal(i
.second
);
7423 derr
<< "fsck error: " << oid
<< " blob " << *i
.first
7424 << " doesn't match expected ref_map " << i
.second
<< dendl
;
7427 if (blob
.is_compressed()) {
7428 res_statfs
->data_compressed
+= blob
.get_compressed_payload_length();
7429 res_statfs
->data_compressed_original
+=
7430 i
.first
->get_referenced_bytes();
7432 if (blob
.is_shared()) {
7433 if (i
.first
->shared_blob
->get_sbid() > blobid_max
) {
7434 derr
<< "fsck error: " << oid
<< " blob " << blob
7435 << " sbid " << i
.first
->shared_blob
->get_sbid() << " > blobid_max "
7436 << blobid_max
<< dendl
;
7439 else if (i
.first
->shared_blob
->get_sbid() == 0) {
7440 derr
<< "fsck error: " << oid
<< " blob " << blob
7441 << " marked as shared but has uninitialized sbid"
7445 // the below lock is optional and provided in multithreading mode only
7447 sb_info_lock
->lock();
7449 sb_info_t
& sbi
= sb_info
[i
.first
->shared_blob
->get_sbid()];
7450 ceph_assert(sbi
.cid
== coll_t() || sbi
.cid
== c
->cid
);
7451 ceph_assert(sbi
.pool_id
== INT64_MIN
||
7452 sbi
.pool_id
== oid
.hobj
.get_logical_pool());
7454 sbi
.pool_id
= oid
.hobj
.get_logical_pool();
7455 sbi
.sb
= i
.first
->shared_blob
;
7456 sbi
.oids
.push_back(oid
);
7457 sbi
.compressed
= blob
.is_compressed();
7458 for (auto e
: blob
.get_extents()) {
7460 sbi
.ref_map
.get(e
.offset
, e
.length
);
7464 sb_info_lock
->unlock();
7466 } else if (depth
!= FSCK_SHALLOW
) {
7467 ceph_assert(used_blocks
);
7468 errors
+= _fsck_check_extents(c
->cid
, oid
, blob
.get_extents(),
7469 blob
.is_compressed(),
7471 fm
->get_alloc_size(),
7476 errors
+= _fsck_sum_extents(
7478 blob
.is_compressed(),
7481 } // for (auto& i : ref_map)
7484 auto &sbm
= o
->extent_map
.spanning_blob_map
;
7486 BlobRef first_broken
;
7487 for (auto it
= sbm
.begin(); it
!= sbm
.end();) {
7489 if (ref_map
.count(it1
->second
) == 0) {
7491 first_broken
= it1
->second
;
7501 derr
<< "fsck error: " << oid
<< " - " << broken
7502 << " zombie spanning blob(s) found, the first one: "
7503 << *first_broken
<< dendl
;
7505 repairer
->fix_spanning_blobs(
7507 [&](KeyValueDB::Transaction txn
) {
7508 _record_onode(o
, txn
);
7514 if (o
->onode
.has_omap()) {
7515 _fsck_check_object_omap(depth
, o
, ctx
);
7521 #include "common/WorkQueue.h"
7523 class ShallowFSCKThreadPool
: public ThreadPool
7526 ShallowFSCKThreadPool(CephContext
* cct_
, std::string nm
, std::string tn
, int n
) :
7527 ThreadPool(cct_
, nm
, tn
, n
) {
7529 void worker(ThreadPool::WorkThread
* wt
) override
{
7532 next_wq
%= work_queues
.size();
7533 WorkQueue_
*wq
= work_queues
[next_wq
++];
7535 void* item
= wq
->_void_dequeue();
7538 TPHandle
tp_handle(cct
, nullptr, wq
->timeout_interval
, wq
->suicide_interval
);
7539 wq
->_void_process(item
, tp_handle
);
7544 template <size_t BatchLen
>
7545 struct FSCKWorkQueue
: public ThreadPool::WorkQueue_
7549 BlueStore::CollectionRef c
;
7555 std::atomic
<size_t> running
= { 0 };
7556 size_t entry_count
= 0;
7557 std::array
<Entry
, BatchLen
> entries
;
7560 int64_t warnings
= 0;
7561 uint64_t num_objects
= 0;
7562 uint64_t num_extents
= 0;
7563 uint64_t num_blobs
= 0;
7564 uint64_t num_sharded_objects
= 0;
7565 uint64_t num_spanning_blobs
= 0;
7566 store_statfs_t expected_store_statfs
;
7567 BlueStore::per_pool_statfs expected_pool_statfs
;
7571 BlueStore
* store
= nullptr;
7573 ceph::mutex
* sb_info_lock
= nullptr;
7574 BlueStore::sb_info_map_t
* sb_info
= nullptr;
7575 BlueStoreRepairer
* repairer
= nullptr;
7577 Batch
* batches
= nullptr;
7578 size_t last_batch_pos
= 0;
7579 bool batch_acquired
= false;
7581 FSCKWorkQueue(std::string n
,
7584 ceph::mutex
* _sb_info_lock
,
7585 BlueStore::sb_info_map_t
& _sb_info
,
7586 BlueStoreRepairer
* _repairer
) :
7587 WorkQueue_(n
, ceph::timespan::zero(), ceph::timespan::zero()),
7588 batchCount(_batchCount
),
7590 sb_info_lock(_sb_info_lock
),
7594 batches
= new Batch
[batchCount
];
7600 /// Remove all work items from the queue.
7601 void _clear() override
{
7604 /// Check whether there is anything to do.
7605 bool _empty() override
{
7609 /// Get the next work item to process.
7610 void* _void_dequeue() override
{
7611 size_t pos
= rand() % batchCount
;
7614 auto& batch
= batches
[pos
];
7615 if (batch
.running
.fetch_add(1) == 0) {
7616 if (batch
.entry_count
) {
7623 } while (pos
!= pos0
);
7626 /** @brief Process the work item.
7627 * This function will be called several times in parallel
7628 * and must therefore be thread-safe. */
7629 void _void_process(void* item
, TPHandle
& handle
) override
{
7630 Batch
* batch
= (Batch
*)item
;
7632 BlueStore::FSCK_ObjectCtx
ctx(
7638 batch
->num_sharded_objects
,
7639 batch
->num_spanning_blobs
,
7640 nullptr, // used_blocks
7641 nullptr, //used_omap_head
7644 batch
->expected_store_statfs
,
7645 batch
->expected_pool_statfs
,
7648 for (size_t i
= 0; i
< batch
->entry_count
; i
++) {
7649 auto& entry
= batch
->entries
[i
];
7651 store
->fsck_check_objects_shallow(
7652 BlueStore::FSCK_SHALLOW
,
7658 nullptr, // expecting_shards - this will need a protection if passed
7659 nullptr, // referenced
7662 //std::cout << "processed " << batch << std::endl;
7663 batch
->entry_count
= 0;
7666 /** @brief Synchronously finish processing a work item.
7667 * This function is called after _void_process with the global thread pool lock held,
7668 * so at most one copy will execute simultaneously for a given thread pool.
7669 * It can be used for non-thread-safe finalization. */
7670 void _void_process_finish(void*) override
{
7676 BlueStore::CollectionRef c
,
7677 const ghobject_t
& oid
,
7679 const bufferlist
& value
) {
7681 size_t pos0
= last_batch_pos
;
7682 if (!batch_acquired
) {
7684 auto& batch
= batches
[last_batch_pos
];
7685 if (batch
.running
.fetch_add(1) == 0) {
7686 if (batch
.entry_count
< BatchLen
) {
7687 batch_acquired
= true;
7691 batch
.running
.fetch_sub(1);
7693 last_batch_pos
%= batchCount
;
7694 } while (last_batch_pos
!= pos0
);
7696 if (batch_acquired
) {
7697 auto& batch
= batches
[last_batch_pos
];
7698 ceph_assert(batch
.running
);
7699 ceph_assert(batch
.entry_count
< BatchLen
);
7701 auto& entry
= batch
.entries
[batch
.entry_count
];
7702 entry
.pool_id
= pool_id
;
7706 entry
.value
= value
;
7708 ++batch
.entry_count
;
7709 if (batch
.entry_count
== BatchLen
) {
7710 batch_acquired
= false;
7711 batch
.running
.fetch_sub(1);
7713 last_batch_pos
%= batchCount
;
7720 void finalize(ThreadPool
& tp
,
7721 BlueStore::FSCK_ObjectCtx
& ctx
) {
7722 if (batch_acquired
) {
7723 auto& batch
= batches
[last_batch_pos
];
7724 ceph_assert(batch
.running
);
7725 batch
.running
.fetch_sub(1);
7729 for (size_t i
= 0; i
< batchCount
; i
++) {
7730 auto& batch
= batches
[i
];
7732 //process leftovers if any
7733 if (batch
.entry_count
) {
7734 TPHandle
tp_handle(store
->cct
,
7738 ceph_assert(batch
.running
== 0);
7740 batch
.running
++; // just to be on-par with the regular call
7741 _void_process(&batch
, tp_handle
);
7743 ceph_assert(batch
.entry_count
== 0);
7745 ctx
.errors
+= batch
.errors
;
7746 ctx
.warnings
+= batch
.warnings
;
7747 ctx
.num_objects
+= batch
.num_objects
;
7748 ctx
.num_extents
+= batch
.num_extents
;
7749 ctx
.num_blobs
+= batch
.num_blobs
;
7750 ctx
.num_sharded_objects
+= batch
.num_sharded_objects
;
7751 ctx
.num_spanning_blobs
+= batch
.num_spanning_blobs
;
7753 ctx
.expected_store_statfs
.add(batch
.expected_store_statfs
);
7755 for (auto it
= batch
.expected_pool_statfs
.begin();
7756 it
!= batch
.expected_pool_statfs
.end();
7758 ctx
.expected_pool_statfs
[it
->first
].add(it
->second
);
7765 void BlueStore::_fsck_check_object_omap(FSCKDepth depth
,
7767 const BlueStore::FSCK_ObjectCtx
& ctx
)
7769 auto& errors
= ctx
.errors
;
7770 auto& warnings
= ctx
.warnings
;
7771 auto repairer
= ctx
.repairer
;
7773 ceph_assert(o
->onode
.has_omap());
7774 if (!o
->onode
.is_perpool_omap() && !o
->onode
.is_pgmeta_omap()) {
7775 if (per_pool_omap
== OMAP_PER_POOL
) {
7776 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
7777 << "fsck error: " << o
->oid
7778 << " has omap that is not per-pool or pgmeta"
7784 if (cct
->_conf
->bluestore_fsck_error_on_no_per_pool_omap
) {
7793 fsck_derr(num
, MAX_FSCK_ERROR_LINES
)
7794 << "fsck " << w
<< ": " << o
->oid
7795 << " has omap that is not per-pool or pgmeta"
7798 } else if (!o
->onode
.is_perpg_omap() && !o
->onode
.is_pgmeta_omap()) {
7799 if (per_pool_omap
== OMAP_PER_PG
) {
7800 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
7801 << "fsck error: " << o
->oid
7802 << " has omap that is not per-pg or pgmeta"
7808 if (cct
->_conf
->bluestore_fsck_error_on_no_per_pg_omap
) {
7817 fsck_derr(num
, MAX_FSCK_ERROR_LINES
)
7818 << "fsck " << w
<< ": " << o
->oid
7819 << " has omap that is not per-pg or pgmeta"
7824 !o
->onode
.is_perpg_omap() &&
7825 !o
->onode
.is_pgmeta_omap()) {
7826 dout(10) << "fsck converting " << o
->oid
<< " omap to per-pg" << dendl
;
7828 map
<string
, bufferlist
> kv
;
7830 KeyValueDB::Transaction txn
= db
->get_transaction();
7831 uint64_t txn_cost
= 0;
7832 const string
& prefix
= Onode::calc_omap_prefix(o
->onode
.flags
);
7833 uint8_t new_flags
= o
->onode
.flags
|
7834 bluestore_onode_t::FLAG_PERPOOL_OMAP
|
7835 bluestore_onode_t::FLAG_PERPG_OMAP
;
7836 const string
& new_omap_prefix
= Onode::calc_omap_prefix(new_flags
);
7838 KeyValueDB::Iterator it
= db
->get_iterator(prefix
);
7840 o
->get_omap_header(&head
);
7841 o
->get_omap_tail(&tail
);
7842 it
->lower_bound(head
);
7844 if (it
->valid() && it
->key() == head
) {
7845 dout(30) << __func__
<< " got header" << dendl
;
7846 header
= it
->value();
7847 if (header
.length()) {
7849 Onode::calc_omap_header(new_flags
, o
.get(), &new_head
);
7850 txn
->set(new_omap_prefix
, new_head
, header
);
7851 txn_cost
+= new_head
.length() + header
.length();
7857 Onode::calc_omap_tail(new_flags
, o
.get(), &new_tail
);
7859 txn
->set(new_omap_prefix
, new_tail
, empty
);
7860 txn_cost
+= new_tail
.length() + new_tail
.length();
7864 Onode::calc_omap_key(new_flags
, o
.get(), string(), &final_key
);
7865 size_t base_key_len
= final_key
.size();
7866 while (it
->valid() && it
->key() < tail
) {
7868 o
->decode_omap_key(it
->key(), &user_key
);
7869 dout(20) << __func__
<< " got " << pretty_binary_string(it
->key())
7870 << " -> " << user_key
<< dendl
;
7872 final_key
.resize(base_key_len
);
7873 final_key
+= it
->key();
7874 auto v
= it
->value();
7875 txn
->set(new_omap_prefix
, final_key
, v
);
7876 txn_cost
+= final_key
.length() + v
.length();
7878 // submit a portion if cost exceeds 16MB
7879 if (txn_cost
>= 16 * (1 << 20) ) {
7880 db
->submit_transaction_sync(txn
);
7881 txn
= db
->get_transaction();
7887 db
->submit_transaction_sync(txn
);
7890 // finalize: remove legacy data
7892 KeyValueDB::Transaction txn
= db
->get_transaction();
7894 const string
& old_omap_prefix
= o
->get_omap_prefix();
7895 string old_head
, old_tail
;
7896 o
->get_omap_header(&old_head
);
7897 o
->get_omap_tail(&old_tail
);
7898 txn
->rm_range_keys(old_omap_prefix
, old_head
, old_tail
);
7899 txn
->rmkey(old_omap_prefix
, old_tail
);
7901 o
->onode
.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP
| bluestore_onode_t::FLAG_PERPG_OMAP
);
7902 _record_onode(o
, txn
);
7903 db
->submit_transaction_sync(txn
);
7904 repairer
->inc_repaired();
7905 repairer
->request_compaction();
7910 void BlueStore::_fsck_check_objects(FSCKDepth depth
,
7911 BlueStore::FSCK_ObjectCtx
& ctx
)
7913 auto& errors
= ctx
.errors
;
7914 auto sb_info_lock
= ctx
.sb_info_lock
;
7915 auto& sb_info
= ctx
.sb_info
;
7916 auto repairer
= ctx
.repairer
;
7918 uint64_t_btree_t used_nids
;
7920 size_t processed_myself
= 0;
7922 auto it
= db
->get_iterator(PREFIX_OBJ
, KeyValueDB::ITERATOR_NOCACHE
);
7923 mempool::bluestore_fsck::list
<string
> expecting_shards
;
7925 const size_t thread_count
= cct
->_conf
->bluestore_fsck_quick_fix_threads
;
7926 typedef ShallowFSCKThreadPool::FSCKWorkQueue
<256> WQ
;
7927 std::unique_ptr
<WQ
> wq(
7930 (thread_count
? : 1) * 32,
7936 ShallowFSCKThreadPool
thread_pool(cct
, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count
);
7938 thread_pool
.add_work_queue(wq
.get());
7939 if (depth
== FSCK_SHALLOW
&& thread_count
> 0) {
7940 //not the best place but let's check anyway
7941 ceph_assert(sb_info_lock
);
7942 thread_pool
.start();
7945 //fill global if not overriden below
7947 int64_t pool_id
= -1;
7949 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
7950 dout(30) << __func__
<< " key "
7951 << pretty_binary_string(it
->key()) << dendl
;
7952 if (is_extent_shard_key(it
->key())) {
7953 if (depth
== FSCK_SHALLOW
) {
7956 while (!expecting_shards
.empty() &&
7957 expecting_shards
.front() < it
->key()) {
7958 derr
<< "fsck error: missing shard key "
7959 << pretty_binary_string(expecting_shards
.front())
7962 expecting_shards
.pop_front();
7964 if (!expecting_shards
.empty() &&
7965 expecting_shards
.front() == it
->key()) {
7967 expecting_shards
.pop_front();
7973 get_key_extent_shard(it
->key(), &okey
, &offset
);
7974 derr
<< "fsck error: stray shard 0x" << std::hex
<< offset
7975 << std::dec
<< dendl
;
7976 if (expecting_shards
.empty()) {
7977 derr
<< "fsck error: " << pretty_binary_string(it
->key())
7978 << " is unexpected" << dendl
;
7982 while (expecting_shards
.front() > it
->key()) {
7983 derr
<< "fsck error: saw " << pretty_binary_string(it
->key())
7985 derr
<< "fsck error: exp "
7986 << pretty_binary_string(expecting_shards
.front()) << dendl
;
7988 expecting_shards
.pop_front();
7989 if (expecting_shards
.empty()) {
7997 int r
= get_key_object(it
->key(), &oid
);
7999 derr
<< "fsck error: bad object key "
8000 << pretty_binary_string(it
->key()) << dendl
;
8005 oid
.shard_id
!= pgid
.shard
||
8006 oid
.hobj
.get_logical_pool() != (int64_t)pgid
.pool() ||
8007 !c
->contains(oid
)) {
8009 for (auto& p
: coll_map
) {
8010 if (p
.second
->contains(oid
)) {
8016 derr
<< "fsck error: stray object " << oid
8017 << " not owned by any collection" << dendl
;
8021 pool_id
= c
->cid
.is_pg(&pgid
) ? pgid
.pool() : META_POOL_ID
;
8022 dout(20) << __func__
<< " collection " << c
->cid
<< " " << c
->cnode
8026 if (depth
!= FSCK_SHALLOW
&&
8027 !expecting_shards
.empty()) {
8028 for (auto& k
: expecting_shards
) {
8029 derr
<< "fsck error: missing shard key "
8030 << pretty_binary_string(k
) << dendl
;
8033 expecting_shards
.clear();
8036 bool queued
= false;
8037 if (depth
== FSCK_SHALLOW
&& thread_count
> 0) {
8046 map
<BlobRef
, bluestore_blob_t::unused_t
> referenced
;
8051 o
= fsck_check_objects_shallow(
8063 if (depth
!= FSCK_SHALLOW
) {
8064 ceph_assert(o
!= nullptr);
8066 if (o
->onode
.nid
> nid_max
) {
8067 derr
<< "fsck error: " << oid
<< " nid " << o
->onode
.nid
8068 << " > nid_max " << nid_max
<< dendl
;
8071 if (used_nids
.count(o
->onode
.nid
)) {
8072 derr
<< "fsck error: " << oid
<< " nid " << o
->onode
.nid
8073 << " already in use" << dendl
;
8075 continue; // go for next object
8077 used_nids
.insert(o
->onode
.nid
);
8079 for (auto& i
: referenced
) {
8080 dout(20) << __func__
<< " referenced 0x" << std::hex
<< i
.second
8081 << std::dec
<< " for " << *i
.first
<< dendl
;
8082 const bluestore_blob_t
& blob
= i
.first
->get_blob();
8083 if (i
.second
& blob
.unused
) {
8084 derr
<< "fsck error: " << oid
<< " blob claims unused 0x"
8085 << std::hex
<< blob
.unused
8086 << " but extents reference 0x" << i
.second
<< std::dec
8087 << " on blob " << *i
.first
<< dendl
;
8090 if (blob
.has_csum()) {
8091 uint64_t blob_len
= blob
.get_logical_length();
8092 uint64_t unused_chunk_size
= blob_len
/ (sizeof(blob
.unused
) * 8);
8093 unsigned csum_count
= blob
.get_csum_count();
8094 unsigned csum_chunk_size
= blob
.get_csum_chunk_size();
8095 for (unsigned p
= 0; p
< csum_count
; ++p
) {
8096 unsigned pos
= p
* csum_chunk_size
;
8097 unsigned firstbit
= pos
/ unused_chunk_size
; // [firstbit,lastbit]
8098 unsigned lastbit
= (pos
+ csum_chunk_size
- 1) / unused_chunk_size
;
8099 unsigned mask
= 1u << firstbit
;
8100 for (unsigned b
= firstbit
+ 1; b
<= lastbit
; ++b
) {
8103 if ((blob
.unused
& mask
) == mask
) {
8104 // this csum chunk region is marked unused
8105 if (blob
.get_csum_item(p
) != 0) {
8106 derr
<< "fsck error: " << oid
8107 << " blob claims csum chunk 0x" << std::hex
<< pos
8108 << "~" << csum_chunk_size
8109 << " is unused (mask 0x" << mask
<< " of unused 0x"
8110 << blob
.unused
<< ") but csum is non-zero 0x"
8111 << blob
.get_csum_item(p
) << std::dec
<< " on blob "
8112 << *i
.first
<< dendl
;
8120 if (o
->onode
.has_omap()) {
8121 ceph_assert(ctx
.used_omap_head
);
8122 if (ctx
.used_omap_head
->count(o
->onode
.nid
)) {
8123 derr
<< "fsck error: " << o
->oid
<< " omap_head " << o
->onode
.nid
8124 << " already in use" << dendl
;
8127 ctx
.used_omap_head
->insert(o
->onode
.nid
);
8129 } // if (o->onode.has_omap())
8130 if (depth
== FSCK_DEEP
) {
8132 uint64_t max_read_block
= cct
->_conf
->bluestore_fsck_read_bytes_cap
;
8133 uint64_t offset
= 0;
8135 uint64_t l
= std::min(uint64_t(o
->onode
.size
- offset
), max_read_block
);
8136 int r
= _do_read(c
.get(), o
, offset
, l
, bl
,
8137 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
8140 derr
<< "fsck error: " << oid
<< std::hex
8141 << " error during read: "
8142 << " " << offset
<< "~" << l
8143 << " " << cpp_strerror(r
) << std::dec
8148 } while (offset
< o
->onode
.size
);
8150 } //if (depth != FSCK_SHALLOW)
8151 } // for (it->lower_bound(string()); it->valid(); it->next())
8152 if (depth
== FSCK_SHALLOW
&& thread_count
> 0) {
8153 wq
->finalize(thread_pool
, ctx
);
8154 if (processed_myself
) {
8155 // may be needs more threads?
8156 dout(0) << __func__
<< " partial offload"
8157 << ", done myself " << processed_myself
8158 << " of " << ctx
.num_objects
8159 << "objects, threads " << thread_count
8166 An overview for currently implemented repair logics
8167 performed in fsck in two stages: detection(+preparation) and commit.
8168 Detection stage (in processing order):
8169 (Issue -> Repair action to schedule)
8170 - Detect undecodable keys for Shared Blobs -> Remove
8171 - Detect undecodable records for Shared Blobs -> Remove
8172 (might trigger missed Shared Blob detection below)
8173 - Detect stray records for Shared Blobs -> Remove
8174 - Detect misreferenced pextents -> Fix
8175 Prepare Bloom-like filter to track cid/oid -> pextent
8176 Prepare list of extents that are improperly referenced
8177 Enumerate Onode records that might use 'misreferenced' pextents
8178 (Bloom-like filter applied to reduce computation)
8179 Per each questinable Onode enumerate all blobs and identify broken ones
8180 (i.e. blobs having 'misreferences')
8181 Rewrite each broken blob data by allocating another extents and
8183 If blob is shared - unshare it and mark corresponding Shared Blob
8185 Release previously allocated space
8187 - Detect missed Shared Blobs -> Recreate
8188 - Detect undecodable deferred transaction -> Remove
8189 - Detect Freelist Manager's 'false free' entries -> Mark as used
8190 - Detect Freelist Manager's leaked entries -> Mark as free
8191 - Detect statfs inconsistency - Update
8192 Commit stage (separate DB commit per each step):
8193 - Apply leaked FM entries fix
8194 - Apply 'false free' FM entries fix
8195 - Apply 'Remove' actions
8196 - Apply fix for misreference pextents
8197 - Apply Shared Blob recreate
8198 (can be merged with the step above if misreferences were dectected)
8199 - Apply StatFS update
8201 int BlueStore::_fsck(BlueStore::FSCKDepth depth
, bool repair
)
8204 << (repair
? " repair" : " check")
8205 << (depth
== FSCK_DEEP
? " (deep)" :
8206 depth
== FSCK_SHALLOW
? " (shallow)" : " (regular)")
8209 // in deep mode we need R/W write access to be able to replay deferred ops
8210 bool read_only
= !(repair
|| depth
== FSCK_DEEP
);
8212 int r
= _open_db_and_around(read_only
);
8217 r
= _upgrade_super();
8223 r
= _open_collections();
8227 mempool_thread
.init();
8229 // we need finisher and kv_{sync,finalize}_thread *just* for replay
8230 // enable in repair or deep mode modes only
8233 r
= _deferred_replay();
8239 r
= _fsck_on_open(depth
, repair
);
8242 mempool_thread
.shutdown();
8245 _close_db_and_around(false);
8250 int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth
, bool repair
)
8254 << (repair
? " repair" : " check")
8255 << (depth
== FSCK_DEEP
? " (deep)" :
8256 depth
== FSCK_SHALLOW
? " (shallow)" : " (regular)")
8257 << " start" << dendl
;
8259 int64_t warnings
= 0;
8260 unsigned repaired
= 0;
8262 uint64_t_btree_t used_omap_head
;
8263 uint64_t_btree_t used_sbids
;
8265 mempool_dynamic_bitset used_blocks
, bluefs_used_blocks
;
8266 KeyValueDB::Iterator it
;
8267 store_statfs_t expected_store_statfs
, actual_statfs
;
8268 per_pool_statfs expected_pool_statfs
;
8270 sb_info_map_t sb_info
;
8272 uint64_t num_objects
= 0;
8273 uint64_t num_extents
= 0;
8274 uint64_t num_blobs
= 0;
8275 uint64_t num_spanning_blobs
= 0;
8276 uint64_t num_shared_blobs
= 0;
8277 uint64_t num_sharded_objects
= 0;
8278 BlueStoreRepairer repairer
;
8280 auto alloc_size
= fm
->get_alloc_size();
8282 utime_t start
= ceph_clock_now();
8284 _fsck_collections(&errors
);
8285 used_blocks
.resize(fm
->get_alloc_units());
8288 interval_set
<uint64_t> bluefs_extents
;
8290 int r
= bluefs
->get_block_extents(bluefs_layout
.shared_bdev
, &bluefs_extents
);
8291 ceph_assert(r
== 0);
8292 for (auto [start
, len
] : bluefs_extents
) {
8293 apply_for_bitset_range(start
, len
, alloc_size
, used_blocks
,
8294 [&](uint64_t pos
, mempool_dynamic_bitset
& bs
) {
8295 ceph_assert(pos
< bs
.size());
8302 bluefs_used_blocks
= used_blocks
;
8304 apply_for_bitset_range(
8305 0, std::max
<uint64_t>(min_alloc_size
, SUPER_RESERVED
), alloc_size
, used_blocks
,
8306 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
8313 repairer
.init_space_usage_tracker(
8319 int r
= bluefs
->fsck();
8327 if (!per_pool_stat_collection
) {
8329 if (cct
->_conf
->bluestore_fsck_error_on_no_per_pool_stats
) {
8336 derr
<< "fsck " << w
<< ": store not yet converted to per-pool stats"
8339 if (per_pool_omap
!= OMAP_PER_PG
) {
8341 if (cct
->_conf
->bluestore_fsck_error_on_no_per_pool_omap
) {
8348 derr
<< "fsck " << w
<< ": store not yet converted to per-pg omap"
8352 // get expected statfs; reset unaffected fields to be able to compare
8354 statfs(&actual_statfs
);
8355 actual_statfs
.total
= 0;
8356 actual_statfs
.internally_reserved
= 0;
8357 actual_statfs
.available
= 0;
8358 actual_statfs
.internal_metadata
= 0;
8359 actual_statfs
.omap_allocated
= 0;
8361 if (g_conf()->bluestore_debug_fsck_abort
) {
8362 dout(1) << __func__
<< " debug abort" << dendl
;
8367 dout(1) << __func__
<< " walking object keyspace" << dendl
;
8368 ceph::mutex sb_info_lock
= ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8369 BlueStore::FSCK_ObjectCtx
ctx(
8375 num_sharded_objects
,
8379 //no need for the below lock when in non-shallow mode as
8380 // there is no multithreading in this case
8381 depth
== FSCK_SHALLOW
? &sb_info_lock
: nullptr,
8383 expected_store_statfs
,
8384 expected_pool_statfs
,
8385 repair
? &repairer
: nullptr);
8387 _fsck_check_objects(depth
, ctx
);
8390 dout(1) << __func__
<< " checking shared_blobs" << dendl
;
8391 it
= db
->get_iterator(PREFIX_SHARED_BLOB
, KeyValueDB::ITERATOR_NOCACHE
);
8393 // FIXME minor: perhaps simplify for shallow mode?
8394 // fill global if not overriden below
8395 auto expected_statfs
= &expected_store_statfs
;
8397 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
8398 string key
= it
->key();
8400 if (get_key_shared_blob(key
, &sbid
)) {
8401 derr
<< "fsck error: bad key '" << key
8402 << "' in shared blob namespace" << dendl
;
8404 repairer
.remove_key(db
, PREFIX_SHARED_BLOB
, key
);
8409 auto p
= sb_info
.find(sbid
);
8410 if (p
== sb_info
.end()) {
8411 derr
<< "fsck error: found stray shared blob data for sbid 0x"
8412 << std::hex
<< sbid
<< std::dec
<< dendl
;
8414 repairer
.remove_key(db
, PREFIX_SHARED_BLOB
, key
);
8419 sb_info_t
& sbi
= p
->second
;
8420 bluestore_shared_blob_t
shared_blob(sbid
);
8421 bufferlist bl
= it
->value();
8422 auto blp
= bl
.cbegin();
8424 decode(shared_blob
, blp
);
8425 } catch (ceph::buffer::error
& e
) {
8427 // Force update and don't report as missing
8428 sbi
.updated
= sbi
.passed
= true;
8430 derr
<< "fsck error: failed to decode Shared Blob"
8431 << pretty_binary_string(it
->key()) << dendl
;
8433 dout(20) << __func__
<< " undecodable Shared Blob, key:'"
8434 << pretty_binary_string(it
->key())
8435 << "', removing" << dendl
;
8436 repairer
.remove_key(db
, PREFIX_DEFERRED
, it
->key());
8440 dout(20) << __func__
<< " " << *sbi
.sb
<< " " << shared_blob
<< dendl
;
8441 if (shared_blob
.ref_map
!= sbi
.ref_map
) {
8442 derr
<< "fsck error: shared blob 0x" << std::hex
<< sbid
8443 << std::dec
<< " ref_map " << shared_blob
.ref_map
8444 << " != expected " << sbi
.ref_map
<< dendl
;
8445 sbi
.updated
= true; // will update later in repair mode only!
8448 PExtentVector extents
;
8449 for (auto &r
: shared_blob
.ref_map
.ref_map
) {
8450 extents
.emplace_back(bluestore_pextent_t(r
.first
, r
.second
.length
));
8452 if (per_pool_stat_collection
|| repair
) {
8453 expected_statfs
= &expected_pool_statfs
[sbi
.pool_id
];
8455 errors
+= _fsck_check_extents(sbi
.cid
,
8456 p
->second
.oids
.front(),
8458 p
->second
.compressed
,
8460 fm
->get_alloc_size(),
8461 repair
? &repairer
: nullptr,
8469 if (repair
&& repairer
.preprocess_misreference(db
)) {
8471 dout(1) << __func__
<< " sorting out misreferenced extents" << dendl
;
8472 auto& misref_extents
= repairer
.get_misreferences();
8473 interval_set
<uint64_t> to_release
;
8474 it
= db
->get_iterator(PREFIX_OBJ
, KeyValueDB::ITERATOR_NOCACHE
);
8476 // fill global if not overriden below
8477 auto expected_statfs
= &expected_store_statfs
;
8481 KeyValueDB::Transaction txn
= repairer
.get_fix_misreferences_txn();
8482 bool bypass_rest
= false;
8483 for (it
->lower_bound(string()); it
->valid() && !bypass_rest
;
8485 dout(30) << __func__
<< " key "
8486 << pretty_binary_string(it
->key()) << dendl
;
8487 if (is_extent_shard_key(it
->key())) {
8492 int r
= get_key_object(it
->key(), &oid
);
8493 if (r
< 0 || !repairer
.is_used(oid
)) {
8498 oid
.shard_id
!= pgid
.shard
||
8499 oid
.hobj
.get_logical_pool() != (int64_t)pgid
.pool() ||
8500 !c
->contains(oid
)) {
8502 for (auto& p
: coll_map
) {
8503 if (p
.second
->contains(oid
)) {
8511 if (per_pool_stat_collection
|| repair
) {
8512 auto pool_id
= c
->cid
.is_pg(&pgid
) ? pgid
.pool() : META_POOL_ID
;
8513 expected_statfs
= &expected_pool_statfs
[pool_id
];
8516 if (!repairer
.is_used(c
->cid
)) {
8520 dout(20) << __func__
<< " check misreference for col:" << c
->cid
8521 << " obj:" << oid
<< dendl
;
8524 o
.reset(Onode::decode(c
, oid
, it
->key(), it
->value()));
8525 o
->extent_map
.fault_range(db
, 0, OBJECT_MAX_SIZE
);
8526 mempool::bluestore_fsck::set
<BlobRef
> blobs
;
8528 for (auto& e
: o
->extent_map
.extent_map
) {
8529 blobs
.insert(e
.blob
);
8531 bool need_onode_update
= false;
8532 bool first_dump
= true;
8533 for(auto b
: blobs
) {
8534 bool broken_blob
= false;
8535 auto& pextents
= b
->dirty_blob().dirty_extents();
8536 for (auto& e
: pextents
) {
8537 if (!e
.is_valid()) {
8540 // for the sake of simplicity and proper shared blob handling
8541 // always rewrite the whole blob even when it's partially
8543 if (misref_extents
.intersects(e
.offset
, e
.length
)) {
8546 _dump_onode
<10>(cct
, *o
);
8554 bool compressed
= b
->get_blob().is_compressed();
8555 need_onode_update
= true;
8556 dout(10) << __func__
8557 << " fix misreferences in oid:" << oid
8558 << " " << *b
<< dendl
;
8560 PExtentVector pext_to_release
;
8561 pext_to_release
.reserve(pextents
.size());
8562 // rewriting all valid pextents
8563 for (auto e
= pextents
.begin(); e
!= pextents
.end();
8564 b_off
+= e
->length
, e
++) {
8565 if (!e
->is_valid()) {
8570 shared_alloc
.a
->allocate(e
->length
, min_alloc_size
,
8572 if (alloc_len
< 0 || alloc_len
< (int64_t)e
->length
) {
8574 << " failed to allocate 0x" << std::hex
<< e
->length
8575 << " allocated 0x " << (alloc_len
< 0 ? 0 : alloc_len
)
8576 << " min_alloc_size 0x" << min_alloc_size
8577 << " available 0x " << shared_alloc
.a
->get_free()
8578 << std::dec
<< dendl
;
8579 if (alloc_len
> 0) {
8580 shared_alloc
.a
->release(exts
);
8585 expected_statfs
->allocated
+= e
->length
;
8587 expected_statfs
->data_compressed_allocated
+= e
->length
;
8591 IOContext
ioc(cct
, NULL
, true); // allow EIO
8592 r
= bdev
->read(e
->offset
, e
->length
, &bl
, &ioc
, false);
8594 derr
<< __func__
<< " failed to read from 0x" << std::hex
<< e
->offset
8595 <<"~" << e
->length
<< std::dec
<< dendl
;
8596 ceph_abort_msg("read failed, wtf");
8598 pext_to_release
.push_back(*e
);
8599 e
= pextents
.erase(e
);
8600 e
= pextents
.insert(e
, exts
.begin(), exts
.end());
8601 b
->get_blob().map_bl(
8603 [&](uint64_t offset
, bufferlist
& t
) {
8604 int r
= bdev
->write(offset
, t
, false);
8605 ceph_assert(r
== 0);
8607 e
+= exts
.size() - 1;
8608 for (auto& p
: exts
) {
8609 fm
->allocate(p
.offset
, p
.length
, txn
);
8611 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8613 if (b
->get_blob().is_shared()) {
8614 b
->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED
);
8616 auto sb_it
= sb_info
.find(b
->shared_blob
->get_sbid());
8617 ceph_assert(sb_it
!= sb_info
.end());
8618 sb_info_t
& sbi
= sb_it
->second
;
8620 for (auto& r
: sbi
.ref_map
.ref_map
) {
8621 expected_statfs
->allocated
-= r
.second
.length
;
8622 if (sbi
.compressed
) {
8623 // NB: it's crucial to use compressed flag from sb_info_t
8624 // as we originally used that value while accumulating
8626 expected_statfs
->data_compressed_allocated
-= r
.second
.length
;
8629 sbi
.updated
= sbi
.passed
= true;
8630 sbi
.ref_map
.clear();
8632 // relying on blob's pextents to decide what to release.
8633 for (auto& p
: pext_to_release
) {
8634 to_release
.union_insert(p
.offset
, p
.length
);
8637 for (auto& p
: pext_to_release
) {
8638 expected_statfs
->allocated
-= p
.length
;
8640 expected_statfs
->data_compressed_allocated
-= p
.length
;
8642 to_release
.union_insert(p
.offset
, p
.length
);
8648 } // for(auto b : blobs)
8649 if (need_onode_update
) {
8650 o
->extent_map
.dirty_range(0, OBJECT_MAX_SIZE
);
8651 _record_onode(o
, txn
);
8653 } // for (it->lower_bound(string()); it->valid(); it->next())
8655 for (auto it
= to_release
.begin(); it
!= to_release
.end(); ++it
) {
8656 dout(10) << __func__
<< " release 0x" << std::hex
<< it
.get_start()
8657 << "~" << it
.get_len() << std::dec
<< dendl
;
8658 fm
->release(it
.get_start(), it
.get_len(), txn
);
8660 shared_alloc
.a
->release(to_release
);
8663 } //if (repair && repairer.preprocess_misreference()) {
8665 if (depth
!= FSCK_SHALLOW
) {
8666 for (auto &p
: sb_info
) {
8667 sb_info_t
& sbi
= p
.second
;
8669 derr
<< "fsck error: missing " << *sbi
.sb
<< dendl
;
8672 if (repair
&& (!sbi
.passed
|| sbi
.updated
)) {
8673 auto sbid
= p
.first
;
8674 if (sbi
.ref_map
.empty()) {
8675 ceph_assert(sbi
.passed
);
8676 dout(20) << __func__
<< " " << *sbi
.sb
8677 << " is empty, removing" << dendl
;
8678 repairer
.fix_shared_blob(db
, sbid
, nullptr);
8681 bluestore_shared_blob_t
persistent(sbid
, std::move(sbi
.ref_map
));
8682 encode(persistent
, bl
);
8683 dout(20) << __func__
<< " " << *sbi
.sb
8684 << " is " << bl
.length() << " bytes, updating" << dendl
;
8686 repairer
.fix_shared_blob(db
, sbid
, &bl
);
8693 // check global stats only if fscking (not repairing) w/o per-pool stats
8694 if (!per_pool_stat_collection
&&
8696 !(actual_statfs
== expected_store_statfs
)) {
8697 derr
<< "fsck error: actual " << actual_statfs
8698 << " != expected " << expected_store_statfs
<< dendl
;
8700 repairer
.fix_statfs(db
, BLUESTORE_GLOBAL_STATFS_KEY
,
8701 expected_store_statfs
);
8706 dout(1) << __func__
<< " checking pool_statfs" << dendl
;
8707 _fsck_check_pool_statfs(expected_pool_statfs
,
8708 errors
, warnings
, repair
? &repairer
: nullptr);
8710 if (depth
!= FSCK_SHALLOW
) {
8711 dout(1) << __func__
<< " checking for stray omap data " << dendl
;
8712 it
= db
->get_iterator(PREFIX_OMAP
, KeyValueDB::ITERATOR_NOCACHE
);
8714 uint64_t last_omap_head
= 0;
8715 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
8718 _key_decode_u64(it
->key().c_str(), &omap_head
);
8720 if (used_omap_head
.count(omap_head
) == 0 &&
8721 omap_head
!= last_omap_head
) {
8722 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
8723 << "fsck error: found stray omap data on omap_head "
8724 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
) << fsck_dendl
;
8726 last_omap_head
= omap_head
;
8730 it
= db
->get_iterator(PREFIX_PGMETA_OMAP
, KeyValueDB::ITERATOR_NOCACHE
);
8732 uint64_t last_omap_head
= 0;
8733 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
8735 _key_decode_u64(it
->key().c_str(), &omap_head
);
8736 if (used_omap_head
.count(omap_head
) == 0 &&
8737 omap_head
!= last_omap_head
) {
8738 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
8739 << "fsck error: found stray (pgmeta) omap data on omap_head "
8740 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
) << fsck_dendl
;
8741 last_omap_head
= omap_head
;
8746 it
= db
->get_iterator(PREFIX_PERPOOL_OMAP
, KeyValueDB::ITERATOR_NOCACHE
);
8748 uint64_t last_omap_head
= 0;
8749 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
8752 string k
= it
->key();
8753 const char *c
= k
.c_str();
8754 c
= _key_decode_u64(c
, &pool
);
8755 c
= _key_decode_u64(c
, &omap_head
);
8756 if (used_omap_head
.count(omap_head
) == 0 &&
8757 omap_head
!= last_omap_head
) {
8758 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
8759 << "fsck error: found stray (per-pool) omap data on omap_head "
8760 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
) << fsck_dendl
;
8762 last_omap_head
= omap_head
;
8766 it
= db
->get_iterator(PREFIX_PERPG_OMAP
, KeyValueDB::ITERATOR_NOCACHE
);
8768 uint64_t last_omap_head
= 0;
8769 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
8773 string k
= it
->key();
8774 const char* c
= k
.c_str();
8775 c
= _key_decode_u64(c
, &pool
);
8776 c
= _key_decode_u32(c
, &hash
);
8777 c
= _key_decode_u64(c
, &omap_head
);
8778 if (used_omap_head
.count(omap_head
) == 0 &&
8779 omap_head
!= last_omap_head
) {
8780 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
8781 << "fsck error: found stray (per-pg) omap data on omap_head "
8782 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
) << fsck_dendl
;
8784 last_omap_head
= omap_head
;
8788 dout(1) << __func__
<< " checking deferred events" << dendl
;
8789 it
= db
->get_iterator(PREFIX_DEFERRED
, KeyValueDB::ITERATOR_NOCACHE
);
8791 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
8792 bufferlist bl
= it
->value();
8793 auto p
= bl
.cbegin();
8794 bluestore_deferred_transaction_t wt
;
8797 } catch (ceph::buffer::error
& e
) {
8798 derr
<< "fsck error: failed to decode deferred txn "
8799 << pretty_binary_string(it
->key()) << dendl
;
8801 dout(20) << __func__
<< " undecodable deferred TXN record, key: '"
8802 << pretty_binary_string(it
->key())
8803 << "', removing" << dendl
;
8804 repairer
.remove_key(db
, PREFIX_DEFERRED
, it
->key());
8808 dout(20) << __func__
<< " deferred " << wt
.seq
8809 << " ops " << wt
.ops
.size()
8810 << " released 0x" << std::hex
<< wt
.released
<< std::dec
<< dendl
;
8811 for (auto e
= wt
.released
.begin(); e
!= wt
.released
.end(); ++e
) {
8812 apply_for_bitset_range(
8813 e
.get_start(), e
.get_len(), alloc_size
, used_blocks
,
8814 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
8822 dout(1) << __func__
<< " checking freelist vs allocated" << dendl
;
8824 fm
->enumerate_reset();
8825 uint64_t offset
, length
;
8826 while (fm
->enumerate_next(db
, &offset
, &length
)) {
8827 bool intersects
= false;
8828 apply_for_bitset_range(
8829 offset
, length
, alloc_size
, used_blocks
,
8830 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
8831 ceph_assert(pos
< bs
.size());
8832 if (bs
.test(pos
) && !bluefs_used_blocks
.test(pos
)) {
8833 if (offset
== SUPER_RESERVED
&&
8834 length
== min_alloc_size
- SUPER_RESERVED
) {
8835 // this is due to the change just after luminous to min_alloc_size
8836 // granularity allocations, and our baked in assumption at the top
8837 // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
8838 // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
8839 // since we will never allocate this region below min_alloc_size.
8840 dout(10) << __func__
<< " ignoring free extent between SUPER_RESERVED"
8841 << " and min_alloc_size, 0x" << std::hex
<< offset
<< "~"
8842 << length
<< std::dec
<< dendl
;
8846 repairer
.fix_false_free(db
, fm
,
8847 pos
* min_alloc_size
,
8857 derr
<< "fsck error: free extent 0x" << std::hex
<< offset
8858 << "~" << length
<< std::dec
8859 << " intersects allocated blocks" << dendl
;
8863 fm
->enumerate_reset();
8864 size_t count
= used_blocks
.count();
8865 if (used_blocks
.size() != count
) {
8866 ceph_assert(used_blocks
.size() > count
);
8868 size_t start
= used_blocks
.find_first();
8869 while (start
!= decltype(used_blocks
)::npos
) {
8872 size_t next
= used_blocks
.find_next(cur
);
8873 if (next
!= cur
+ 1) {
8875 derr
<< "fsck error: leaked extent 0x" << std::hex
8876 << ((uint64_t)start
* fm
->get_alloc_size()) << "~"
8877 << ((cur
+ 1 - start
) * fm
->get_alloc_size()) << std::dec
8880 repairer
.fix_leaked(db
,
8882 start
* min_alloc_size
,
8883 (cur
+ 1 - start
) * min_alloc_size
);
8896 if (per_pool_omap
!= OMAP_PER_PG
) {
8897 dout(5) << __func__
<< " fixing per_pg_omap" << dendl
;
8898 repairer
.fix_per_pool_omap(db
, OMAP_PER_PG
);
8901 dout(5) << __func__
<< " applying repair results" << dendl
;
8902 repaired
= repairer
.apply(db
);
8903 dout(5) << __func__
<< " repair applied" << dendl
;
8907 dout(2) << __func__
<< " " << num_objects
<< " objects, "
8908 << num_sharded_objects
<< " of them sharded. "
8910 dout(2) << __func__
<< " " << num_extents
<< " extents to "
8911 << num_blobs
<< " blobs, "
8912 << num_spanning_blobs
<< " spanning, "
8913 << num_shared_blobs
<< " shared."
8916 utime_t duration
= ceph_clock_now() - start
;
8917 dout(1) << __func__
<< " <<<FINISH>>> with " << errors
<< " errors, "
8918 << warnings
<< " warnings, "
8919 << repaired
<< " repaired, "
8920 << (errors
+ warnings
- (int)repaired
) << " remaining in "
8921 << duration
<< " seconds" << dendl
;
8923 // In non-repair mode we should return error count only as
8924 // it indicates if store status is OK.
8925 // In repair mode both errors and warnings are taken into account
8926 // since repaired counter relates to them both.
8927 return repair
? errors
+ warnings
- (int)repaired
: errors
;
8930 /// methods to inject various errors fsck can repair
8931 void BlueStore::inject_broken_shared_blob_key(const string
& key
,
8932 const bufferlist
& bl
)
8934 KeyValueDB::Transaction txn
;
8935 txn
= db
->get_transaction();
8936 txn
->set(PREFIX_SHARED_BLOB
, key
, bl
);
8937 db
->submit_transaction_sync(txn
);
// Test/debug hook: deliberately leak `len` bytes of space — allocate from
// the shared allocator and mark the extents used in the freelist manager
// without attaching them to any object, then commit synchronously. fsck
// should later report the range as leaked.
// NOTE(review): some original source lines are elided in this extraction
// (e.g. the declaration of `exts`, presumably a PExtentVector — confirm
// against the upstream file).
8940 void BlueStore::inject_leaked(uint64_t len
)
8942 KeyValueDB::Transaction txn
;
8943 txn
= db
->get_transaction();
// Allocate `len` bytes at min_alloc_size granularity (hint span
// min_alloc_size * 256) into `exts`.
8946 int64_t alloc_len
= shared_alloc
.a
->allocate(len
, min_alloc_size
,
8947 min_alloc_size
* 256, 0, &exts
);
8948 ceph_assert(alloc_len
>= (int64_t)len
);
// Record each allocated physical extent as used in the freelist.
8949 for (auto& p
: exts
) {
8950 fm
->allocate(p
.offset
, p
.length
, txn
);
8952 db
->submit_transaction_sync(txn
);
// Test/debug hook: create "false free" freelist entries by releasing the
// physical extents of an existing object back to the freelist while the
// object's metadata still references them; fsck should detect (and be
// able to repair) the inconsistency.
// NOTE(review): several original lines are elided in this extraction
// (e.g. the OnodeRef `o` declaration and the code that sets `injected`
// and advances `p`) — confirm against the upstream file.
8955 void BlueStore::inject_false_free(coll_t cid
, ghobject_t oid
)
8957 KeyValueDB::Transaction txn
;
8959 CollectionRef c
= _get_collection(cid
);
8962 std::unique_lock l
{c
->lock
}; // just to avoid internal asserts
8963 o
= c
->get_onode(oid
, false);
// Load the full extent map so the blobs below are faulted in.
8965 o
->extent_map
.fault_range(db
, 0, OBJECT_MAX_SIZE
);
8968 bool injected
= false;
8969 txn
= db
->get_transaction();
8970 auto& em
= o
->extent_map
.extent_map
;
// Collect pextent vectors of the blob(s) whose space will be released.
8971 std::vector
<const PExtentVector
*> v
;
8973 v
.push_back(&em
.begin()->blob
->get_blob().get_extents());
8975 if (em
.size() > 1) {
8978 v
.push_back(&(it
->blob
->get_blob().get_extents()));
8980 for (auto pext
: v
) {
8982 auto p
= pext
->begin();
8983 while (p
!= pext
->end()) {
8984 if (p
->is_valid()) {
8985 dout(20) << __func__
<< " release 0x" << std::hex
<< p
->offset
8986 << "~" << p
->length
<< std::dec
<< dendl
;
// Release the still-referenced extent -> false-free inconsistency.
8987 fm
->release(p
->offset
, p
->length
, txn
);
8995 ceph_assert(injected
);
8996 db
->submit_transaction_sync(txn
);
8999 void BlueStore::inject_legacy_omap()
9001 dout(1) << __func__
<< dendl
;
9002 per_pool_omap
= OMAP_BULK
;
9003 KeyValueDB::Transaction txn
;
9004 txn
= db
->get_transaction();
9005 txn
->rmkey(PREFIX_SUPER
, "per_pool_omap");
9006 db
->submit_transaction_sync(txn
);
// Test/debug hook: strip the per-PG / per-pool / pgmeta omap flags from a
// single onode so its omap reverts to the legacy (bulk) layout, then
// persist the onode synchronously.
// NOTE(review): a few original lines (braces, the OnodeRef `o`
// declaration) are elided in this extraction.
9009 void BlueStore::inject_legacy_omap(coll_t cid
, ghobject_t oid
)
9011 dout(1) << __func__
<< " "
9012 << cid
<< " " << oid
9014 KeyValueDB::Transaction txn
;
9016 CollectionRef c
= _get_collection(cid
);
9019 std::unique_lock l
{ c
->lock
}; // just to avoid internal asserts
9020 o
= c
->get_onode(oid
, false);
// Clear all omap-layout flags; absence of these flags == legacy omap.
9023 o
->onode
.clear_flag(
9024 bluestore_onode_t::FLAG_PERPG_OMAP
|
9025 bluestore_onode_t::FLAG_PERPOOL_OMAP
|
9026 bluestore_onode_t::FLAG_PGMETA_OMAP
);
9027 txn
= db
->get_transaction();
9028 _record_onode(o
, txn
);
9029 db
->submit_transaction_sync(txn
);
// Test/debug hook: overwrite the stored statfs record for `key` with
// `new_statfs` via BlueStoreRepairer.
// NOTE(review): the trailing lines (presumably repairer.apply(db) and the
// closing brace) are elided in this extraction — confirm upstream.
9033 void BlueStore::inject_statfs(const string
& key
, const store_statfs_t
& new_statfs
)
9035 BlueStoreRepairer repairer
;
9036 repairer
.fix_statfs(db
, key
, new_statfs
);
// Test/debug hook: replace the global statfs record under the STAT
// prefix with `new_statfs`, committed synchronously.
// NOTE(review): the lines declaring `bl` and encoding `new_statfs` into
// it are elided in this extraction.
9040 void BlueStore::inject_global_statfs(const store_statfs_t
& new_statfs
)
9042 KeyValueDB::Transaction t
= db
->get_transaction();
9047 t
->set(PREFIX_STAT
, BLUESTORE_GLOBAL_STATFS_KEY
, bl
);
9048 db
->submit_transaction_sync(t
);
// Test/debug hook: make two objects (cid1/oid1 and cid2/oid2) reference
// the same physical space by copying e1's blob content into e2's blob,
// producing misreference (and consequent space-leak) errors for fsck.
// Both onodes must have a trivial, matching layout — asserted below.
// NOTE(review): the OnodeRef o1/o2 declarations and the `offset`
// parameter declaration are elided in this extraction.
9051 void BlueStore::inject_misreference(coll_t cid1
, ghobject_t oid1
,
9052 coll_t cid2
, ghobject_t oid2
,
9056 CollectionRef c1
= _get_collection(cid1
);
9059 std::unique_lock l
{c1
->lock
}; // just to avoid internal asserts
9060 o1
= c1
->get_onode(oid1
, false);
9062 o1
->extent_map
.fault_range(db
, offset
, OBJECT_MAX_SIZE
);
9065 CollectionRef c2
= _get_collection(cid2
);
9068 std::unique_lock l
{c2
->lock
}; // just to avoid internal asserts
9069 o2
= c2
->get_onode(oid2
, false);
9071 o2
->extent_map
.fault_range(db
, offset
, OBJECT_MAX_SIZE
);
// Locate the logical extent covering `offset` in each object.
9073 Extent
& e1
= *(o1
->extent_map
.seek_lextent(offset
));
9074 Extent
& e2
= *(o2
->extent_map
.seek_lextent(offset
));
9076 // require onode/extent layout to be the same (and simple)
9077 // to make things easier
9078 ceph_assert(o1
->onode
.extent_map_shards
.empty());
9079 ceph_assert(o2
->onode
.extent_map_shards
.empty());
9080 ceph_assert(o1
->extent_map
.spanning_blob_map
.size() == 0);
9081 ceph_assert(o2
->extent_map
.spanning_blob_map
.size() == 0);
9082 ceph_assert(e1
.logical_offset
== e2
.logical_offset
);
9083 ceph_assert(e1
.length
== e2
.length
);
9084 ceph_assert(e1
.blob_offset
== e2
.blob_offset
);
9086 KeyValueDB::Transaction txn
;
9087 txn
= db
->get_transaction();
9089 // along with misreference error this will create space leaks errors
9090 e2
.blob
->dirty_blob() = e1
.blob
->get_blob();
9091 o2
->extent_map
.dirty_range(offset
, e2
.length
);
9092 o2
->extent_map
.update(txn
, false);
9094 _record_onode(o2
, txn
);
9095 db
->submit_transaction_sync(txn
);
// Test/debug hook: insert a fresh, unreferenced ("zombie") blob into the
// onode's spanning blob map under `blob_id` and persist the onode; fsck
// should detect the dangling spanning blob.
// NOTE(review): the `blob_id` parameter declaration and the OnodeRef `o`
// declaration are elided in this extraction.
9098 void BlueStore::inject_zombie_spanning_blob(coll_t cid
, ghobject_t oid
,
9102 CollectionRef c
= _get_collection(cid
);
9105 std::unique_lock l
{ c
->lock
}; // just to avoid internal asserts
9106 o
= c
->get_onode(oid
, false);
9108 o
->extent_map
.fault_range(db
, 0, OBJECT_MAX_SIZE
);
// New blob that no logical extent references.
9111 BlobRef b
= c
->new_blob();
9113 o
->extent_map
.spanning_blob_map
[blob_id
] = b
;
9115 KeyValueDB::Transaction txn
;
9116 txn
= db
->get_transaction();
9118 _record_onode(o
, txn
);
9119 db
->submit_transaction_sync(txn
);
// Populate *pm with bluestore metadata: main block device info, bluefs
// layout flags (single-shared-device kept for backward compatibility),
// bluefs device info, and the NUMA placement of the underlying devices.
// NOTE(review): the if/else structure around the bluefs section and the
// node/nodes/failed declarations are partially elided in this extraction.
9122 void BlueStore::collect_metadata(map
<string
,string
> *pm
)
9124 dout(10) << __func__
<< dendl
;
9125 bdev
->collect_metadata("bluestore_bdev_", pm
);
9127 (*pm
)["bluefs"] = "1";
9128 // this value is for backward compatibility only
9129 (*pm
)["bluefs_single_shared_device"] = \
9130 stringify((int)bluefs_layout
.single_shared_device());
9131 (*pm
)["bluefs_dedicated_db"] = \
9132 stringify((int)bluefs_layout
.dedicated_db
);
9133 (*pm
)["bluefs_dedicated_wal"] = \
9134 stringify((int)bluefs_layout
.dedicated_wal
);
9135 bluefs
->collect_metadata(pm
, bluefs_layout
.shared_bdev
);
9137 (*pm
)["bluefs"] = "0";
9140 // report numa mapping for underlying devices
9144 int r
= get_numa_node(&node
, &nodes
, &failed
);
9146 if (!failed
.empty()) {
9147 (*pm
)["objectstore_numa_unknown_devices"] = stringify(failed
);
9149 if (!nodes
.empty()) {
9150 dout(1) << __func__
<< " devices span numa nodes " << nodes
<< dendl
;
9151 (*pm
)["objectstore_numa_nodes"] = stringify(nodes
);
9154 (*pm
)["objectstore_numa_node"] = stringify(node
);
// Determine the NUMA node(s) spanned by the store's underlying devices.
// Devices whose node cannot be detected are accumulated into *out_failed;
// *out_nodes receives the set of detected nodes.
// NOTE(review): the *out_node parameter and the node/nodes/failed local
// declarations, plus the error-branch structure, are elided in this
// extraction — confirm against the upstream file.
9159 int BlueStore::get_numa_node(
9161 set
<int> *out_nodes
,
9162 set
<string
> *out_failed
)
9165 set
<string
> devices
;
9166 get_devices(&devices
);
// Query each device's NUMA node via BlkDev.
9169 for (auto& devname
: devices
) {
9171 BlkDev
bdev(devname
);
9172 int r
= bdev
.get_numa_node(&n
);
9174 dout(10) << __func__
<< " bdev " << devname
<< " can't detect numa_node"
9176 failed
.insert(devname
);
9179 dout(10) << __func__
<< " bdev " << devname
<< " on numa_node " << n
// A single unambiguous node (and no failures) is the success case.
9186 if (node
>= 0 && nodes
.size() == 1 && failed
.empty()) {
9193 *out_failed
= failed
;
// Collect the kernel device names backing this store into *ls. When the
// store is already opened, query bdev/bluefs directly; otherwise
// temporarily open path, fsid, bdev and a minimal bluefs to enumerate
// the devices, then tear everything back down.
// NOTE(review): the guard conditions and error-handling/cleanup paths
// between these calls are elided in this extraction.
9198 int BlueStore::get_devices(set
<string
> *ls
)
9201 bdev
->get_devices(ls
);
9203 bluefs
->get_devices(ls
);
9208 // grumble, we haven't started up yet.
9209 int r
= _open_path();
9212 r
= _open_fsid(false);
9215 r
= _read_fsid(&fsid
);
9221 r
= _open_bdev(false);
9224 r
= _minimal_open_bluefs(false);
9227 bdev
->get_devices(ls
);
9229 bluefs
->get_devices(ls
);
9232 _minimal_close_bluefs();
// Fill *buf with store-wide statfs: omap usage estimated from the
// rocksdb prefix matching the active omap mode, allocator free space,
// dedicated-DB bluefs totals, and (for thin-provisioned devices) totals
// clamped by the underlying physical device.
// NOTE(review): several lines (initial zeroing of *buf, ternary
// alternatives, branch braces) are elided in this extraction.
9243 void BlueStore::_get_statfs_overall(struct store_statfs_t
*buf
)
// Pick the omap column-family prefix matching the active omap layout.
9247 auto prefix
= per_pool_omap
== OMAP_BULK
?
9249 per_pool_omap
== OMAP_PER_POOL
?
9250 PREFIX_PERPOOL_OMAP
:
9252 buf
->omap_allocated
=
9253 db
->estimate_prefix_size(prefix
, string());
9255 uint64_t bfree
= shared_alloc
.a
->get_free();
9258 buf
->internally_reserved
= 0;
9259 // include dedicated db, too, if that isn't the shared device.
9260 if (bluefs_layout
.shared_bdev
!= BlueFS::BDEV_DB
) {
9261 buf
->total
+= bluefs
->get_total(BlueFS::BDEV_DB
);
9263 // call any non-omap bluefs space "internal metadata"
9264 buf
->internal_metadata
=
9266 - buf
->omap_allocated
;
9269 uint64_t thin_total
, thin_avail
;
9270 if (bdev
->get_thin_utilization(&thin_total
, &thin_avail
)) {
9271 buf
->total
+= thin_total
;
9273 // we are limited by both the size of the virtual device and the
9274 // underlying physical device.
9275 bfree
= std::min(bfree
, thin_avail
);
9277 buf
->allocated
= thin_total
- thin_avail
;
9279 buf
->total
+= bdev
->get_size();
9281 buf
->available
= bfree
;
// Public statfs entry point: emit health alerts (when requested), fill
// overall device/omap stats via _get_statfs_overall, then overlay the
// exact in-memory volatile counters under vstatfs_lock.
// NOTE(review): the alerts null-check, opening brace and return are
// elided in this extraction.
9284 int BlueStore::statfs(struct store_statfs_t
*buf
,
9285 osd_alert_list_t
* alerts
)
9289 _log_alerts(*alerts
);
9291 _get_statfs_overall(buf
);
// Volatile (in-memory) statfs counters are authoritative for these.
9293 std::lock_guard
l(vstatfs_lock
);
9294 buf
->allocated
= vstatfs
.allocated();
9295 buf
->data_stored
= vstatfs
.stored();
9296 buf
->data_compressed
= vstatfs
.compressed();
9297 buf
->data_compressed_original
= vstatfs
.compressed_original();
9298 buf
->data_compressed_allocated
= vstatfs
.compressed_allocated();
9301 dout(20) << __func__
<< " " << *buf
<< dendl
;
// Per-pool statfs: unsupported in legacy (non-per-pool) accounting mode;
// otherwise publishes the in-memory per-pool counters and, when per-pool
// omap is enabled, estimates the pool's omap usage from its key prefix.
// NOTE(review): the early return for legacy mode, the key_prefix
// declaration, ternary alternative and final return are elided in this
// extraction.
9305 int BlueStore::pool_statfs(uint64_t pool_id
, struct store_statfs_t
*buf
,
9306 bool *out_per_pool_omap
)
9308 dout(20) << __func__
<< " pool " << pool_id
<< dendl
;
9310 if (!per_pool_stat_collection
) {
9311 dout(20) << __func__
<< " not supported in legacy mode " << dendl
;
9317 std::lock_guard
l(vstatfs_lock
);
9318 osd_pools
[pool_id
].publish(buf
);
// Build the rocksdb key prefix for this pool to size its omap.
9322 _key_encode_u64(pool_id
, &key_prefix
);
9323 *out_per_pool_omap
= per_pool_omap
!= OMAP_BULK
;
9324 if (*out_per_pool_omap
) {
9325 auto prefix
= per_pool_omap
== OMAP_PER_POOL
?
9326 PREFIX_PERPOOL_OMAP
:
9328 buf
->omap_allocated
= db
->estimate_prefix_size(prefix
, key_prefix
);
9331 dout(10) << __func__
<< *buf
<< dendl
;
9335 void BlueStore::_check_legacy_statfs_alert()
9338 if (!per_pool_stat_collection
&&
9339 cct
->_conf
->bluestore_warn_on_legacy_statfs
) {
9340 s
= "legacy statfs reporting detected, "
9341 "suggest to run store repair to get consistent statistic reports";
9343 std::lock_guard
l(qlock
);
9344 legacy_statfs_alert
= s
;
9347 void BlueStore::_check_no_per_pg_or_pool_omap_alert()
9349 string per_pg
, per_pool
;
9350 if (per_pool_omap
!= OMAP_PER_PG
) {
9351 if (cct
->_conf
->bluestore_warn_on_no_per_pg_omap
) {
9352 per_pg
= "legacy (not per-pg) omap detected, "
9353 "suggest to run store repair to benefit from faster PG removal";
9355 if (per_pool_omap
!= OMAP_PER_POOL
) {
9356 if (cct
->_conf
->bluestore_warn_on_no_per_pool_omap
) {
9357 per_pool
= "legacy (not per-pool) omap detected, "
9358 "suggest to run store repair to benefit from per-pool omap usage statistics";
9362 std::lock_guard
l(qlock
);
9363 no_per_pg_omap_alert
= per_pg
;
9364 no_per_pool_omap_alert
= per_pool
;
9370 BlueStore::CollectionRef
BlueStore::_get_collection(const coll_t
& cid
)
9372 std::shared_lock
l(coll_lock
);
9373 ceph::unordered_map
<coll_t
,CollectionRef
>::iterator cp
= coll_map
.find(cid
);
9374 if (cp
== coll_map
.end())
9375 return CollectionRef();
9379 void BlueStore::_queue_reap_collection(CollectionRef
& c
)
9381 dout(10) << __func__
<< " " << c
<< " " << c
->cid
<< dendl
;
9382 // _reap_collections and this in the same thread,
9383 // so no need a lock.
9384 removed_collections
.push_back(c
);
// Reap collections queued by _queue_reap_collection: for each removed
// collection whose onodes have all finished flushing, clear its onode
// map and drop it; collections with still-flushing onodes are re-queued
// (spliced back) for a later pass.
// NOTE(review): the map_any result handling and loop braces are partly
// elided in this extraction.
9387 void BlueStore::_reap_collections()
9390 list
<CollectionRef
> removed_colls
;
9392 // _queue_reap_collection and this in the same thread.
9393 // So no need a lock.
9394 if (!removed_collections
.empty())
9395 removed_colls
.swap(removed_collections
);
9400 list
<CollectionRef
>::iterator p
= removed_colls
.begin();
9401 while (p
!= removed_colls
.end()) {
9402 CollectionRef c
= *p
;
9403 dout(10) << __func__
<< " " << c
<< " " << c
->cid
<< dendl
;
// map_any: probe whether any onode in this collection is still flushing.
9404 if (c
->onode_map
.map_any([&](Onode
* o
) {
9405 ceph_assert(!o
->exists
);
9406 if (o
->flushing_count
.load()) {
9407 dout(10) << __func__
<< " " << c
<< " " << c
->cid
<< " " << o
->oid
9408 << " flush_txns " << o
->flushing_count
<< dendl
;
9416 c
->onode_map
.clear();
9417 p
= removed_colls
.erase(p
);
9418 dout(10) << __func__
<< " " << c
<< " " << c
->cid
<< " done" << dendl
;
9420 if (removed_colls
.empty()) {
9421 dout(10) << __func__
<< " all reaped" << dendl
;
// Anything not reaped goes back onto the queue for the next pass.
9423 removed_collections
.splice(removed_collections
.begin(), removed_colls
9427 void BlueStore::_update_cache_logger()
9429 uint64_t num_onodes
= 0;
9430 uint64_t num_pinned_onodes
= 0;
9431 uint64_t num_extents
= 0;
9432 uint64_t num_blobs
= 0;
9433 uint64_t num_buffers
= 0;
9434 uint64_t num_buffer_bytes
= 0;
9435 for (auto c
: onode_cache_shards
) {
9436 c
->add_stats(&num_onodes
, &num_pinned_onodes
);
9438 for (auto c
: buffer_cache_shards
) {
9439 c
->add_stats(&num_extents
, &num_blobs
,
9440 &num_buffers
, &num_buffer_bytes
);
9442 logger
->set(l_bluestore_onodes
, num_onodes
);
9443 logger
->set(l_bluestore_pinned_onodes
, num_pinned_onodes
);
9444 logger
->set(l_bluestore_extents
, num_extents
);
9445 logger
->set(l_bluestore_blobs
, num_blobs
);
9446 logger
->set(l_bluestore_buffers
, num_buffers
);
9447 logger
->set(l_bluestore_buffer_bytes
, num_buffer_bytes
);
9453 ObjectStore::CollectionHandle
BlueStore::open_collection(const coll_t
& cid
)
9455 return _get_collection(cid
);
// Create the in-memory Collection object for a collection that has not
// yet been committed; it is registered in new_coll_map (keyed by cid) and
// attached to an opsequencer. Presumably promoted into coll_map when the
// creating transaction commits — confirm against the caller.
// NOTE(review): the cid parameter declaration and the return statement
// are elided in this extraction.
9458 ObjectStore::CollectionHandle
BlueStore::create_new_collection(
9461 std::unique_lock l
{coll_lock
};
// Shard caches are selected by hashing the collection id.
9462 auto c
= ceph::make_ref
<Collection
>(
9464 onode_cache_shards
[cid
.hash_to_shard(onode_cache_shards
.size())],
9465 buffer_cache_shards
[cid
.hash_to_shard(buffer_cache_shards
.size())],
9467 new_coll_map
[cid
] = c
;
9468 _osr_attach(c
.get());
// Attach a commit ContextQueue to a collection, whether it is already
// open (coll_map) or still pending creation (new_coll_map).
// NOTE(review): the cid parameter declaration and the surrounding guard
// (and closing braces) are elided in this extraction.
9472 void BlueStore::set_collection_commit_queue(
9474 ContextQueue
*commit_queue
)
9477 std::shared_lock
l(coll_lock
);
9478 if (coll_map
.count(cid
)) {
9479 coll_map
[cid
]->commit_queue
= commit_queue
;
9480 } else if (new_coll_map
.count(cid
)) {
9481 new_coll_map
[cid
]->commit_queue
= commit_queue
;
// Whether object `oid` exists in collection `c_`: fetch the onode under
// the collection's shared lock and test its exists flag.
// NOTE(review): the return statements and collection-exists guard are
// elided in this extraction.
9487 bool BlueStore::exists(CollectionHandle
&c_
, const ghobject_t
& oid
)
9489 Collection
*c
= static_cast<Collection
*>(c_
.get());
9490 dout(10) << __func__
<< " " << c
->cid
<< " " << oid
<< dendl
;
9497 std::shared_lock
l(c
->lock
);
9498 OnodeRef o
= c
->get_onode(oid
, false);
9499 if (!o
|| !o
->exists
)
// stat(2)-like metadata for an object: size from the onode, a fixed
// 4096-byte st_blksize, and st_blocks computed by rounding size up to
// whole blocks. Includes a debug EIO injection hook for metadata reads.
// NOTE(review): the struct stat* parameter declaration, early returns
// and the final return are elided in this extraction.
9506 int BlueStore::stat(
9507 CollectionHandle
&c_
,
9508 const ghobject_t
& oid
,
9512 Collection
*c
= static_cast<Collection
*>(c_
.get());
9515 dout(10) << __func__
<< " " << c
->get_cid() << " " << oid
<< dendl
;
9518 std::shared_lock
l(c
->lock
);
9519 OnodeRef o
= c
->get_onode(oid
, false);
9520 if (!o
|| !o
->exists
)
9522 st
->st_size
= o
->onode
.size
;
9523 st
->st_blksize
= 4096;
// Round size up to whole st_blksize blocks.
9524 st
->st_blocks
= (st
->st_size
+ st
->st_blksize
- 1) / st
->st_blksize
;
// Debug hook: simulate a metadata read error for this oid.
9529 if (_debug_mdata_eio(oid
)) {
9531 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
// Install new pool options on a collection under its exclusive lock.
// NOTE(review): the existence guard and return statement are elided in
// this extraction.
9535 int BlueStore::set_collection_opts(
9536 CollectionHandle
& ch
,
9537 const pool_opts_t
& opts
)
9539 Collection
*c
= static_cast<Collection
*>(ch
.get());
9540 dout(15) << __func__
<< " " << ch
->cid
<< " options " << opts
<< dendl
;
9543 std::unique_lock l
{c
->lock
};
9544 c
->pool_opts
= opts
9548 int BlueStore::read(
9549 CollectionHandle
&c_
,
9550 const ghobject_t
& oid
,
9556 auto start
= mono_clock::now();
9557 Collection
*c
= static_cast<Collection
*>(c_
.get());
9558 const coll_t
&cid
= c
->get_cid();
9559 dout(15) << __func__
<< " " << cid
<< " " << oid
9560 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
9568 std::shared_lock
l(c
->lock
);
9569 auto start1
= mono_clock::now();
9570 OnodeRef o
= c
->get_onode(oid
, false);
9571 log_latency("get_onode@read",
9572 l_bluestore_read_onode_meta_lat
,
9573 mono_clock::now() - start1
,
9574 cct
->_conf
->bluestore_log_op_age
);
9575 if (!o
|| !o
->exists
) {
9580 if (offset
== length
&& offset
== 0)
9581 length
= o
->onode
.size
;
9583 r
= _do_read(c
, o
, offset
, length
, bl
, op_flags
);
9585 logger
->inc(l_bluestore_read_eio
);
9590 if (r
>= 0 && _debug_data_eio(oid
)) {
9592 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
9593 } else if (oid
.hobj
.pool
> 0 && /* FIXME, see #23029 */
9594 cct
->_conf
->bluestore_debug_random_read_err
&&
9595 (rand() % (int)(cct
->_conf
->bluestore_debug_random_read_err
*
9597 dout(0) << __func__
<< ": inject random EIO" << dendl
;
9600 dout(10) << __func__
<< " " << cid
<< " " << oid
9601 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
9602 << " = " << r
<< dendl
;
9603 log_latency(__func__
,
9604 l_bluestore_read_lat
,
9605 mono_clock::now() - start
,
9606 cct
->_conf
->bluestore_log_op_age
);
9610 void BlueStore::_read_cache(
9614 int read_cache_policy
,
9615 ready_regions_t
& ready_regions
,
9616 blobs2read_t
& blobs2read
)
9618 // build blob-wise list to of stuff read (that isn't cached)
9619 unsigned left
= length
;
9620 uint64_t pos
= offset
;
9621 auto lp
= o
->extent_map
.seek_lextent(offset
);
9622 while (left
> 0 && lp
!= o
->extent_map
.extent_map
.end()) {
9623 if (pos
< lp
->logical_offset
) {
9624 unsigned hole
= lp
->logical_offset
- pos
;
9628 dout(30) << __func__
<< " hole 0x" << std::hex
<< pos
<< "~" << hole
9629 << std::dec
<< dendl
;
9633 BlobRef
& bptr
= lp
->blob
;
9634 unsigned l_off
= pos
- lp
->logical_offset
;
9635 unsigned b_off
= l_off
+ lp
->blob_offset
;
9636 unsigned b_len
= std::min(left
, lp
->length
- l_off
);
9638 ready_regions_t cache_res
;
9639 interval_set
<uint32_t> cache_interval
;
9640 bptr
->shared_blob
->bc
.read(
9641 bptr
->shared_blob
->get_cache(), b_off
, b_len
, cache_res
, cache_interval
,
9643 dout(20) << __func__
<< " blob " << *bptr
<< std::hex
9644 << " need 0x" << b_off
<< "~" << b_len
9645 << " cache has 0x" << cache_interval
9646 << std::dec
<< dendl
;
9648 auto pc
= cache_res
.begin();
9649 uint64_t chunk_size
= bptr
->get_blob().get_chunk_size(block_size
);
9652 if (pc
!= cache_res
.end() &&
9653 pc
->first
== b_off
) {
9654 l
= pc
->second
.length();
9655 ready_regions
[pos
] = std::move(pc
->second
);
9656 dout(30) << __func__
<< " use cache 0x" << std::hex
<< pos
<< ": 0x"
9657 << b_off
<< "~" << l
<< std::dec
<< dendl
;
9661 if (pc
!= cache_res
.end()) {
9662 ceph_assert(pc
->first
> b_off
);
9663 l
= pc
->first
- b_off
;
9665 dout(30) << __func__
<< " will read 0x" << std::hex
<< pos
<< ": 0x"
9666 << b_off
<< "~" << l
<< std::dec
<< dendl
;
9669 uint64_t r_off
= b_off
;
9671 uint64_t front
= r_off
% chunk_size
;
9676 unsigned tail
= r_len
% chunk_size
;
9678 r_len
+= chunk_size
- tail
;
9680 bool merged
= false;
9681 regions2read_t
& r2r
= blobs2read
[bptr
];
9683 read_req_t
& pre
= r2r
.back();
9684 if (r_off
<= (pre
.r_off
+ pre
.r_len
)) {
9685 front
+= (r_off
- pre
.r_off
);
9686 pre
.r_len
+= (r_off
+ r_len
- pre
.r_off
- pre
.r_len
);
9687 pre
.regs
.emplace_back(region_t(pos
, b_off
, l
, front
));
9692 read_req_t
req(r_off
, r_len
);
9693 req
.regs
.emplace_back(region_t(pos
, b_off
, l
, front
));
9694 r2r
.emplace_back(std::move(req
));
9707 int BlueStore::_prepare_read_ioc(
9708 blobs2read_t
& blobs2read
,
9709 vector
<bufferlist
>* compressed_blob_bls
,
9712 for (auto& p
: blobs2read
) {
9713 const BlobRef
& bptr
= p
.first
;
9714 regions2read_t
& r2r
= p
.second
;
9715 dout(20) << __func__
<< " blob " << *bptr
<< std::hex
9716 << " need " << r2r
<< std::dec
<< dendl
;
9717 if (bptr
->get_blob().is_compressed()) {
9718 // read the whole thing
9719 if (compressed_blob_bls
->empty()) {
9720 // ensure we avoid any reallocation on subsequent blobs
9721 compressed_blob_bls
->reserve(blobs2read
.size());
9723 compressed_blob_bls
->push_back(bufferlist());
9724 bufferlist
& bl
= compressed_blob_bls
->back();
9725 auto r
= bptr
->get_blob().map(
9726 0, bptr
->get_blob().get_ondisk_length(),
9727 [&](uint64_t offset
, uint64_t length
) {
9728 int r
= bdev
->aio_read(offset
, length
, &bl
, ioc
);
9734 derr
<< __func__
<< " bdev-read failed: " << cpp_strerror(r
) << dendl
;
9736 // propagate EIO to caller
9739 ceph_assert(r
== 0);
9743 for (auto& req
: r2r
) {
9744 dout(20) << __func__
<< " region 0x" << std::hex
9745 << req
.regs
.front().logical_offset
9746 << ": 0x" << req
.regs
.front().blob_xoffset
9747 << " reading 0x" << req
.r_off
9748 << "~" << req
.r_len
<< std::dec
9752 auto r
= bptr
->get_blob().map(
9753 req
.r_off
, req
.r_len
,
9754 [&](uint64_t offset
, uint64_t length
) {
9755 int r
= bdev
->aio_read(offset
, length
, &req
.bl
, ioc
);
9761 derr
<< __func__
<< " bdev-read failed: " << cpp_strerror(r
)
9764 // propagate EIO to caller
9767 ceph_assert(r
== 0);
9769 ceph_assert(req
.bl
.length() == req
.r_len
);
9776 int BlueStore::_generate_read_result_bl(
9780 ready_regions_t
& ready_regions
,
9781 vector
<bufferlist
>& compressed_blob_bls
,
9782 blobs2read_t
& blobs2read
,
9787 // enumerate and decompress desired blobs
9788 auto p
= compressed_blob_bls
.begin();
9789 blobs2read_t::iterator b2r_it
= blobs2read
.begin();
9790 while (b2r_it
!= blobs2read
.end()) {
9791 const BlobRef
& bptr
= b2r_it
->first
;
9792 regions2read_t
& r2r
= b2r_it
->second
;
9793 dout(20) << __func__
<< " blob " << *bptr
<< std::hex
9794 << " need 0x" << r2r
<< std::dec
<< dendl
;
9795 if (bptr
->get_blob().is_compressed()) {
9796 ceph_assert(p
!= compressed_blob_bls
.end());
9797 bufferlist
& compressed_bl
= *p
++;
9798 if (_verify_csum(o
, &bptr
->get_blob(), 0, compressed_bl
,
9799 r2r
.front().regs
.front().logical_offset
) < 0) {
9804 auto r
= _decompress(compressed_bl
, &raw_bl
);
9808 bptr
->shared_blob
->bc
.did_read(bptr
->shared_blob
->get_cache(), 0,
9811 for (auto& req
: r2r
) {
9812 for (auto& r
: req
.regs
) {
9813 ready_regions
[r
.logical_offset
].substr_of(
9814 raw_bl
, r
.blob_xoffset
, r
.length
);
9818 for (auto& req
: r2r
) {
9819 if (_verify_csum(o
, &bptr
->get_blob(), req
.r_off
, req
.bl
,
9820 req
.regs
.front().logical_offset
) < 0) {
9825 bptr
->shared_blob
->bc
.did_read(bptr
->shared_blob
->get_cache(),
9829 // prune and keep result
9830 for (const auto& r
: req
.regs
) {
9831 ready_regions
[r
.logical_offset
].substr_of(req
.bl
, r
.front
, r
.length
);
9838 // generate a resulting buffer
9839 auto pr
= ready_regions
.begin();
9840 auto pr_end
= ready_regions
.end();
9842 while (pos
< length
) {
9843 if (pr
!= pr_end
&& pr
->first
== pos
+ offset
) {
9844 dout(30) << __func__
<< " assemble 0x" << std::hex
<< pos
9845 << ": data from 0x" << pr
->first
<< "~" << pr
->second
.length()
9846 << std::dec
<< dendl
;
9847 pos
+= pr
->second
.length();
9848 bl
.claim_append(pr
->second
);
9851 uint64_t l
= length
- pos
;
9853 ceph_assert(pr
->first
> pos
+ offset
);
9854 l
= pr
->first
- (pos
+ offset
);
9856 dout(30) << __func__
<< " assemble 0x" << std::hex
<< pos
9857 << ": zeros for 0x" << (pos
+ offset
) << "~" << l
9858 << std::dec
<< dendl
;
9863 ceph_assert(bl
.length() == length
);
9864 ceph_assert(pos
== length
);
9865 ceph_assert(pr
== pr_end
);
9869 int BlueStore::_do_read(
9876 uint64_t retry_count
)
9880 int read_cache_policy
= 0; // do not bypass clean or dirty cache
9882 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
9883 << " size 0x" << o
->onode
.size
<< " (" << std::dec
9884 << o
->onode
.size
<< ")" << dendl
;
9887 if (offset
>= o
->onode
.size
) {
9891 // generally, don't buffer anything, unless the client explicitly requests
9893 bool buffered
= false;
9894 if (op_flags
& CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) {
9895 dout(20) << __func__
<< " will do buffered read" << dendl
;
9897 } else if (cct
->_conf
->bluestore_default_buffered_read
&&
9898 (op_flags
& (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
|
9899 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
)) == 0) {
9900 dout(20) << __func__
<< " defaulting to buffered read" << dendl
;
9904 if (offset
+ length
> o
->onode
.size
) {
9905 length
= o
->onode
.size
- offset
;
9908 auto start
= mono_clock::now();
9909 o
->extent_map
.fault_range(db
, offset
, length
);
9910 log_latency(__func__
,
9911 l_bluestore_read_onode_meta_lat
,
9912 mono_clock::now() - start
,
9913 cct
->_conf
->bluestore_log_op_age
);
9914 _dump_onode
<30>(cct
, *o
);
9916 // for deep-scrub, we only read dirty cache and bypass clean cache in
9917 // order to read underlying block device in case there are silent disk errors.
9918 if (op_flags
& CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE
) {
9919 dout(20) << __func__
<< " will bypass cache and do direct read" << dendl
;
9920 read_cache_policy
= BufferSpace::BYPASS_CLEAN_CACHE
;
9923 // build blob-wise list to of stuff read (that isn't cached)
9924 ready_regions_t ready_regions
;
9925 blobs2read_t blobs2read
;
9926 _read_cache(o
, offset
, length
, read_cache_policy
, ready_regions
, blobs2read
);
9929 // read raw blob data.
9930 start
= mono_clock::now(); // for the sake of simplicity
9931 // measure the whole block below.
9932 // The error isn't that much...
9933 vector
<bufferlist
> compressed_blob_bls
;
9934 IOContext
ioc(cct
, NULL
, true); // allow EIO
9935 r
= _prepare_read_ioc(blobs2read
, &compressed_blob_bls
, &ioc
);
9936 // we always issue aio for reading, so errors other than EIO are not allowed
9940 int64_t num_ios
= blobs2read
.size();
9941 if (ioc
.has_pending_aios()) {
9942 num_ios
= ioc
.get_num_ios();
9943 bdev
->aio_submit(&ioc
);
9944 dout(20) << __func__
<< " waiting for aio" << dendl
;
9946 r
= ioc
.get_return_value();
9948 ceph_assert(r
== -EIO
); // no other errors allowed
9952 log_latency_fn(__func__
,
9953 l_bluestore_read_wait_aio_lat
,
9954 mono_clock::now() - start
,
9955 cct
->_conf
->bluestore_log_op_age
,
9956 [&](auto lat
) { return ", num_ios = " + stringify(num_ios
); }
9959 bool csum_error
= false;
9960 r
= _generate_read_result_bl(o
, offset
, length
, ready_regions
,
9961 compressed_blob_bls
, blobs2read
,
9962 buffered
, &csum_error
, bl
);
9964 // Handles spurious read errors caused by a kernel bug.
9965 // We sometimes get all-zero pages as a result of the read under
9966 // high memory pressure. Retrying the failing read succeeds in most
9968 // See also: http://tracker.ceph.com/issues/22464
9969 if (retry_count
>= cct
->_conf
->bluestore_retry_disk_reads
) {
9972 return _do_read(c
, o
, offset
, length
, bl
, op_flags
, retry_count
+ 1);
9976 logger
->inc(l_bluestore_reads_with_retries
);
9977 dout(5) << __func__
<< " read at 0x" << std::hex
<< offset
<< "~" << length
9978 << " failed " << std::dec
<< retry_count
<< " times before succeeding" << dendl
;
9980 s
<< " reads with retries: " << logger
->get(l_bluestore_reads_with_retries
);
9981 _set_spurious_read_errors_alert(s
.str());
// Verify the checksum of `bl` against the blob's stored csum data at
// `blob_xoffset`; on mismatch, log the bad chunk with its device and
// logical locations. Includes a probabilistic debug error-injection hook
// and honors bluestore_ignore_data_csum.
// NOTE(review): many lines are elided in this extraction (bad/bad_csum
// declarations, the blob->map_bl call producing `pex`, branch structure,
// returns) — treat this span as incomplete.
9986 int BlueStore::_verify_csum(OnodeRef
& o
,
9987 const bluestore_blob_t
* blob
, uint64_t blob_xoffset
,
9988 const bufferlist
& bl
,
9989 uint64_t logical_offset
) const
9993 auto start
= mono_clock::now();
9994 int r
= blob
->verify_csum(blob_xoffset
, bl
, &bad
, &bad_csum
);
// Debug hook: randomly fake a checksum failure.
9995 if (cct
->_conf
->bluestore_debug_inject_csum_err_probability
> 0 &&
9996 (rand() % 10000) < cct
->_conf
->bluestore_debug_inject_csum_err_probability
* 10000.0) {
9997 derr
<< __func__
<< " injecting bluestore checksum verifcation error" << dendl
;
10000 bad_csum
= 0xDEADBEEF;
// Map the bad chunk to physical extents for the error report.
10007 blob
->get_csum_chunk_size(),
10008 [&](uint64_t offset
, uint64_t length
) {
10009 pex
.emplace_back(bluestore_pextent_t(offset
, length
));
10012 derr
<< __func__
<< " bad "
10013 << Checksummer::get_csum_type_string(blob
->csum_type
)
10014 << "/0x" << std::hex
<< blob
->get_csum_chunk_size()
10015 << " checksum at blob offset 0x" << bad
10016 << ", got 0x" << bad_csum
<< ", expected 0x"
10017 << blob
->get_csum_item(bad
/ blob
->get_csum_chunk_size()) << std::dec
10018 << ", device location " << pex
10019 << ", logical extent 0x" << std::hex
10020 << (logical_offset
+ bad
- blob_xoffset
) << "~"
10021 << blob
->get_csum_chunk_size() << std::dec
10022 << ", object " << o
->oid
10025 derr
<< __func__
<< " failed with exit code: " << cpp_strerror(r
) << dendl
;
10028 log_latency(__func__
,
10029 l_bluestore_csum_lat
,
10030 mono_clock::now() - start
,
10031 cct
->_conf
->bluestore_log_op_age
);
10032 if (cct
->_conf
->bluestore_ignore_data_csum
) {
// Decompress `source` (compression header followed by payload) into
// *result. Uses the currently configured compressor when its type
// matches the header; otherwise loads the named compressor plugin,
// raising a compression alert (and, presumably, an error return) when
// the plugin cannot be loaded — confirm the elided error paths upstream.
// NOTE(review): the decode of `chdr` from the iterator and the
// declaration of `r` are elided in this extraction.
10038 int BlueStore::_decompress(bufferlist
& source
, bufferlist
* result
)
10041 auto start
= mono_clock::now();
10042 auto i
= source
.cbegin();
10043 bluestore_compression_header_t chdr
;
10045 int alg
= int(chdr
.type
);
10046 CompressorRef cp
= compressor
;
// Fall back to loading the specific plugin if the active compressor
// does not match the algorithm recorded in the header.
10047 if (!cp
|| (int)cp
->get_type() != alg
) {
10048 cp
= Compressor::create(cct
, alg
);
10052 // if compressor isn't available - error, because cannot return
10053 // decompressed data?
10055 const char* alg_name
= Compressor::get_comp_alg_name(alg
);
10056 derr
<< __func__
<< " can't load decompressor " << alg_name
<< dendl
;
10057 _set_compression_alert(false, alg_name
);
10060 r
= cp
->decompress(i
, chdr
.length
, *result
, chdr
.compressor_message
);
10062 derr
<< __func__
<< " decompression failed with exit code " << r
<< dendl
;
10066 log_latency(__func__
,
10067 l_bluestore_decompress_lat
,
10068 mono_clock::now() - start
,
10069 cct
->_conf
->bluestore_log_op_age
);
10073 // this stores fiemap into interval_set, other variations
10074 // use it internally
10075 int BlueStore::_fiemap(
10076 CollectionHandle
&c_
,
10077 const ghobject_t
& oid
,
10080 interval_set
<uint64_t>& destset
)
10082 Collection
*c
= static_cast<Collection
*>(c_
.get());
10086 std::shared_lock
l(c
->lock
);
10088 OnodeRef o
= c
->get_onode(oid
, false);
10089 if (!o
|| !o
->exists
) {
10092 _dump_onode
<30>(cct
, *o
);
10094 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
10095 << " size 0x" << o
->onode
.size
<< std::dec
<< dendl
;
10097 boost::intrusive::set
<Extent
>::iterator ep
, eend
;
10098 if (offset
>= o
->onode
.size
)
10101 if (offset
+ length
> o
->onode
.size
) {
10102 length
= o
->onode
.size
- offset
;
10105 o
->extent_map
.fault_range(db
, offset
, length
);
10106 eend
= o
->extent_map
.extent_map
.end();
10107 ep
= o
->extent_map
.seek_lextent(offset
);
10108 while (length
> 0) {
10109 dout(20) << __func__
<< " offset " << offset
<< dendl
;
10110 if (ep
!= eend
&& ep
->logical_offset
+ ep
->length
<= offset
) {
10115 uint64_t x_len
= length
;
10116 if (ep
!= eend
&& ep
->logical_offset
<= offset
) {
10117 uint64_t x_off
= offset
- ep
->logical_offset
;
10118 x_len
= std::min(x_len
, ep
->length
- x_off
);
10119 dout(30) << __func__
<< " lextent 0x" << std::hex
<< offset
<< "~"
10120 << x_len
<< std::dec
<< " blob " << ep
->blob
<< dendl
;
10121 destset
.insert(offset
, x_len
);
10124 if (x_off
+ x_len
== ep
->length
)
10129 ep
->logical_offset
> offset
&&
10130 ep
->logical_offset
- offset
< x_len
) {
10131 x_len
= ep
->logical_offset
- offset
;
10139 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
10140 << " size = 0x(" << destset
<< ")" << std::dec
<< dendl
;
// fiemap: report which logical ranges of the object are backed by data.
// This overload computes the interval_set via _fiemap; the elided tail
// presumably encodes it into the caller's bufferlist — confirm upstream.
// NOTE(review): the offset/length/bl parameter declarations and the
// return are elided in this extraction.
10144 int BlueStore::fiemap(
10145 CollectionHandle
&c_
,
10146 const ghobject_t
& oid
,
10151 interval_set
<uint64_t> m
;
10152 int r
= _fiemap(c_
, oid
, offset
, length
, m
);
// fiemap overload returning a map<offset,length>: compute the interval
// set via _fiemap and move its contents into destmap.
// NOTE(review): the offset/length parameter declarations, the success
// guard around the move and the return are elided in this extraction.
10159 int BlueStore::fiemap(
10160 CollectionHandle
&c_
,
10161 const ghobject_t
& oid
,
10164 map
<uint64_t, uint64_t>& destmap
)
10166 interval_set
<uint64_t> m
;
10167 int r
= _fiemap(c_
, oid
, offset
, length
, m
);
// detach() on the rvalue interval_set yields its underlying map.
10169 destmap
= std::move(m
).detach();
10174 int BlueStore::readv(
10175 CollectionHandle
&c_
,
10176 const ghobject_t
& oid
,
10177 interval_set
<uint64_t>& m
,
10181 auto start
= mono_clock::now();
10182 Collection
*c
= static_cast<Collection
*>(c_
.get());
10183 const coll_t
&cid
= c
->get_cid();
10184 dout(15) << __func__
<< " " << cid
<< " " << oid
10193 std::shared_lock
l(c
->lock
);
10194 auto start1
= mono_clock::now();
10195 OnodeRef o
= c
->get_onode(oid
, false);
10196 log_latency("get_onode@read",
10197 l_bluestore_read_onode_meta_lat
,
10198 mono_clock::now() - start1
,
10199 cct
->_conf
->bluestore_log_op_age
);
10200 if (!o
|| !o
->exists
) {
10210 r
= _do_readv(c
, o
, m
, bl
, op_flags
);
10212 logger
->inc(l_bluestore_read_eio
);
10217 if (r
>= 0 && _debug_data_eio(oid
)) {
10219 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
10220 } else if (oid
.hobj
.pool
> 0 && /* FIXME, see #23029 */
10221 cct
->_conf
->bluestore_debug_random_read_err
&&
10222 (rand() % (int)(cct
->_conf
->bluestore_debug_random_read_err
*
10224 dout(0) << __func__
<< ": inject random EIO" << dendl
;
10227 dout(10) << __func__
<< " " << cid
<< " " << oid
10228 << " fiemap " << m
<< std::dec
10229 << " = " << r
<< dendl
;
10230 log_latency(__func__
,
10231 l_bluestore_read_lat
,
10232 mono_clock::now() - start
,
10233 cct
->_conf
->bluestore_log_op_age
);
10237 int BlueStore::_do_readv(
10240 const interval_set
<uint64_t>& m
,
10243 uint64_t retry_count
)
10247 int read_cache_policy
= 0; // do not bypass clean or dirty cache
10249 dout(20) << __func__
<< " fiemap " << m
<< std::hex
10250 << " size 0x" << o
->onode
.size
<< " (" << std::dec
10251 << o
->onode
.size
<< ")" << dendl
;
10253 // generally, don't buffer anything, unless the client explicitly requests
10255 bool buffered
= false;
10256 if (op_flags
& CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) {
10257 dout(20) << __func__
<< " will do buffered read" << dendl
;
10259 } else if (cct
->_conf
->bluestore_default_buffered_read
&&
10260 (op_flags
& (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
|
10261 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
)) == 0) {
10262 dout(20) << __func__
<< " defaulting to buffered read" << dendl
;
10265 // this method must be idempotent since we may call it several times
10266 // before we finally read the expected result.
10269 // call fiemap first!
10270 ceph_assert(m
.range_start() <= o
->onode
.size
);
10271 ceph_assert(m
.range_end() <= o
->onode
.size
);
10272 auto start
= mono_clock::now();
10273 o
->extent_map
.fault_range(db
, m
.range_start(), m
.range_end() - m
.range_start());
10274 log_latency(__func__
,
10275 l_bluestore_read_onode_meta_lat
,
10276 mono_clock::now() - start
,
10277 cct
->_conf
->bluestore_log_op_age
);
10278 _dump_onode
<30>(cct
, *o
);
10280 IOContext
ioc(cct
, NULL
, true); // allow EIO
10281 vector
<std::tuple
<ready_regions_t
, vector
<bufferlist
>, blobs2read_t
>> raw_results
;
10282 raw_results
.reserve(m
.num_intervals());
10284 for (auto p
= m
.begin(); p
!= m
.end(); p
++, i
++) {
10285 raw_results
.push_back({});
10286 _read_cache(o
, p
.get_start(), p
.get_len(), read_cache_policy
,
10287 std::get
<0>(raw_results
[i
]), std::get
<2>(raw_results
[i
]));
10288 r
= _prepare_read_ioc(std::get
<2>(raw_results
[i
]), &std::get
<1>(raw_results
[i
]), &ioc
);
10289 // we always issue aio for reading, so errors other than EIO are not allowed
10294 auto num_ios
= m
.size();
10295 if (ioc
.has_pending_aios()) {
10296 num_ios
= ioc
.get_num_ios();
10297 bdev
->aio_submit(&ioc
);
10298 dout(20) << __func__
<< " waiting for aio" << dendl
;
10300 r
= ioc
.get_return_value();
10302 ceph_assert(r
== -EIO
); // no other errors allowed
10306 log_latency_fn(__func__
,
10307 l_bluestore_read_wait_aio_lat
,
10308 mono_clock::now() - start
,
10309 cct
->_conf
->bluestore_log_op_age
,
10310 [&](auto lat
) { return ", num_ios = " + stringify(num_ios
); }
10313 ceph_assert(raw_results
.size() == (size_t)m
.num_intervals());
10315 for (auto p
= m
.begin(); p
!= m
.end(); p
++, i
++) {
10316 bool csum_error
= false;
10318 r
= _generate_read_result_bl(o
, p
.get_start(), p
.get_len(),
10319 std::get
<0>(raw_results
[i
]),
10320 std::get
<1>(raw_results
[i
]),
10321 std::get
<2>(raw_results
[i
]),
10322 buffered
, &csum_error
, t
);
10324 // Handles spurious read errors caused by a kernel bug.
10325 // We sometimes get all-zero pages as a result of the read under
10326 // high memory pressure. Retrying the failing read succeeds in most
10328 // See also: http://tracker.ceph.com/issues/22464
10329 if (retry_count
>= cct
->_conf
->bluestore_retry_disk_reads
) {
10332 return _do_readv(c
, o
, m
, bl
, op_flags
, retry_count
+ 1);
10334 bl
.claim_append(t
);
10337 logger
->inc(l_bluestore_reads_with_retries
);
10338 dout(5) << __func__
<< " read fiemap " << m
10339 << " failed " << retry_count
<< " times before succeeding"
10342 return bl
.length();
10345 int BlueStore::dump_onode(CollectionHandle
&c_
,
10346 const ghobject_t
& oid
,
10347 const string
& section_name
,
10350 Collection
*c
= static_cast<Collection
*>(c_
.get());
10351 dout(15) << __func__
<< " " << c
->cid
<< " " << oid
<< dendl
;
10357 std::shared_lock
l(c
->lock
);
10359 OnodeRef o
= c
->get_onode(oid
, false);
10360 if (!o
|| !o
->exists
) {
10364 // FIXME minor: actually the next line isn't enough to
10365 // load shared blobs. Leaving as is for now..
10367 o
->extent_map
.fault_range(db
, 0, OBJECT_MAX_SIZE
);
10369 _dump_onode
<0>(cct
, *o
);
10370 f
->open_object_section(section_name
.c_str());
10372 f
->close_section();
10376 dout(10) << __func__
<< " " << c
->cid
<< " " << oid
10377 << " = " << r
<< dendl
;
10381 int BlueStore::getattr(
10382 CollectionHandle
&c_
,
10383 const ghobject_t
& oid
,
10387 Collection
*c
= static_cast<Collection
*>(c_
.get());
10388 dout(15) << __func__
<< " " << c
->cid
<< " " << oid
<< " " << name
<< dendl
;
10394 std::shared_lock
l(c
->lock
);
10395 mempool::bluestore_cache_meta::string
k(name
);
10397 OnodeRef o
= c
->get_onode(oid
, false);
10398 if (!o
|| !o
->exists
) {
10403 if (!o
->onode
.attrs
.count(k
)) {
10407 value
= o
->onode
.attrs
[k
];
10411 if (r
== 0 && _debug_mdata_eio(oid
)) {
10413 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
10415 dout(10) << __func__
<< " " << c
->cid
<< " " << oid
<< " " << name
10416 << " = " << r
<< dendl
;
10420 int BlueStore::getattrs(
10421 CollectionHandle
&c_
,
10422 const ghobject_t
& oid
,
10423 map
<string
,bufferptr
>& aset
)
10425 Collection
*c
= static_cast<Collection
*>(c_
.get());
10426 dout(15) << __func__
<< " " << c
->cid
<< " " << oid
<< dendl
;
10432 std::shared_lock
l(c
->lock
);
10434 OnodeRef o
= c
->get_onode(oid
, false);
10435 if (!o
|| !o
->exists
) {
10439 for (auto& i
: o
->onode
.attrs
) {
10440 aset
.emplace(i
.first
.c_str(), i
.second
);
10446 if (r
== 0 && _debug_mdata_eio(oid
)) {
10448 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
10450 dout(10) << __func__
<< " " << c
->cid
<< " " << oid
10451 << " = " << r
<< dendl
;
10455 int BlueStore::list_collections(vector
<coll_t
>& ls
)
10457 std::shared_lock
l(coll_lock
);
10458 ls
.reserve(coll_map
.size());
10459 for (ceph::unordered_map
<coll_t
, CollectionRef
>::iterator p
= coll_map
.begin();
10460 p
!= coll_map
.end();
10462 ls
.push_back(p
->first
);
10466 bool BlueStore::collection_exists(const coll_t
& c
)
10468 std::shared_lock
l(coll_lock
);
10469 return coll_map
.count(c
);
10472 int BlueStore::collection_empty(CollectionHandle
& ch
, bool *empty
)
10474 dout(15) << __func__
<< " " << ch
->cid
<< dendl
;
10475 vector
<ghobject_t
> ls
;
10477 int r
= collection_list(ch
, ghobject_t(), ghobject_t::get_max(), 1,
10480 derr
<< __func__
<< " collection_list returned: " << cpp_strerror(r
)
10484 *empty
= ls
.empty();
10485 dout(10) << __func__
<< " " << ch
->cid
<< " = " << (int)(*empty
) << dendl
;
10489 int BlueStore::collection_bits(CollectionHandle
& ch
)
10491 dout(15) << __func__
<< " " << ch
->cid
<< dendl
;
10492 Collection
*c
= static_cast<Collection
*>(ch
.get());
10493 std::shared_lock
l(c
->lock
);
10494 dout(10) << __func__
<< " " << ch
->cid
<< " = " << c
->cnode
.bits
<< dendl
;
10495 return c
->cnode
.bits
;
10498 int BlueStore::collection_list(
10499 CollectionHandle
&c_
, const ghobject_t
& start
, const ghobject_t
& end
, int max
,
10500 vector
<ghobject_t
> *ls
, ghobject_t
*pnext
)
10502 Collection
*c
= static_cast<Collection
*>(c_
.get());
10504 dout(15) << __func__
<< " " << c
->cid
10505 << " start " << start
<< " end " << end
<< " max " << max
<< dendl
;
10508 std::shared_lock
l(c
->lock
);
10509 r
= _collection_list(c
, start
, end
, max
, false, ls
, pnext
);
10512 dout(10) << __func__
<< " " << c
->cid
10513 << " start " << start
<< " end " << end
<< " max " << max
10514 << " = " << r
<< ", ls.size() = " << ls
->size()
10515 << ", next = " << (pnext
? *pnext
: ghobject_t()) << dendl
;
10519 int BlueStore::collection_list_legacy(
10520 CollectionHandle
&c_
, const ghobject_t
& start
, const ghobject_t
& end
, int max
,
10521 vector
<ghobject_t
> *ls
, ghobject_t
*pnext
)
10523 Collection
*c
= static_cast<Collection
*>(c_
.get());
10525 dout(15) << __func__
<< " " << c
->cid
10526 << " start " << start
<< " end " << end
<< " max " << max
<< dendl
;
10529 std::shared_lock
l(c
->lock
);
10530 r
= _collection_list(c
, start
, end
, max
, true, ls
, pnext
);
10533 dout(10) << __func__
<< " " << c
->cid
10534 << " start " << start
<< " end " << end
<< " max " << max
10535 << " = " << r
<< ", ls.size() = " << ls
->size()
10536 << ", next = " << (pnext
? *pnext
: ghobject_t()) << dendl
;
10540 int BlueStore::_collection_list(
10541 Collection
*c
, const ghobject_t
& start
, const ghobject_t
& end
, int max
,
10542 bool legacy
, vector
<ghobject_t
> *ls
, ghobject_t
*pnext
)
10548 auto start_time
= mono_clock::now();
10550 ghobject_t static_next
;
10551 std::unique_ptr
<CollectionListIterator
> it
;
10552 ghobject_t coll_range_temp_start
, coll_range_temp_end
;
10553 ghobject_t coll_range_start
, coll_range_end
;
10554 bool set_next
= false;
10559 pnext
= &static_next
;
10561 if (start
.is_max() || start
.hobj
.is_max()) {
10564 get_coll_range(c
->cid
, c
->cnode
.bits
, &coll_range_temp_start
,
10565 &coll_range_temp_end
, &coll_range_start
, &coll_range_end
);
10566 dout(20) << __func__
10567 << " range " << coll_range_temp_start
10568 << " to " << coll_range_temp_end
10569 << " and " << coll_range_start
10570 << " to " << coll_range_end
10571 << " start " << start
<< dendl
;
10573 it
= std::make_unique
<SimpleCollectionListIterator
>(
10574 cct
, db
->get_iterator(PREFIX_OBJ
));
10576 it
= std::make_unique
<SortedCollectionListIterator
>(
10577 db
->get_iterator(PREFIX_OBJ
));
10579 if (start
== ghobject_t() ||
10580 start
.hobj
== hobject_t() ||
10581 start
== c
->cid
.get_min_hobj()) {
10582 it
->upper_bound(coll_range_temp_start
);
10585 if (start
.hobj
.is_temp()) {
10587 ceph_assert(start
>= coll_range_temp_start
&& start
< coll_range_temp_end
);
10590 ceph_assert(start
>= coll_range_start
&& start
< coll_range_end
);
10592 dout(20) << __func__
<< " temp=" << (int)temp
<< dendl
;
10593 it
->lower_bound(start
);
10595 if (end
.hobj
.is_max()) {
10596 pend
= temp
? coll_range_temp_end
: coll_range_end
;
10598 if (end
.hobj
.is_temp()) {
10604 pend
= temp
? coll_range_temp_end
: end
;
10607 dout(20) << __func__
<< " pend " << pend
<< dendl
;
10609 if (!it
->valid() || it
->is_ge(pend
)) {
10611 dout(20) << __func__
<< " iterator not valid (end of db?)" << dendl
;
10613 dout(20) << __func__
<< " oid " << it
->oid() << " >= " << pend
<< dendl
;
10615 if (end
.hobj
.is_temp()) {
10616 if (it
->valid() && it
->is_lt(coll_range_temp_end
)) {
10617 *pnext
= it
->oid();
10622 dout(30) << __func__
<< " switch to non-temp namespace" << dendl
;
10624 it
->upper_bound(coll_range_start
);
10625 if (end
.hobj
.is_max())
10626 pend
= coll_range_end
;
10629 dout(30) << __func__
<< " pend " << pend
<< dendl
;
10632 if (it
->valid() && it
->is_lt(coll_range_end
)) {
10633 *pnext
= it
->oid();
10638 dout(20) << __func__
<< " oid " << it
->oid() << " end " << end
<< dendl
;
10639 if (ls
->size() >= (unsigned)max
) {
10640 dout(20) << __func__
<< " reached max " << max
<< dendl
;
10641 *pnext
= it
->oid();
10645 ls
->push_back(it
->oid());
10650 *pnext
= ghobject_t::get_max();
10654 l_bluestore_clist_lat
,
10655 mono_clock::now() - start_time
,
10656 cct
->_conf
->bluestore_log_collection_list_age
,
10657 [&] (const ceph::timespan
& lat
) {
10658 ostringstream ostr
;
10659 ostr
<< ", lat = " << timespan_str(lat
)
10660 << " cid =" << c
->cid
10661 << " start " << start
<< " end " << end
10669 int BlueStore::omap_get(
10670 CollectionHandle
&c_
, ///< [in] Collection containing oid
10671 const ghobject_t
&oid
, ///< [in] Object containing omap
10672 bufferlist
*header
, ///< [out] omap header
10673 map
<string
, bufferlist
> *out
/// < [out] Key to value map
10676 Collection
*c
= static_cast<Collection
*>(c_
.get());
10677 return _omap_get(c
, oid
, header
, out
);
10680 int BlueStore::_omap_get(
10681 Collection
*c
, ///< [in] Collection containing oid
10682 const ghobject_t
&oid
, ///< [in] Object containing omap
10683 bufferlist
*header
, ///< [out] omap header
10684 map
<string
, bufferlist
> *out
/// < [out] Key to value map
10687 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
10690 std::shared_lock
l(c
->lock
);
10692 OnodeRef o
= c
->get_onode(oid
, false);
10693 if (!o
|| !o
->exists
) {
10697 r
= _onode_omap_get(o
, header
, out
);
10699 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
10704 int BlueStore::_onode_omap_get(
10705 const OnodeRef
&o
, ///< [in] Object containing omap
10706 bufferlist
*header
, ///< [out] omap header
10707 map
<string
, bufferlist
> *out
/// < [out] Key to value map
10711 if (!o
|| !o
->exists
) {
10715 if (!o
->onode
.has_omap())
10719 const string
& prefix
= o
->get_omap_prefix();
10720 KeyValueDB::Iterator it
= db
->get_iterator(prefix
);
10722 o
->get_omap_header(&head
);
10723 o
->get_omap_tail(&tail
);
10724 it
->lower_bound(head
);
10725 while (it
->valid()) {
10726 if (it
->key() == head
) {
10727 dout(30) << __func__
<< " got header" << dendl
;
10728 *header
= it
->value();
10729 } else if (it
->key() >= tail
) {
10730 dout(30) << __func__
<< " reached tail" << dendl
;
10734 o
->decode_omap_key(it
->key(), &user_key
);
10735 dout(20) << __func__
<< " got " << pretty_binary_string(it
->key())
10736 << " -> " << user_key
<< dendl
;
10737 (*out
)[user_key
] = it
->value();
10746 int BlueStore::omap_get_header(
10747 CollectionHandle
&c_
, ///< [in] Collection containing oid
10748 const ghobject_t
&oid
, ///< [in] Object containing omap
10749 bufferlist
*header
, ///< [out] omap header
10750 bool allow_eio
///< [in] don't assert on eio
10753 Collection
*c
= static_cast<Collection
*>(c_
.get());
10754 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
10757 std::shared_lock
l(c
->lock
);
10759 OnodeRef o
= c
->get_onode(oid
, false);
10760 if (!o
|| !o
->exists
) {
10764 if (!o
->onode
.has_omap())
10769 o
->get_omap_header(&head
);
10770 if (db
->get(o
->get_omap_prefix(), head
, header
) >= 0) {
10771 dout(30) << __func__
<< " got header" << dendl
;
10773 dout(30) << __func__
<< " no header" << dendl
;
10777 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
10782 int BlueStore::omap_get_keys(
10783 CollectionHandle
&c_
, ///< [in] Collection containing oid
10784 const ghobject_t
&oid
, ///< [in] Object containing omap
10785 set
<string
> *keys
///< [out] Keys defined on oid
10788 Collection
*c
= static_cast<Collection
*>(c_
.get());
10789 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
10792 auto start1
= mono_clock::now();
10793 std::shared_lock
l(c
->lock
);
10795 OnodeRef o
= c
->get_onode(oid
, false);
10796 if (!o
|| !o
->exists
) {
10800 if (!o
->onode
.has_omap())
10804 const string
& prefix
= o
->get_omap_prefix();
10805 KeyValueDB::Iterator it
= db
->get_iterator(prefix
);
10807 o
->get_omap_key(string(), &head
);
10808 o
->get_omap_tail(&tail
);
10809 it
->lower_bound(head
);
10810 while (it
->valid()) {
10811 if (it
->key() >= tail
) {
10812 dout(30) << __func__
<< " reached tail" << dendl
;
10816 o
->decode_omap_key(it
->key(), &user_key
);
10817 dout(20) << __func__
<< " got " << pretty_binary_string(it
->key())
10818 << " -> " << user_key
<< dendl
;
10819 keys
->insert(user_key
);
10824 c
->store
->log_latency(
10826 l_bluestore_omap_get_keys_lat
,
10827 mono_clock::now() - start1
,
10828 c
->store
->cct
->_conf
->bluestore_log_omap_iterator_age
);
10830 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
10835 int BlueStore::omap_get_values(
10836 CollectionHandle
&c_
, ///< [in] Collection containing oid
10837 const ghobject_t
&oid
, ///< [in] Object containing omap
10838 const set
<string
> &keys
, ///< [in] Keys to get
10839 map
<string
, bufferlist
> *out
///< [out] Returned keys and values
10842 Collection
*c
= static_cast<Collection
*>(c_
.get());
10843 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
10846 std::shared_lock
l(c
->lock
);
10847 auto start1
= mono_clock::now();
10850 OnodeRef o
= c
->get_onode(oid
, false);
10851 if (!o
|| !o
->exists
) {
10855 if (!o
->onode
.has_omap()) {
10860 const string
& prefix
= o
->get_omap_prefix();
10861 o
->get_omap_key(string(), &final_key
);
10862 size_t base_key_len
= final_key
.size();
10863 for (set
<string
>::const_iterator p
= keys
.begin(); p
!= keys
.end(); ++p
) {
10864 final_key
.resize(base_key_len
); // keep prefix
10867 if (db
->get(prefix
, final_key
, &val
) >= 0) {
10868 dout(30) << __func__
<< " got " << pretty_binary_string(final_key
)
10869 << " -> " << *p
<< dendl
;
10870 out
->insert(make_pair(*p
, val
));
10875 c
->store
->log_latency(
10877 l_bluestore_omap_get_values_lat
,
10878 mono_clock::now() - start1
,
10879 c
->store
->cct
->_conf
->bluestore_log_omap_iterator_age
);
10881 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
10886 #ifdef WITH_SEASTAR
10887 int BlueStore::omap_get_values(
10888 CollectionHandle
&c_
, ///< [in] Collection containing oid
10889 const ghobject_t
&oid
, ///< [in] Object containing omap
10890 const std::optional
<string
> &start_after
, ///< [in] Keys to get
10891 map
<string
, bufferlist
> *output
///< [out] Returned keys and values
10894 Collection
*c
= static_cast<Collection
*>(c_
.get());
10895 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
10898 std::shared_lock
l(c
->lock
);
10900 OnodeRef o
= c
->get_onode(oid
, false);
10901 if (!o
|| !o
->exists
) {
10905 if (!o
->onode
.has_omap()) {
10910 ObjectMap::ObjectMapIterator iter
= get_omap_iterator(c_
, oid
);
10915 iter
->upper_bound(*start_after
);
10916 for (; iter
->valid(); iter
->next()) {
10917 output
->insert(make_pair(iter
->key(), iter
->value()));
10922 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
10928 int BlueStore::omap_check_keys(
10929 CollectionHandle
&c_
, ///< [in] Collection containing oid
10930 const ghobject_t
&oid
, ///< [in] Object containing omap
10931 const set
<string
> &keys
, ///< [in] Keys to check
10932 set
<string
> *out
///< [out] Subset of keys defined on oid
10935 Collection
*c
= static_cast<Collection
*>(c_
.get());
10936 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
10939 std::shared_lock
l(c
->lock
);
10942 OnodeRef o
= c
->get_onode(oid
, false);
10943 if (!o
|| !o
->exists
) {
10947 if (!o
->onode
.has_omap()) {
10952 const string
& prefix
= o
->get_omap_prefix();
10953 o
->get_omap_key(string(), &final_key
);
10954 size_t base_key_len
= final_key
.size();
10955 for (set
<string
>::const_iterator p
= keys
.begin(); p
!= keys
.end(); ++p
) {
10956 final_key
.resize(base_key_len
); // keep prefix
10959 if (db
->get(prefix
, final_key
, &val
) >= 0) {
10960 dout(30) << __func__
<< " have " << pretty_binary_string(final_key
)
10961 << " -> " << *p
<< dendl
;
10964 dout(30) << __func__
<< " miss " << pretty_binary_string(final_key
)
10965 << " -> " << *p
<< dendl
;
10970 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
10975 ObjectMap::ObjectMapIterator
BlueStore::get_omap_iterator(
10976 CollectionHandle
&c_
, ///< [in] collection
10977 const ghobject_t
&oid
///< [in] object
10980 Collection
*c
= static_cast<Collection
*>(c_
.get());
10981 dout(10) << __func__
<< " " << c
->get_cid() << " " << oid
<< dendl
;
10983 return ObjectMap::ObjectMapIterator();
10985 std::shared_lock
l(c
->lock
);
10986 OnodeRef o
= c
->get_onode(oid
, false);
10987 if (!o
|| !o
->exists
) {
10988 dout(10) << __func__
<< " " << oid
<< "doesn't exist" <<dendl
;
10989 return ObjectMap::ObjectMapIterator();
10992 dout(10) << __func__
<< " has_omap = " << (int)o
->onode
.has_omap() <<dendl
;
10993 KeyValueDB::Iterator it
= db
->get_iterator(o
->get_omap_prefix());
10994 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c
, o
, it
));
10997 // -----------------
11000 uint64_t BlueStore::_get_ondisk_reserved() const {
11001 ceph_assert(min_alloc_size
);
11002 return round_up_to(
11003 std::max
<uint64_t>(SUPER_RESERVED
, min_alloc_size
), min_alloc_size
);
11006 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction
& t
)
11008 dout(10) << __func__
<< " ondisk_format " << ondisk_format
11009 << " min_compat_ondisk_format " << min_compat_ondisk_format
11011 ceph_assert(ondisk_format
== latest_ondisk_format
);
11014 encode(ondisk_format
, bl
);
11015 t
->set(PREFIX_SUPER
, "ondisk_format", bl
);
11019 encode(min_compat_ondisk_format
, bl
);
11020 t
->set(PREFIX_SUPER
, "min_compat_ondisk_format", bl
);
11024 int BlueStore::_open_super_meta()
11030 db
->get(PREFIX_SUPER
, "nid_max", &bl
);
11031 auto p
= bl
.cbegin();
11036 } catch (ceph::buffer::error
& e
) {
11037 derr
<< __func__
<< " unable to read nid_max" << dendl
;
11040 dout(1) << __func__
<< " old nid_max " << nid_max
<< dendl
;
11041 nid_last
= nid_max
.load();
11048 db
->get(PREFIX_SUPER
, "blobid_max", &bl
);
11049 auto p
= bl
.cbegin();
11054 } catch (ceph::buffer::error
& e
) {
11055 derr
<< __func__
<< " unable to read blobid_max" << dendl
;
11058 dout(1) << __func__
<< " old blobid_max " << blobid_max
<< dendl
;
11059 blobid_last
= blobid_max
.load();
11065 db
->get(PREFIX_SUPER
, "freelist_type", &bl
);
11067 freelist_type
= std::string(bl
.c_str(), bl
.length());
11068 dout(1) << __func__
<< " freelist_type " << freelist_type
<< dendl
;
11070 ceph_abort_msg("Not Support extent freelist manager");
11075 int32_t compat_ondisk_format
= 0;
11078 int r
= db
->get(PREFIX_SUPER
, "ondisk_format", &bl
);
11080 // base case: kraken bluestore is v1 and readable by v1
11081 dout(20) << __func__
<< " missing ondisk_format; assuming kraken"
11084 compat_ondisk_format
= 1;
11086 auto p
= bl
.cbegin();
11088 decode(ondisk_format
, p
);
11089 } catch (ceph::buffer::error
& e
) {
11090 derr
<< __func__
<< " unable to read ondisk_format" << dendl
;
11095 r
= db
->get(PREFIX_SUPER
, "min_compat_ondisk_format", &bl
);
11097 auto p
= bl
.cbegin();
11099 decode(compat_ondisk_format
, p
);
11100 } catch (ceph::buffer::error
& e
) {
11101 derr
<< __func__
<< " unable to read compat_ondisk_format" << dendl
;
11106 dout(1) << __func__
<< " ondisk_format " << ondisk_format
11107 << " compat_ondisk_format " << compat_ondisk_format
11111 if (latest_ondisk_format
< compat_ondisk_format
) {
11112 derr
<< __func__
<< " compat_ondisk_format is "
11113 << compat_ondisk_format
<< " but we only understand version "
11114 << latest_ondisk_format
<< dendl
;
11120 db
->get(PREFIX_SUPER
, "min_alloc_size", &bl
);
11121 auto p
= bl
.cbegin();
11125 min_alloc_size
= val
;
11126 min_alloc_size_order
= ctz(val
);
11127 ceph_assert(min_alloc_size
== 1u << min_alloc_size_order
);
11128 } catch (ceph::buffer::error
& e
) {
11129 derr
<< __func__
<< " unable to read min_alloc_size" << dendl
;
11132 dout(1) << __func__
<< " min_alloc_size 0x" << std::hex
<< min_alloc_size
11133 << std::dec
<< dendl
;
11136 _set_per_pool_omap();
11139 _set_alloc_sizes();
11140 _set_throttle_params();
11143 _set_compression();
11150 int BlueStore::_upgrade_super()
11152 dout(1) << __func__
<< " from " << ondisk_format
<< ", latest "
11153 << latest_ondisk_format
<< dendl
;
11154 if (ondisk_format
< latest_ondisk_format
) {
11155 ceph_assert(ondisk_format
> 0);
11156 ceph_assert(ondisk_format
< latest_ondisk_format
);
11158 KeyValueDB::Transaction t
= db
->get_transaction();
11159 if (ondisk_format
== 1) {
11161 // - super: added ondisk_format
11162 // - super: added min_readable_ondisk_format
11163 // - super: added min_compat_ondisk_format
11164 // - super: added min_alloc_size
11165 // - super: removed min_min_alloc_size
11168 db
->get(PREFIX_SUPER
, "min_min_alloc_size", &bl
);
11169 auto p
= bl
.cbegin();
11173 min_alloc_size
= val
;
11174 } catch (ceph::buffer::error
& e
) {
11175 derr
<< __func__
<< " failed to read min_min_alloc_size" << dendl
;
11178 t
->set(PREFIX_SUPER
, "min_alloc_size", bl
);
11179 t
->rmkey(PREFIX_SUPER
, "min_min_alloc_size");
11183 if (ondisk_format
== 2) {
11185 // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
11186 // oondes are using the per-pool prefix until a repair is run; at that
11187 // point the per_pool_omap=1 key will be set.
11188 // - super: added per_pool_omap key, which indicates that *all* objects
11189 // are using the new prefix and key format
11192 if (ondisk_format
== 3) {
11194 // - FreelistManager keeps meta within bdev label
11195 int r
= _write_out_fm_meta(0);
11196 ceph_assert(r
== 0);
11199 // This to be the last operation
11200 _prepare_ondisk_format_super(t
);
11201 int r
= db
->submit_transaction_sync(t
);
11202 ceph_assert(r
== 0);
11205 dout(1) << __func__
<< " done" << dendl
;
11209 void BlueStore::_assign_nid(TransContext
*txc
, OnodeRef o
)
11211 if (o
->onode
.nid
) {
11212 ceph_assert(o
->exists
);
11215 uint64_t nid
= ++nid_last
;
11216 dout(20) << __func__
<< " " << nid
<< dendl
;
11217 o
->onode
.nid
= nid
;
11218 txc
->last_nid
= nid
;
11222 uint64_t BlueStore::_assign_blobid(TransContext
*txc
)
11224 uint64_t bid
= ++blobid_last
;
11225 dout(20) << __func__
<< " " << bid
<< dendl
;
11226 txc
->last_blobid
= bid
;
11230 void BlueStore::get_db_statistics(Formatter
*f
)
11232 db
->get_statistics(f
);
11235 BlueStore::TransContext
*BlueStore::_txc_create(
11236 Collection
*c
, OpSequencer
*osr
,
11237 list
<Context
*> *on_commits
,
11238 TrackedOpRef osd_op
)
11240 TransContext
*txc
= new TransContext(cct
, c
, osr
, on_commits
);
11241 txc
->t
= db
->get_transaction();
11244 if (osd_op
&& osd_op
->pg_trace
) {
11245 txc
->trace
.init("TransContext", &trace_endpoint
,
11246 &osd_op
->pg_trace
);
11247 txc
->trace
.event("txc create");
11248 txc
->trace
.keyval("txc seq", txc
->seq
);
11252 osr
->queue_new(txc
);
11253 dout(20) << __func__
<< " osr " << osr
<< " = " << txc
11254 << " seq " << txc
->seq
<< dendl
;
11258 void BlueStore::_txc_calc_cost(TransContext
*txc
)
11260 // one "io" for the kv commit
11261 auto ios
= 1 + txc
->ioc
.get_num_ios();
11262 auto cost
= throttle_cost_per_io
.load();
11263 txc
->cost
= ios
* cost
+ txc
->bytes
;
11265 dout(10) << __func__
<< " " << txc
<< " cost " << txc
->cost
<< " ("
11266 << ios
<< " ios * " << cost
<< " + " << txc
->bytes
11267 << " bytes)" << dendl
;
11270 void BlueStore::_txc_update_store_statfs(TransContext
*txc
)
11272 if (txc
->statfs_delta
.is_empty())
11275 logger
->inc(l_bluestore_allocated
, txc
->statfs_delta
.allocated());
11276 logger
->inc(l_bluestore_stored
, txc
->statfs_delta
.stored());
11277 logger
->inc(l_bluestore_compressed
, txc
->statfs_delta
.compressed());
11278 logger
->inc(l_bluestore_compressed_allocated
, txc
->statfs_delta
.compressed_allocated());
11279 logger
->inc(l_bluestore_compressed_original
, txc
->statfs_delta
.compressed_original());
11282 txc
->statfs_delta
.encode(bl
);
11283 if (per_pool_stat_collection
) {
11285 get_pool_stat_key(txc
->osd_pool_id
, &key
);
11286 txc
->t
->merge(PREFIX_STAT
, key
, bl
);
11288 std::lock_guard
l(vstatfs_lock
);
11289 auto& stats
= osd_pools
[txc
->osd_pool_id
];
11290 stats
+= txc
->statfs_delta
;
11292 vstatfs
+= txc
->statfs_delta
; //non-persistent in this mode
11295 txc
->t
->merge(PREFIX_STAT
, BLUESTORE_GLOBAL_STATFS_KEY
, bl
);
11297 std::lock_guard
l(vstatfs_lock
);
11298 vstatfs
+= txc
->statfs_delta
;
11300 txc
->statfs_delta
.reset();
11303 void BlueStore::_txc_state_proc(TransContext
*txc
)
11306 dout(10) << __func__
<< " txc " << txc
11307 << " " << txc
->get_state_name() << dendl
;
11308 switch (txc
->get_state()) {
11309 case TransContext::STATE_PREPARE
:
11310 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_prepare_lat
);
11311 if (txc
->ioc
.has_pending_aios()) {
11312 txc
->set_state(TransContext::STATE_AIO_WAIT
);
11315 txc
->trace
.keyval("pending aios", txc
->ioc
.num_pending
.load());
11318 txc
->had_ios
= true;
11319 _txc_aio_submit(txc
);
11324 case TransContext::STATE_AIO_WAIT
:
11326 mono_clock::duration lat
= throttle
.log_state_latency(
11327 *txc
, logger
, l_bluestore_state_aio_wait_lat
);
11328 if (ceph::to_seconds
<double>(lat
) >= cct
->_conf
->bluestore_log_op_age
) {
11329 dout(0) << __func__
<< " slow aio_wait, txc = " << txc
11330 << ", latency = " << lat
11335 _txc_finish_io(txc
); // may trigger blocked txc's too
11338 case TransContext::STATE_IO_DONE
:
11339 ceph_assert(ceph_mutex_is_locked(txc
->osr
->qlock
)); // see _txc_finish_io
11340 if (txc
->had_ios
) {
11341 ++txc
->osr
->txc_with_unstable_io
;
11343 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_io_done_lat
);
11344 txc
->set_state(TransContext::STATE_KV_QUEUED
);
11345 if (cct
->_conf
->bluestore_sync_submit_transaction
) {
11346 if (txc
->last_nid
>= nid_max
||
11347 txc
->last_blobid
>= blobid_max
) {
11348 dout(20) << __func__
11349 << " last_{nid,blobid} exceeds max, submit via kv thread"
11351 } else if (txc
->osr
->kv_committing_serially
) {
11352 dout(20) << __func__
<< " prior txc submitted via kv thread, us too"
11354 // note: this is starvation-prone. once we have a txc in a busy
11355 // sequencer that is committing serially it is possible to keep
11356 // submitting new transactions fast enough that we get stuck doing
11357 // so. the alternative is to block here... fixme?
11358 } else if (txc
->osr
->txc_with_unstable_io
) {
11359 dout(20) << __func__
<< " prior txc(s) with unstable ios "
11360 << txc
->osr
->txc_with_unstable_io
.load() << dendl
;
11361 } else if (cct
->_conf
->bluestore_debug_randomize_serial_transaction
&&
11362 rand() % cct
->_conf
->bluestore_debug_randomize_serial_transaction
11364 dout(20) << __func__
<< " DEBUG randomly forcing submit via kv thread"
11367 _txc_apply_kv(txc
, true);
11371 std::lock_guard
l(kv_lock
);
11372 kv_queue
.push_back(txc
);
11373 if (!kv_sync_in_progress
) {
11374 kv_sync_in_progress
= true;
11375 kv_cond
.notify_one();
11377 if (txc
->get_state() != TransContext::STATE_KV_SUBMITTED
) {
11378 kv_queue_unsubmitted
.push_back(txc
);
11379 ++txc
->osr
->kv_committing_serially
;
11383 kv_throttle_costs
+= txc
->cost
;
11386 case TransContext::STATE_KV_SUBMITTED
:
11387 _txc_committed_kv(txc
);
11390 case TransContext::STATE_KV_DONE
:
11391 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_kv_done_lat
);
11392 if (txc
->deferred_txn
) {
11393 txc
->set_state(TransContext::STATE_DEFERRED_QUEUED
);
11394 _deferred_queue(txc
);
11397 txc
->set_state(TransContext::STATE_FINISHING
);
11400 case TransContext::STATE_DEFERRED_CLEANUP
:
11401 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_deferred_cleanup_lat
);
11402 txc
->set_state(TransContext::STATE_FINISHING
);
11405 case TransContext::STATE_FINISHING
:
11406 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_finishing_lat
);
11411 derr
<< __func__
<< " unexpected txc " << txc
11412 << " state " << txc
->get_state_name() << dendl
;
11413 ceph_abort_msg("unexpected txc state");
// _txc_finish_io: mark a TransContext's aio as complete and push it (and any
// consecutive IO_DONE successors) through the state machine in sequencer order.
// NOTE(review): this text is a damaged extraction — logical lines are split
// across physical lines and the embedded numbering (11419, 11421, ...) shows
// gaps where whole lines (braces, dendl terminators, control flow) were
// dropped; reconcile against the pristine file before editing code.
11419 void BlueStore::_txc_finish_io(TransContext
*txc
)
11421 dout(20) << __func__
<< " " << txc
<< dendl
;
// Ordering invariant: kv transactions must be submitted in sequencer order
// even though the underlying aio completes in arbitrary order.
11424 * we need to preserve the order of kv transactions,
11425 * even though aio will complete in any order.
11428 OpSequencer
*osr
= txc
->osr
.get();
// osr->qlock guards the sequencer's txc queue walked below.
11429 std::lock_guard
l(osr
->qlock
);
11430 txc
->set_state(TransContext::STATE_IO_DONE
);
11431 txc
->ioc
.release_running_aios();
11432 OpSequencer::q_list_t::iterator p
= osr
->q
.iterator_to(*txc
);
// Scan backwards from this txc: if a preceding txc has not reached IO_DONE
// yet, this one stays blocked behind it (the early-exit line was dropped by
// extraction — confirm against the original).
11433 while (p
!= osr
->q
.begin()) {
11435 if (p
->get_state() < TransContext::STATE_IO_DONE
) {
11436 dout(20) << __func__
<< " " << txc
<< " blocked by " << &*p
<< " "
11437 << p
->get_state_name() << dendl
;
11440 if (p
->get_state() > TransContext::STATE_IO_DONE
) {
// Advance every consecutive txc that is exactly in IO_DONE.
11446 _txc_state_proc(&*p
++);
11447 } while (p
!= osr
->q
.end() &&
11448 p
->get_state() == TransContext::STATE_IO_DONE
);
// Wake waiters blocked on kv submission progress (see kv_submitted_waiters
// users in _txc_apply_kv).
11450 if (osr
->kv_submitted_waiters
) {
11451 osr
->qcond
.notify_all();
// _txc_write_nodes: serialize this transaction's dirty onodes and shared
// blobs into the KeyValueDB transaction t.
// NOTE(review): damaged extraction — line numbering gaps indicate dropped
// lines (braces, dendl terminators, a local key/bl declaration); reconcile
// against the pristine file before editing code.
11455 void BlueStore::_txc_write_nodes(TransContext
*txc
, KeyValueDB::Transaction t
)
11457 dout(20) << __func__
<< " txc " << txc
11458 << " onodes " << txc
->onodes
11459 << " shared_blobs " << txc
->shared_blobs
// Persist each dirty onode; flushing_count is decremented again in
// _txc_apply_kv once the kv submit is done.
11463 for (auto o
: txc
->onodes
) {
11464 _record_onode(o
, t
);
11465 o
->flushing_count
++;
11468 // objects we modified but didn't affect the onode
11469 auto p
= txc
->modified_objects
.begin();
11470 while (p
!= txc
->modified_objects
.end()) {
11471 if (txc
->onodes
.count(*p
) == 0) {
11472 (*p
)->flushing_count
++;
11475 // remove dups with onodes list to avoid problems in _txc_finish
11476 p
= txc
->modified_objects
.erase(p
);
11480 // finalize shared_blobs
// Empty persistent shared-blob records are deleted; non-empty ones are
// re-encoded and written under PREFIX_SHARED_BLOB.
11481 for (auto sb
: txc
->shared_blobs
) {
11483 auto sbid
= sb
->get_sbid();
11484 get_shared_blob_key(sbid
, &key
);
11485 if (sb
->persistent
->empty()) {
11486 dout(20) << __func__
<< " shared_blob 0x"
11487 << std::hex
<< sbid
<< std::dec
11488 << " is empty" << dendl
;
11489 t
->rmkey(PREFIX_SHARED_BLOB
, key
);
11492 encode(*(sb
->persistent
), bl
);
11493 dout(20) << __func__
<< " shared_blob 0x"
11494 << std::hex
<< sbid
<< std::dec
11495 << " is " << bl
.length() << " " << *sb
<< dendl
;
11496 t
->set(PREFIX_SHARED_BLOB
, key
, bl
);
11501 void BlueStore::BSPerfTracker::update_from_perfcounters(
11502 PerfCounters
&logger
)
11504 os_commit_latency_ns
.consume_next(
11505 logger
.get_tavg_ns(
11506 l_bluestore_commit_lat
));
11507 os_apply_latency_ns
.consume_next(
11508 logger
.get_tavg_ns(
11509 l_bluestore_commit_lat
));
11512 // For every object we maintain <zone_num+oid, offset> tuple in the key-value
11513 // store. When a new object written to a zone, we insert the corresponding
11514 // tuple to the database. When an object is truncated, we remove the
11515 // corresponding tuple. When an object is overwritten, we remove the old tuple
11516 // and insert a new tuple corresponding to the new location of the object. The
11517 // cleaner can now identify live objects within the zone <zone_num> by
11518 // enumerating all the keys starting with <zone_num> prefix.
// _zoned_update_cleaning_metadata: maintain the <zone,oid> -> offset tuples
// the zone cleaner uses to enumerate live objects (see the comment block
// above this function in the file).
// NOTE(review): damaged extraction — numbering gaps (11521, 11524, 11528)
// show dropped lines; per the set/rmkey pair below, a positive offset appears
// to record a new location and a negative one a removal at -offset, but the
// conditional itself was dropped — confirm against the pristine file.
11519 void BlueStore::_zoned_update_cleaning_metadata(TransContext
*txc
) {
11520 for (const auto &[o
, offsets
] : txc
->zoned_onode_to_offset_map
) {
11522 get_object_key(cct
, o
->oid
, &key
);
11523 for (auto offset
: offsets
) {
11525 bufferlist offset_bl
;
11526 encode(offset
, offset_bl
);
// Insert the tuple for the object's (new) location in its zone.
11527 txc
->t
->set(_zoned_get_prefix(offset
), key
, offset_bl
);
// Remove the tuple for the old location (encoded as a negated offset).
11529 txc
->t
->rmkey(_zoned_get_prefix(-offset
), key
);
11535 std::string
BlueStore::_zoned_get_prefix(uint64_t offset
) {
11536 uint64_t zone_num
= offset
/ bdev
->get_zone_size();
11537 std::string zone_key
;
11538 _key_encode_u64(zone_num
, &zone_key
);
11539 return PREFIX_ZONED_CL_INFO
+ zone_key
;
11542 // For now, to avoid interface changes we piggyback zone_size (in MiB) and the
11543 // first sequential zone number onto min_alloc_size and pass it to functions
11544 // Allocator::create and FreelistManager::create.
11545 uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size
) {
11546 uint64_t zone_size
= bdev
->get_zone_size();
11547 uint64_t zone_size_mb
= zone_size
/ (1024 * 1024);
11548 uint64_t first_seq_zone
= bdev
->get_conventional_region_size() / zone_size
;
11549 min_alloc_size
|= (zone_size_mb
<< 32);
11550 min_alloc_size
|= (first_seq_zone
<< 48);
11551 return min_alloc_size
;
// _zoned_check_config_settings: validate config options that are mandatory
// for host-managed SMR (HM-SMR) drives; logs a complaint for each violation.
// NOTE(review): damaged extraction — the numbering gaps (11559-11561,
// 11569-11571, 11578+) show that the error-return statements and the final
// success return were dropped; presumably each branch returns a negative
// errno — confirm against the pristine file.
11554 int BlueStore::_zoned_check_config_settings() {
// Only the "zoned" allocator understands sequential-write zones.
11555 if (cct
->_conf
->bluestore_allocator
!= "zoned") {
11556 dout(1) << __func__
<< " The drive is HM-SMR but "
11557 << cct
->_conf
->bluestore_allocator
<< " allocator is specified. "
11558 << "Only zoned allocator can be used with HM-SMR drive." << dendl
;
11562 // At least for now we want to use large min_alloc_size with HM-SMR drives.
11563 // Populating used_blocks bitset on a debug build of ceph-osd takes about 5
11564 // minutes with a 14 TB HM-SMR drive and 4 KiB min_alloc_size.
11565 if (min_alloc_size
< 64 * 1024) {
11566 dout(1) << __func__
<< " The drive is HM-SMR but min_alloc_size is "
11567 << min_alloc_size
<< ". "
11568 << "Please set to at least 64 KiB." << dendl
;
11572 // We don't want to defer writes with HM-SMR because it violates sequential
11573 // write requirement.
11574 if (prefer_deferred_size
) {
11575 dout(1) << __func__
<< " The drive is HM-SMR but prefer_deferred_size is "
11576 << prefer_deferred_size
<< ". "
11577 << "Please set to 0." << dendl
;
// _txc_finalize_kv: fold the transaction's allocation/release interval sets
// into the freelist manager's kv transaction, then update store statfs.
// NOTE(review): damaged extraction — numbering gaps show dropped lines
// (braces, loop-increment lines, dendl terminators); reconcile against the
// pristine file before editing code.
11583 void BlueStore::_txc_finalize_kv(TransContext
*txc
, KeyValueDB::Transaction t
)
11585 dout(20) << __func__
<< " txc " << txc
<< std::hex
11586 << " allocated 0x" << txc
->allocated
11587 << " released 0x" << txc
->released
11588 << std::dec
<< dendl
;
11590 // We have to handle the case where we allocate *and* deallocate the
11591 // same region in this transaction. The freelist doesn't like that.
11592 // (Actually, the only thing that cares is the BitmapFreelistManager
11593 // debug check. But that's important.)
11594 interval_set
<uint64_t> tmp_allocated
, tmp_released
;
// By default operate on the txc's own sets; only when alloc and release
// overlap do we build reduced copies with the overlap subtracted.
11595 interval_set
<uint64_t> *pallocated
= &txc
->allocated
;
11596 interval_set
<uint64_t> *preleased
= &txc
->released
;
11597 if (!txc
->allocated
.empty() && !txc
->released
.empty()) {
11598 interval_set
<uint64_t> overlap
;
11599 overlap
.intersection_of(txc
->allocated
, txc
->released
);
11600 if (!overlap
.empty()) {
11601 tmp_allocated
= txc
->allocated
;
11602 tmp_allocated
.subtract(overlap
);
11603 tmp_released
= txc
->released
;
11604 tmp_released
.subtract(overlap
);
11605 dout(20) << __func__
<< " overlap 0x" << std::hex
<< overlap
11606 << ", new allocated 0x" << tmp_allocated
11607 << " released 0x" << tmp_released
<< std::dec
11609 pallocated
= &tmp_allocated
;
11610 preleased
= &tmp_released
;
11614 // update freelist with non-overlap sets
11615 for (interval_set
<uint64_t>::iterator p
= pallocated
->begin();
11616 p
!= pallocated
->end();
11618 fm
->allocate(p
.get_start(), p
.get_len(), t
);
11620 for (interval_set
<uint64_t>::iterator p
= preleased
->begin();
11621 p
!= preleased
->end();
11623 dout(20) << __func__
<< " release 0x" << std::hex
<< p
.get_start()
11624 << "~" << p
.get_len() << std::dec
<< dendl
;
11625 fm
->release(p
.get_start(), p
.get_len(), t
);
// HM-SMR devices additionally track per-zone live-object tuples for the
// zone cleaner (see _zoned_update_cleaning_metadata).
11628 if (bdev
->is_smr()) {
11629 _zoned_update_cleaning_metadata(txc
);
11632 _txc_update_store_statfs(txc
);
// _txc_apply_kv: submit the txc's kv transaction (asynchronously) and move it
// from KV_QUEUED to KV_SUBMITTED; afterwards drop the onode flushing counts
// taken in _txc_write_nodes and wake flush waiters.
// NOTE(review): damaged extraction — numbering gaps show dropped lines
// (#endif lines, the tracepoint call header, braces); reconcile against the
// pristine file before editing code.
11635 void BlueStore::_txc_apply_kv(TransContext
*txc
, bool sync_submit_transaction
)
11637 ceph_assert(txc
->get_state() == TransContext::STATE_KV_QUEUED
)
;
11639 #if defined(WITH_LTTNG)
11640 auto start
= mono_clock::now();
11645 txc
->trace
.event("db async submit");
// bluestore_debug_omit_kv_commit short-circuits the actual db submit (debug
// aid); otherwise submit the accumulated transaction.
11649 int r
= cct
->_conf
->bluestore_debug_omit_kv_commit
? 0 : db
->submit_transaction(txc
->t
);
11650 ceph_assert(r
== 0);
11651 txc
->set_state(TransContext::STATE_KV_SUBMITTED
);
// Wake anyone waiting (under osr->qlock) for this txc to reach KV_SUBMITTED.
11652 if (txc
->osr
->kv_submitted_waiters
) {
11653 std::lock_guard
l(txc
->osr
->qlock
);
11654 txc
->osr
->qcond
.notify_all();
11657 #if defined(WITH_LTTNG)
11658 if (txc
->tracing
) {
11661 transaction_kv_submit_latency
,
11662 txc
->osr
->get_sequencer_id(),
11664 sync_submit_transaction
,
11665 ceph::to_seconds
<double>(mono_clock::now() - start
));
// Drop the flushing_count taken per onode/modified object in
// _txc_write_nodes; when it hits zero, wake any flush() waiters.
11670 for (auto ls
: { &txc
->onodes
, &txc
->modified_objects
}) {
11671 for (auto& o
: *ls
) {
11672 dout(20) << __func__
<< " onode " << o
<< " had " << o
->flushing_count
11674 if (--o
->flushing_count
== 0 && o
->waiting_count
.load()) {
11675 std::lock_guard
l(o
->flush_lock
);
11676 o
->flush_cond
.notify_all();
// _txc_committed_kv: the txc's kv transaction is durable — move it to
// KV_DONE, dispatch its commit callbacks, and record commit latency.
// NOTE(review): damaged extraction — the log_latency_fn wrapper around the
// trailing lambda (11698-11702) lost its header/closing lines; reconcile
// against the pristine file before editing code.
11682 void BlueStore::_txc_committed_kv(TransContext
*txc
)
11684 dout(20) << __func__
<< " txc " << txc
<< dendl
;
11685 throttle
.complete_kv(*txc
);
11687 std::lock_guard
l(txc
->osr
->qlock
);
11688 txc
->set_state(TransContext::STATE_KV_DONE
);
// Prefer the collection's commit_queue when present; otherwise fall back to
// the store-wide finisher for the oncommit contexts.
11689 if (txc
->ch
->commit_queue
) {
11690 txc
->ch
->commit_queue
->queue(txc
->oncommits
);
11692 finisher
.queue(txc
->oncommits
);
11695 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_kv_committing_lat
);
// Whole-commit latency, warning on ops older than bluestore_log_op_age;
// the lambda fragment below supplies extra context for the slow-op log.
11698 l_bluestore_commit_lat
,
11699 mono_clock::now() - txc
->start
,
11700 cct
->_conf
->bluestore_log_op_age
,
11702 return ", txc = " + stringify(txc
);
// _txc_finish: final stage of the txc state machine — finish shared-blob
// writes, reap removed collections, retire every leading DONE txc from the
// sequencer queue, release allocations, and reap empty zombie sequencers.
// NOTE(review): damaged extraction — numbering gaps show dropped lines
// (braces, break/continue statements, dendl terminators, assignments to
// notify/empty); reconcile against the pristine file before editing code.
11707 void BlueStore::_txc_finish(TransContext
*txc
)
11709 dout(20) << __func__
<< " " << txc
<< " onodes " << txc
->onodes
<< dendl
;
11710 ceph_assert(txc
->get_state() == TransContext::STATE_FINISHING
);
11712 for (auto& sb
: txc
->shared_blobs_written
) {
11713 sb
->finish_write(txc
->seq
);
11715 txc
->shared_blobs_written
.clear();
11717 while (!txc
->removed_collections
.empty()) {
11718 _queue_reap_collection(txc
->removed_collections
.front());
11719 txc
->removed_collections
.pop_front();
11722 OpSequencerRef osr
= txc
->osr
;
11723 bool empty
= false;
11724 bool submit_deferred
= false;
11725 OpSequencer::q_list_t releasing_txc
;
// Under osr->qlock: pop every leading txc that has reached DONE off the
// sequencer queue into releasing_txc (popping must be ordered).
11727 std::lock_guard
l(osr
->qlock
);
11728 txc
->set_state(TransContext::STATE_DONE
);
11729 bool notify
= false;
11730 while (!osr
->q
.empty()) {
// note: this inner txc intentionally shadows the parameter.
11731 TransContext
*txc
= &osr
->q
.front();
11732 dout(20) << __func__
<< " txc " << txc
<< " " << txc
->get_state_name()
11734 if (txc
->get_state() != TransContext::STATE_DONE
) {
11735 if (txc
->get_state() == TransContext::STATE_PREPARE
&&
11736 deferred_aggressive
) {
11737 // for _osr_drain_preceding()
11740 if (txc
->get_state() == TransContext::STATE_DEFERRED_QUEUED
&&
11741 osr
->q
.size() > g_conf()->bluestore_max_deferred_txc
) {
11742 submit_deferred
= true;
11747 osr
->q
.pop_front();
11748 releasing_txc
.push_back(*txc
);
11751 if (osr
->q
.empty()) {
11752 dout(20) << __func__
<< " osr " << osr
<< " q now empty" << dendl
;
11756 // only drain()/drain_preceding() need wakeup,
11757 // other cases use kv_submitted_waiters
11758 if (notify
|| empty
) {
11759 osr
->qcond
.notify_all();
11763 while (!releasing_txc
.empty()) {
11764 // release to allocator only after all preceding txc's have also
11765 // finished any deferred writes that potentially land in these
11767 auto txc
= &releasing_txc
.front();
11768 _txc_release_alloc(txc
);
11769 releasing_txc
.pop_front();
11770 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_done_lat
);
11771 throttle
.complete(*txc
);
11775 if (submit_deferred
) {
11776 // we're pinning memory; flush! we could be more fine-grained here but
11777 // i'm not sure it's worth the bother.
11778 deferred_try_submit();
// An empty zombie sequencer (its collection was removed) can be reaped now.
11781 if (empty
&& osr
->zombie
) {
11782 std::lock_guard
l(zombie_osr_lock
);
11783 if (zombie_osr_set
.erase(osr
->cid
)) {
11784 dout(10) << __func__
<< " reaping empty zombie osr " << osr
<< dendl
;
11786 dout(10) << __func__
<< " empty zombie osr " << osr
<< " already reaped"
// _txc_release_alloc: return the txc's released extents to the device
// (discard) and/or the allocator, then clear both interval sets.
// NOTE(review): damaged extraction — the declaration of `r` (original line
// 11796) and several brace/dendl lines were dropped; reconcile against the
// pristine file before editing code.
11792 void BlueStore::_txc_release_alloc(TransContext
*txc
)
11794 // it's expected we're called with lazy_release_lock already taken!
// bluestore_debug_no_reuse_blocks is a debug aid that leaks blocks on
// purpose; normally we proceed to discard/release.
11795 if (likely(!cct
->_conf
->bluestore_debug_no_reuse_blocks
)) {
// Async discard path: queue the whole released set on the bdev.
11797 if (cct
->_conf
->bdev_enable_discard
&& cct
->_conf
->bdev_async_discard
) {
11798 r
= bdev
->queue_discard(txc
->released
);
11800 dout(10) << __func__
<< "(queued) " << txc
<< " " << std::hex
11801 << txc
->released
<< std::dec
<< dendl
;
// Sync discard path: issue one discard per released extent.
11804 } else if (cct
->_conf
->bdev_enable_discard
) {
11805 for (auto p
= txc
->released
.begin(); p
!= txc
->released
.end(); ++p
) {
11806 bdev
->discard(p
.get_start(), p
.get_len());
11809 dout(10) << __func__
<< "(sync) " << txc
<< " " << std::hex
11810 << txc
->released
<< std::dec
<< dendl
;
// Hand the extents back to the shared allocator.
11811 shared_alloc
.a
->release(txc
->released
);
11815 txc
->allocated
.clear();
11816 txc
->released
.clear();
// _osr_attach: give collection c an OpSequencer — reuse the one from an
// existing collection with the same cid, resurrect a matching zombie, or
// create a fresh sequencer.
// NOTE(review): damaged extraction — else/brace lines were dropped between
// the three branches; reconcile against the pristine file before editing.
11819 void BlueStore::_osr_attach(Collection
*c
)
11821 // note: caller has RWLock on coll_map
11822 auto q
= coll_map
.find(c
->cid
);
11823 if (q
!= coll_map
.end()) {
// Same cid already open: share its sequencer so ordering is preserved.
11824 c
->osr
= q
->second
->osr
;
11825 ldout(cct
, 10) << __func__
<< " " << c
->cid
11826 << " reusing osr " << c
->osr
<< " from existing coll "
11827 << q
->second
<< dendl
;
11829 std::lock_guard
l(zombie_osr_lock
);
11830 auto p
= zombie_osr_set
.find(c
->cid
);
11831 if (p
== zombie_osr_set
.end()) {
// No zombie for this cid: brand-new sequencer.
11832 c
->osr
= ceph::make_ref
<OpSequencer
>(this, next_sequencer_id
++, c
->cid
);
11833 ldout(cct
, 10) << __func__
<< " " << c
->cid
11834 << " fresh osr " << c
->osr
<< dendl
;
// Zombie found: resurrect it (see _osr_register_zombie) so in-flight txcs
// for the removed incarnation stay ordered with the new one.
11836 c
->osr
= p
->second
;
11837 zombie_osr_set
.erase(p
);
11838 ldout(cct
, 10) << __func__
<< " " << c
->cid
11839 << " resurrecting zombie osr " << c
->osr
<< dendl
;
11840 c
->osr
->zombie
= false;
11845 void BlueStore::_osr_register_zombie(OpSequencer
*osr
)
11847 std::lock_guard
l(zombie_osr_lock
);
11848 dout(10) << __func__
<< " " << osr
<< " " << osr
->cid
<< dendl
;
11849 osr
->zombie
= true;
11850 auto i
= zombie_osr_set
.emplace(osr
->cid
, osr
);
11851 // this is either a new insertion or the same osr is already there
11852 ceph_assert(i
.second
|| i
.first
->second
== osr
);
// _osr_drain_preceding: block until every txc ahead of txc in its sequencer
// has finished; temporarily submits deferred writes aggressively so they
// cannot stall the drain.
// NOTE(review): damaged extraction — scope braces around the kv_lock block
// were dropped; reconcile against the pristine file before editing code.
11855 void BlueStore::_osr_drain_preceding(TransContext
*txc
)
11857 OpSequencer
*osr
= txc
->osr
.get();
11858 dout(10) << __func__
<< " " << txc
<< " osr " << osr
<< dendl
;
11859 ++deferred_aggressive
; // FIXME: maybe osr-local aggressive flag?
11861 // submit anything pending
11862 osr
->deferred_lock
.lock();
11863 if (osr
->deferred_pending
&& !osr
->deferred_running
) {
// _deferred_submit_unlock releases deferred_lock itself on this path.
11864 _deferred_submit_unlock(osr
);
11866 osr
->deferred_lock
.unlock();
11870 // wake up any previously finished deferred events
11871 std::lock_guard
l(kv_lock
);
11872 if (!kv_sync_in_progress
) {
11873 kv_sync_in_progress
= true;
11874 kv_cond
.notify_one();
// Actually wait for the predecessors to complete.
11877 osr
->drain_preceding(txc
);
11878 --deferred_aggressive
;
11879 dout(10) << __func__
<< " " << osr
<< " done" << dendl
;
// _osr_drain: block until every txc in this sequencer has finished; same
// structure as _osr_drain_preceding but drains the whole queue.
// NOTE(review): damaged extraction — the numbering gap 11900->11904 shows
// that the actual drain call (presumably osr->drain()) was dropped here;
// reconcile against the pristine file before editing code.
11882 void BlueStore::_osr_drain(OpSequencer
*osr
)
11884 dout(10) << __func__
<< " " << osr
<< dendl
;
11885 ++deferred_aggressive
; // FIXME: maybe osr-local aggressive flag?
11887 // submit anything pending
11888 osr
->deferred_lock
.lock();
11889 if (osr
->deferred_pending
&& !osr
->deferred_running
) {
// _deferred_submit_unlock releases deferred_lock itself on this path.
11890 _deferred_submit_unlock(osr
);
11892 osr
->deferred_lock
.unlock();
11896 // wake up any previously finished deferred events
11897 std::lock_guard
l(kv_lock
);
11898 if (!kv_sync_in_progress
) {
11899 kv_sync_in_progress
= true;
11900 kv_cond
.notify_one();
11904 --deferred_aggressive
;
11905 dout(10) << __func__
<< " " << osr
<< " done" << dendl
;
// _osr_drain_all: drain every sequencer in the store — active collections
// and zombies — then reap any zombies whose queues are now empty.
// NOTE(review): damaged extraction — scope braces and the per-osr drain call
// (numbering gap 11944->11947) were dropped; reconcile against the pristine
// file before editing code.
11908 void BlueStore::_osr_drain_all()
11910 dout(10) << __func__
<< dendl
;
11912 set
<OpSequencerRef
> s
;
11913 vector
<OpSequencerRef
> zombies
;
// Snapshot active sequencers under coll_lock...
11915 std::shared_lock
l(coll_lock
);
11916 for (auto& i
: coll_map
) {
11917 s
.insert(i
.second
->osr
);
// ...and zombie sequencers under zombie_osr_lock.
11921 std::lock_guard
l(zombie_osr_lock
);
11922 for (auto& i
: zombie_osr_set
) {
11923 s
.insert(i
.second
);
11924 zombies
.push_back(i
.second
);
11927 dout(20) << __func__
<< " osr_set " << s
<< dendl
;
11929 ++deferred_aggressive
;
11931 // submit anything pending
11932 deferred_try_submit();
11935 // wake up any previously finished deferred events
11936 std::lock_guard
l(kv_lock
);
11937 kv_cond
.notify_one();
11940 std::lock_guard
l(kv_finalize_lock
);
11941 kv_finalize_cond
.notify_one();
11943 for (auto osr
: s
) {
11944 dout(20) << __func__
<< " drain " << osr
<< dendl
;
11947 --deferred_aggressive
;
// Reap zombies that fully drained; a zombie that got resurrected in the
// meantime (by _osr_attach) is left alone.
11950 std::lock_guard
l(zombie_osr_lock
);
11951 for (auto& osr
: zombies
) {
11952 if (zombie_osr_set
.erase(osr
->cid
)) {
11953 dout(10) << __func__
<< " reaping empty zombie osr " << osr
<< dendl
;
11954 ceph_assert(osr
->q
.empty());
11955 } else if (osr
->zombie
) {
11956 dout(10) << __func__
<< " empty zombie osr " << osr
11957 << " already reaped" << dendl
;
11958 ceph_assert(osr
->q
.empty());
11960 dout(10) << __func__
<< " empty zombie osr " << osr
11961 << " resurrected" << dendl
;
11966 dout(10) << __func__
<< " done" << dendl
;
// _kv_start: spawn the kv sync and kv finalize worker threads.
// NOTE(review): damaged extraction — the numbering gap 11972->11975 shows
// one or more dropped lines here (upstream starts the finisher at this
// point — confirm against the pristine file).
11970 void BlueStore::_kv_start()
11972 dout(10) << __func__
<< dendl
;
11975 kv_sync_thread
.create("bstore_kv_sync");
11976 kv_finalize_thread
.create("bstore_kv_final");
// _kv_stop: stop and join both kv worker threads, then quiesce the finisher.
// NOTE(review): damaged extraction — the kv_cond.wait() body of the first
// while loop, the kv_stop flag assignments, and several scope braces were
// dropped (numbering gaps 11984->11988, 12000->12002->12006); reconcile
// against the pristine file before editing code.
11979 void BlueStore::_kv_stop()
11981 dout(10) << __func__
<< dendl
;
// Wait until the sync thread has actually started before signalling stop.
11983 std::unique_lock l
{kv_lock
};
11984 while (!kv_sync_started
) {
11988 kv_cond
.notify_all();
// Same handshake for the finalize thread.
11991 std::unique_lock l
{kv_finalize_lock
};
11992 while (!kv_finalize_started
) {
11993 kv_finalize_cond
.wait(l
);
11995 kv_finalize_stop
= true;
11996 kv_finalize_cond
.notify_all();
11998 kv_sync_thread
.join();
11999 kv_finalize_thread
.join();
12000 ceph_assert(removed_collections
.empty());
// Reset the stop flags so the threads can be restarted later.
12002 std::lock_guard
l(kv_lock
);
12006 std::lock_guard
l(kv_finalize_lock
);
12007 kv_finalize_stop
= false;
12009 dout(10) << __func__
<< " stopping finishers" << dendl
;
12010 finisher
.wait_for_empty();
12012 dout(10) << __func__
<< " stopped" << dendl
;
// _kv_sync_thread: the kv sync worker. Loop: batch up queued txcs and
// deferred-io completions, optionally flush the block device, submit any
// still-unsubmitted txcs, commit one synchronous "synct" transaction to make
// everything durable, then hand the batch to the finalize thread.
// NOTE(review): damaged extraction — the embedded numbering shows many
// dropped lines (loop headers, braces, bdev->flush() call, bufferlist
// declarations, tracepoint bodies, #endif lines); reconcile against the
// pristine file before editing code.
12015 void BlueStore::_kv_sync_thread()
12017 dout(10) << __func__
<< " start" << dendl
;
12018 deque
<DeferredBatch
*> deferred_stable_queue
; ///< deferred ios done + stable
// kv_lock is held except around the device flush / db submit below.
12019 std::unique_lock l
{kv_lock
};
12020 ceph_assert(!kv_sync_started
);
12021 kv_sync_started
= true;
12022 kv_cond
.notify_all();
// Utilization accounting: twait accumulates idle (sleeping) time over an
// observation period of bluestore_kv_sync_util_logging_s seconds.
12024 auto t0
= mono_clock::now();
12025 timespan twait
= ceph::make_timespan(0);
12026 size_t kv_submitted
= 0;
12029 auto period
= cct
->_conf
->bluestore_kv_sync_util_logging_s
;
12030 auto observation_period
=
12031 ceph::make_timespan(period
);
12032 auto elapsed
= mono_clock::now() - t0
;
12033 if (period
&& elapsed
>= observation_period
) {
12034 dout(5) << __func__
<< " utilization: idle "
12035 << twait
<< " of " << elapsed
12036 << ", submitted: " << kv_submitted
12038 t0
= mono_clock::now();
12039 twait
= ceph::make_timespan(0);
// Sleep when there is nothing to commit (and we are not being aggressive
// about flushing deferred io).
12042 ceph_assert(kv_committing
.empty());
12043 if (kv_queue
.empty() &&
12044 ((deferred_done_queue
.empty() && deferred_stable_queue
.empty()) ||
12045 !deferred_aggressive
)) {
12048 dout(20) << __func__
<< " sleep" << dendl
;
12049 auto t
= mono_clock::now();
12050 kv_sync_in_progress
= false;
12052 twait
+= mono_clock::now() - t
;
12054 dout(20) << __func__
<< " wake" << dendl
;
// Grab everything queued so far in one swap; costs is the throttle budget
// to release once we are committed.
12056 deque
<TransContext
*> kv_submitting
;
12057 deque
<DeferredBatch
*> deferred_done
, deferred_stable
;
12058 uint64_t aios
= 0, costs
= 0;
12060 dout(20) << __func__
<< " committing " << kv_queue
.size()
12061 << " submitting " << kv_queue_unsubmitted
.size()
12062 << " deferred done " << deferred_done_queue
.size()
12063 << " stable " << deferred_stable_queue
.size()
12065 kv_committing
.swap(kv_queue
);
12066 kv_submitting
.swap(kv_queue_unsubmitted
);
12067 deferred_done
.swap(deferred_done_queue
);
12068 deferred_stable
.swap(deferred_stable_queue
);
12070 costs
= kv_throttle_costs
;
12072 kv_throttle_costs
= 0;
12075 dout(30) << __func__
<< " committing " << kv_committing
<< dendl
;
12076 dout(30) << __func__
<< " submitting " << kv_submitting
<< dendl
;
12077 dout(30) << __func__
<< " deferred_done " << deferred_done
<< dendl
;
12078 dout(30) << __func__
<< " deferred_stable " << deferred_stable
<< dendl
;
12080 auto start
= mono_clock::now();
12082 bool force_flush
= false;
12083 // if bluefs is sharing the same device as data (only), then we
12084 // can rely on the bluefs commit to flush the device and make
12085 // deferred aios stable. that means that if we do have done deferred
12086 // txcs AND we are not on a single device, we need to force a flush.
12087 if (bluefs
&& bluefs_layout
.single_shared_device()) {
12089 force_flush
= true;
12090 } else if (kv_committing
.empty() && deferred_stable
.empty()) {
12091 force_flush
= true; // there's nothing else to commit!
12092 } else if (deferred_aggressive
) {
12093 force_flush
= true;
12096 if (aios
|| !deferred_done
.empty()) {
12097 force_flush
= true;
12099 dout(20) << __func__
<< " skipping flush (no aios, no deferred_done)" << dendl
;
12104 dout(20) << __func__
<< " num_aios=" << aios
12105 << " force_flush=" << (int)force_flush
12106 << ", flushing, deferred done->stable" << dendl
;
12107 // flush/barrier on block device
12110 // if we flush then deferred done are now deferred stable
12111 deferred_stable
.insert(deferred_stable
.end(), deferred_done
.begin(),
12112 deferred_done
.end());
12113 deferred_done
.clear();
12115 auto after_flush
= mono_clock::now();
12117 // we will use one final transaction to force a sync
12118 KeyValueDB::Transaction synct
= db
->get_transaction();
12120 // increase {nid,blobid}_max? note that this covers both the
12121 // case where we are approaching the max and the case we passed
12122 // it. in either case, we increase the max in the earlier txn
12124 uint64_t new_nid_max
= 0, new_blobid_max
= 0;
// Preallocate a new nid ceiling once we are halfway through the current one.
12125 if (nid_last
+ cct
->_conf
->bluestore_nid_prealloc
/2 > nid_max
) {
12126 KeyValueDB::Transaction t
=
12127 kv_submitting
.empty() ? synct
: kv_submitting
.front()->t
;
12128 new_nid_max
= nid_last
+ cct
->_conf
->bluestore_nid_prealloc
;
12130 encode(new_nid_max
, bl
);
12131 t
->set(PREFIX_SUPER
, "nid_max", bl
);
12132 dout(10) << __func__
<< " new_nid_max " << new_nid_max
<< dendl
;
// Same preallocation scheme for blob ids.
12134 if (blobid_last
+ cct
->_conf
->bluestore_blobid_prealloc
/2 > blobid_max
) {
12135 KeyValueDB::Transaction t
=
12136 kv_submitting
.empty() ? synct
: kv_submitting
.front()->t
;
12137 new_blobid_max
= blobid_last
+ cct
->_conf
->bluestore_blobid_prealloc
;
12139 encode(new_blobid_max
, bl
);
12140 t
->set(PREFIX_SUPER
, "blobid_max", bl
);
12141 dout(10) << __func__
<< " new_blobid_max " << new_blobid_max
<< dendl
;
// Submit any txc that was queued for serial submission via this thread.
12144 for (auto txc
: kv_committing
) {
12145 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_kv_queued_lat
);
12146 if (txc
->get_state() == TransContext::STATE_KV_QUEUED
) {
12148 _txc_apply_kv(txc
, false);
12149 --txc
->osr
->kv_committing_serially
;
12151 ceph_assert(txc
->get_state() == TransContext::STATE_KV_SUBMITTED
);
12153 if (txc
->had_ios
) {
12154 --txc
->osr
->txc_with_unstable_io
;
12158 // release throttle *before* we commit. this allows new ops
12159 // to be prepared and enter pipeline while we are waiting on
12160 // the kv commit sync/flush. then hopefully on the next
12161 // iteration there will already be ops awake. otherwise, we
12162 // end up going to sleep, and then wake up when the very first
12163 // transaction is ready for commit.
12164 throttle
.release_kv_throttle(costs
);
12166 // cleanup sync deferred keys
12167 for (auto b
: deferred_stable
) {
12168 for (auto& txc
: b
->txcs
) {
12169 bluestore_deferred_transaction_t
& wt
= *txc
.deferred_txn
;
12170 ceph_assert(wt
.released
.empty()); // only kraken did this
12172 get_deferred_key(wt
.seq
, &key
);
12173 synct
->rm_single_key(PREFIX_DEFERRED
, key
);
12177 #if defined(WITH_LTTNG)
12178 auto sync_start
= mono_clock::now();
12180 // submit synct synchronously (block and wait for it to commit)
12181 int r
= cct
->_conf
->bluestore_debug_omit_kv_commit
? 0 : db
->submit_transaction_sync(synct
);
12182 ceph_assert(r
== 0);
12185 for (auto txc
: kv_committing
) {
12187 txc
->trace
.event("db sync submit");
12188 txc
->trace
.keyval("kv_committing size", kv_committing
.size());
12193 int committing_size
= kv_committing
.size();
12194 int deferred_size
= deferred_stable
.size();
12196 #if defined(WITH_LTTNG)
12197 double sync_latency
= ceph::to_seconds
<double>(mono_clock::now() - sync_start
);
12198 for (auto txc
: kv_committing
) {
12199 if (txc
->tracing
) {
12202 transaction_kv_sync_latency
,
12203 txc
->osr
->get_sequencer_id(),
12205 kv_committing
.size(),
12206 deferred_done
.size(),
12207 deferred_stable
.size(),
// Hand the committed batch to the finalize thread (append if it is behind).
12214 std::unique_lock m
{kv_finalize_lock
};
12215 if (kv_committing_to_finalize
.empty()) {
12216 kv_committing_to_finalize
.swap(kv_committing
);
12218 kv_committing_to_finalize
.insert(
12219 kv_committing_to_finalize
.end(),
12220 kv_committing
.begin(),
12221 kv_committing
.end());
12222 kv_committing
.clear();
12224 if (deferred_stable_to_finalize
.empty()) {
12225 deferred_stable_to_finalize
.swap(deferred_stable
);
12227 deferred_stable_to_finalize
.insert(
12228 deferred_stable_to_finalize
.end(),
12229 deferred_stable
.begin(),
12230 deferred_stable
.end());
12231 deferred_stable
.clear();
12233 if (!kv_finalize_in_progress
) {
12234 kv_finalize_in_progress
= true;
12235 kv_finalize_cond
.notify_one();
// The new id ceilings are now durable; publish them.
12240 nid_max
= new_nid_max
;
12241 dout(10) << __func__
<< " nid_max now " << nid_max
<< dendl
;
12243 if (new_blobid_max
) {
12244 blobid_max
= new_blobid_max
;
12245 dout(10) << __func__
<< " blobid_max now " << blobid_max
<< dendl
;
12249 auto finish
= mono_clock::now();
12250 ceph::timespan dur_flush
= after_flush
- start
;
12251 ceph::timespan dur_kv
= finish
- after_flush
;
12252 ceph::timespan dur
= finish
- start
;
12253 dout(20) << __func__
<< " committed " << committing_size
12254 << " cleaned " << deferred_size
12256 << " (" << dur_flush
<< " flush + " << dur_kv
<< " kv commit)"
12258 log_latency("kv_flush",
12259 l_bluestore_kv_flush_lat
,
12261 cct
->_conf
->bluestore_log_op_age
);
12262 log_latency("kv_commit",
12263 l_bluestore_kv_commit_lat
,
12265 cct
->_conf
->bluestore_log_op_age
);
12266 log_latency("kv_sync",
12267 l_bluestore_kv_sync_lat
,
12269 cct
->_conf
->bluestore_log_op_age
);
12273 // previously deferred "done" are now "stable" by virtue of this
12275 deferred_stable_queue
.swap(deferred_done
);
12278 dout(10) << __func__
<< " finish" << dendl
;
12279 kv_sync_started
= false;
// _kv_finalize_thread: the kv finalize worker. Takes batches handed over by
// _kv_sync_thread and drives each committed txc (and each txc of each stable
// deferred batch) through the remainder of the state machine; also triggers
// deferred submission, reaps collections, and updates the fragmentation stat.
// NOTE(review): damaged extraction — loop headers, braces, the unlock/lock
// around the processing section, and the stop/break lines were dropped;
// reconcile against the pristine file before editing code.
12282 void BlueStore::_kv_finalize_thread()
12284 deque
<TransContext
*> kv_committed
;
12285 deque
<DeferredBatch
*> deferred_stable
;
12286 dout(10) << __func__
<< " start" << dendl
;
// Startup handshake with _kv_stop()/_kv_start().
12287 std::unique_lock
l(kv_finalize_lock
);
12288 ceph_assert(!kv_finalize_started
);
12289 kv_finalize_started
= true;
12290 kv_finalize_cond
.notify_all();
12292 ceph_assert(kv_committed
.empty());
12293 ceph_assert(deferred_stable
.empty());
// Sleep until the sync thread hands over work (or we are told to stop).
12294 if (kv_committing_to_finalize
.empty() &&
12295 deferred_stable_to_finalize
.empty()) {
12296 if (kv_finalize_stop
)
12298 dout(20) << __func__
<< " sleep" << dendl
;
12299 kv_finalize_in_progress
= false;
12300 kv_finalize_cond
.wait(l
);
12301 dout(20) << __func__
<< " wake" << dendl
;
12303 kv_committed
.swap(kv_committing_to_finalize
);
12304 deferred_stable
.swap(deferred_stable_to_finalize
);
12306 dout(20) << __func__
<< " kv_committed " << kv_committed
<< dendl
;
12307 dout(20) << __func__
<< " deferred_stable " << deferred_stable
<< dendl
;
12309 auto start
= mono_clock::now();
// Advance each committed txc past KV_SUBMITTED.
12311 while (!kv_committed
.empty()) {
12312 TransContext
*txc
= kv_committed
.front();
12313 ceph_assert(txc
->get_state() == TransContext::STATE_KV_SUBMITTED
);
12314 _txc_state_proc(txc
);
12315 kv_committed
.pop_front();
12318 for (auto b
: deferred_stable
) {
12319 auto p
= b
->txcs
.begin();
12320 while (p
!= b
->txcs
.end()) {
12321 TransContext
*txc
= &*p
;
12322 p
= b
->txcs
.erase(p
); // unlink here because
12323 _txc_state_proc(txc
); // this may destroy txc
12327 deferred_stable
.clear();
// Opportunistically flush deferred writes when the batch has grown big
// enough or the throttle asks for it.
12329 if (!deferred_aggressive
) {
12330 if (deferred_queue_size
>= deferred_batch_ops
.load() ||
12331 throttle
.should_submit_deferred()) {
12332 deferred_try_submit();
12336 // this is as good a place as any ...
12337 _reap_collections();
12339 logger
->set(l_bluestore_fragmentation
,
12340 (uint64_t)(shared_alloc
.a
->get_fragmentation() * 1000));
12342 log_latency("kv_final",
12343 l_bluestore_kv_final_lat
,
12344 mono_clock::now() - start
,
12345 cct
->_conf
->bluestore_log_op_age
);
12350 dout(10) << __func__
<< " finish" << dendl
;
12351 kv_finalize_started
= false;
12354 void BlueStore::_zoned_cleaner_start() {
12355 dout(10) << __func__
<< dendl
;
12357 zoned_cleaner_thread
.create("bstore_zcleaner");
12360 void BlueStore::_zoned_cleaner_stop() {
12361 dout(10) << __func__
<< dendl
;
12363 std::unique_lock l
{zoned_cleaner_lock
};
12364 while (!zoned_cleaner_started
) {
12365 zoned_cleaner_cond
.wait(l
);
12367 zoned_cleaner_stop
= true;
12368 zoned_cleaner_cond
.notify_all();
12370 zoned_cleaner_thread
.join();
12372 std::lock_guard l
{zoned_cleaner_lock
};
12373 zoned_cleaner_stop
= false;
12375 dout(10) << __func__
<< " done" << dendl
;
// _zoned_cleaner_thread: background worker that drains zoned_cleaner_queue
// and cleans each queued zone via _zoned_clean_zone().
// NOTE(review): damaged extraction — the outer loop header, the stop/break
// lines, the else branch, and the lock release/reacquire around the cleaning
// loop were dropped (gaps 12385, 12388-89, 12393, 12395, 12399-12402);
// reconcile against the pristine file before editing code.
12378 void BlueStore::_zoned_cleaner_thread() {
12379 dout(10) << __func__
<< " start" << dendl
;
// Startup handshake with _zoned_cleaner_stop().
12380 std::unique_lock l
{zoned_cleaner_lock
};
12381 ceph_assert(!zoned_cleaner_started
);
12382 zoned_cleaner_started
= true;
12383 zoned_cleaner_cond
.notify_all();
12384 std::deque
<uint64_t> zones_to_clean
;
// Sleep while the queue is empty; exit when asked to stop.
12386 if (zoned_cleaner_queue
.empty()) {
12387 if (zoned_cleaner_stop
) {
12390 dout(20) << __func__
<< " sleep" << dendl
;
12391 zoned_cleaner_cond
.wait(l
);
12392 dout(20) << __func__
<< " wake" << dendl
;
// Take the whole queue in one swap and clean each zone.
12394 zones_to_clean
.swap(zoned_cleaner_queue
);
12396 while (!zones_to_clean
.empty()) {
12397 _zoned_clean_zone(zones_to_clean
.front());
12398 zones_to_clean
.pop_front();
12403 dout(10) << __func__
<< " finish" << dendl
;
12404 zoned_cleaner_started
= false;
// _zoned_clean_zone: clean one zone of an HM-SMR device. The visible body
// only logs; NOTE(review): original lines 12409-12410 were dropped by the
// extraction, so confirm against the pristine file whether anything beyond
// logging (actual cleaning work or a TODO) belongs here.
12407 void BlueStore::_zoned_clean_zone(uint64_t zone_num
) {
12408 dout(10) << __func__
<< " cleaning zone " << zone_num
<< dendl
;
12411 bluestore_deferred_op_t
*BlueStore::_get_deferred_op(
12412 TransContext
*txc
, uint64_t len
)
12414 if (!txc
->deferred_txn
) {
12415 txc
->deferred_txn
= new bluestore_deferred_transaction_t
;
12417 txc
->deferred_txn
->ops
.push_back(bluestore_deferred_op_t());
12418 logger
->inc(l_bluestore_write_deferred
);
12419 logger
->inc(l_bluestore_write_deferred_bytes
, len
);
12420 return &txc
->deferred_txn
->ops
.back();
// _deferred_queue: attach txc's deferred writes to its sequencer's pending
// DeferredBatch (creating one if needed), register the osr on the global
// deferred_queue when its batch becomes non-empty, and submit immediately
// when in aggressive mode.
// NOTE(review): damaged extraction — the else lines between branches and
// several closing braces were dropped (gaps 12429, 12432, 12434-36, 12445-48,
// 12457-58, 12462); reconcile against the pristine file before editing code.
12423 void BlueStore::_deferred_queue(TransContext
*txc
)
12425 dout(20) << __func__
<< " txc " << txc
<< " osr " << txc
->osr
<< dendl
;
12427 DeferredBatch
*tmp
;
12428 txc
->osr
->deferred_lock
.lock();
// Create a batch on first use; otherwise append to the pending one.
12430 if (!txc
->osr
->deferred_pending
) {
12431 tmp
= new DeferredBatch(cct
, txc
->osr
.get());
12433 tmp
= txc
->osr
->deferred_pending
;
12437 tmp
->txcs
.push_back(*txc
);
// Register each extent of each deferred write op with the batch so
// overlapping writes can be coalesced.
12438 bluestore_deferred_transaction_t
& wt
= *txc
->deferred_txn
;
12439 for (auto opi
= wt
.ops
.begin(); opi
!= wt
.ops
.end(); ++opi
) {
12440 const auto& op
= *opi
;
12441 ceph_assert(op
.op
== bluestore_deferred_op_t::OP_WRITE
);
12442 bufferlist::const_iterator p
= op
.data
.begin();
12443 for (auto e
: op
.extents
) {
12444 tmp
->prepare_write(cct
, wt
.seq
, e
.offset
, e
.length
, p
);
12449 ++deferred_queue_size
;
12450 txc
->osr
->deferred_pending
= tmp
;
12451 // condition "tmp->txcs.size() == 1" mean deferred_pending was originally empty.
12452 // So we should add osr into deferred_queue.
12453 if (!txc
->osr
->deferred_running
&& (tmp
->txcs
.size() == 1)) {
12454 deferred_lock
.lock();
12455 deferred_queue
.push_back(*txc
->osr
);
12456 deferred_lock
.unlock();
// In aggressive mode submit right away; _deferred_submit_unlock releases
// the osr's deferred_lock itself on that path.
12459 if (deferred_aggressive
&&
12460 !txc
->osr
->deferred_running
) {
12461 _deferred_submit_unlock(txc
->osr
.get());
12463 txc
->osr
->deferred_lock
.unlock();
// deferred_try_submit: kick off submission of pending deferred batches on
// every sequencer currently on the global deferred_queue, then record the
// submission time.
// NOTE(review): damaged extraction — closing braces, the else lines between
// the branches, and dendl terminators were dropped; reconcile against the
// pristine file before editing code.
12468 void BlueStore::deferred_try_submit()
12470 dout(20) << __func__
<< " " << deferred_queue
.size() << " osrs, "
12471 << deferred_queue_size
<< " txcs" << dendl
;
12472 vector
<OpSequencerRef
> osrs
;
// Snapshot the queue under deferred_lock; submission happens outside it.
12475 std::lock_guard
l(deferred_lock
);
12476 osrs
.reserve(deferred_queue
.size());
12477 for (auto& osr
: deferred_queue
) {
12478 osrs
.push_back(&osr
);
12482 for (auto& osr
: osrs
) {
12483 osr
->deferred_lock
.lock();
12484 if (osr
->deferred_pending
) {
12485 if (!osr
->deferred_running
) {
// _deferred_submit_unlock releases the osr's deferred_lock itself.
12486 _deferred_submit_unlock(osr
.get());
12488 osr
->deferred_lock
.unlock();
12489 dout(20) << __func__
<< " osr " << osr
<< " already has running"
12493 osr
->deferred_lock
.unlock();
12494 dout(20) << __func__
<< " osr " << osr
<< " has no pending" << dendl
;
12499 std::lock_guard
l(deferred_lock
);
12500 deferred_last_submitted
= ceph_clock_now();
12504 void BlueStore::_deferred_submit_unlock(OpSequencer
*osr
)
12506 dout(10) << __func__
<< " osr " << osr
12507 << " " << osr
->deferred_pending
->iomap
.size() << " ios pending "
12509 ceph_assert(osr
->deferred_pending
);
12510 ceph_assert(!osr
->deferred_running
);
12512 auto b
= osr
->deferred_pending
;
12513 deferred_queue_size
-= b
->seq_bytes
.size();
12514 ceph_assert(deferred_queue_size
>= 0);
12516 osr
->deferred_running
= osr
->deferred_pending
;
12517 osr
->deferred_pending
= nullptr;
12519 osr
->deferred_lock
.unlock();
12521 for (auto& txc
: b
->txcs
) {
12522 throttle
.log_state_latency(txc
, logger
, l_bluestore_state_deferred_queued_lat
);
12524 uint64_t start
= 0, pos
= 0;
12526 auto i
= b
->iomap
.begin();
12528 if (i
== b
->iomap
.end() || i
->first
!= pos
) {
12530 dout(20) << __func__
<< " write 0x" << std::hex
12531 << start
<< "~" << bl
.length()
12532 << " crc " << bl
.crc32c(-1) << std::dec
<< dendl
;
12533 if (!g_conf()->bluestore_debug_omit_block_device_write
) {
12534 logger
->inc(l_bluestore_deferred_write_ops
);
12535 logger
->inc(l_bluestore_deferred_write_bytes
, bl
.length());
12536 int r
= bdev
->aio_write(start
, bl
, &b
->ioc
, false);
12537 ceph_assert(r
== 0);
12540 if (i
== b
->iomap
.end()) {
12547 dout(20) << __func__
<< " seq " << i
->second
.seq
<< " 0x"
12548 << std::hex
<< pos
<< "~" << i
->second
.bl
.length() << std::dec
12550 if (!bl
.length()) {
12553 pos
+= i
->second
.bl
.length();
12554 bl
.claim_append(i
->second
.bl
);
12558 bdev
->aio_submit(&b
->ioc
);
12561 struct C_DeferredTrySubmit
: public Context
{
12563 C_DeferredTrySubmit(BlueStore
*s
) : store(s
) {}
12564 void finish(int r
) {
12565 store
->deferred_try_submit();
12569 void BlueStore::_deferred_aio_finish(OpSequencer
*osr
)
12571 dout(10) << __func__
<< " osr " << osr
<< dendl
;
12572 ceph_assert(osr
->deferred_running
);
12573 DeferredBatch
*b
= osr
->deferred_running
;
12576 osr
->deferred_lock
.lock();
12577 ceph_assert(osr
->deferred_running
== b
);
12578 osr
->deferred_running
= nullptr;
12579 if (!osr
->deferred_pending
) {
12580 dout(20) << __func__
<< " dequeueing" << dendl
;
12582 deferred_lock
.lock();
12583 auto q
= deferred_queue
.iterator_to(*osr
);
12584 deferred_queue
.erase(q
);
12585 deferred_lock
.unlock();
12587 osr
->deferred_lock
.unlock();
12589 osr
->deferred_lock
.unlock();
12590 if (deferred_aggressive
) {
12591 dout(20) << __func__
<< " queuing async deferred_try_submit" << dendl
;
12592 finisher
.queue(new C_DeferredTrySubmit(this));
12594 dout(20) << __func__
<< " leaving queued, more pending" << dendl
;
12600 uint64_t costs
= 0;
12602 for (auto& i
: b
->txcs
) {
12603 TransContext
*txc
= &i
;
12604 throttle
.log_state_latency(*txc
, logger
, l_bluestore_state_deferred_aio_wait_lat
);
12605 txc
->set_state(TransContext::STATE_DEFERRED_CLEANUP
);
12606 costs
+= txc
->cost
;
12609 throttle
.release_deferred_throttle(costs
);
12613 std::lock_guard
l(kv_lock
);
12614 deferred_done_queue
.emplace_back(b
);
12616 // in the normal case, do not bother waking up the kv thread; it will
12617 // catch us on the next commit anyway.
12618 if (deferred_aggressive
&& !kv_sync_in_progress
) {
12619 kv_sync_in_progress
= true;
12620 kv_cond
.notify_one();
12625 int BlueStore::_deferred_replay()
12627 dout(10) << __func__
<< " start" << dendl
;
12630 CollectionRef ch
= _get_collection(coll_t::meta());
12631 bool fake_ch
= false;
12633 // hmm, replaying initial mkfs?
12634 ch
= static_cast<Collection
*>(create_new_collection(coll_t::meta()).get());
12637 OpSequencer
*osr
= static_cast<OpSequencer
*>(ch
->osr
.get());
12638 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_DEFERRED
);
12639 for (it
->lower_bound(string()); it
->valid(); it
->next(), ++count
) {
12640 dout(20) << __func__
<< " replay " << pretty_binary_string(it
->key())
12642 bluestore_deferred_transaction_t
*deferred_txn
=
12643 new bluestore_deferred_transaction_t
;
12644 bufferlist bl
= it
->value();
12645 auto p
= bl
.cbegin();
12647 decode(*deferred_txn
, p
);
12648 } catch (ceph::buffer::error
& e
) {
12649 derr
<< __func__
<< " failed to decode deferred txn "
12650 << pretty_binary_string(it
->key()) << dendl
;
12651 delete deferred_txn
;
12655 TransContext
*txc
= _txc_create(ch
.get(), osr
, nullptr);
12656 txc
->deferred_txn
= deferred_txn
;
12657 txc
->set_state(TransContext::STATE_KV_DONE
);
12658 _txc_state_proc(txc
);
12661 dout(20) << __func__
<< " draining osr" << dendl
;
12662 _osr_register_zombie(osr
);
12665 new_coll_map
.clear();
12667 dout(10) << __func__
<< " completed " << count
<< " events" << dendl
;
12671 // ---------------------------
12674 int BlueStore::queue_transactions(
12675 CollectionHandle
& ch
,
12676 vector
<Transaction
>& tls
,
12678 ThreadPool::TPHandle
*handle
)
12681 list
<Context
*> on_applied
, on_commit
, on_applied_sync
;
12682 ObjectStore::Transaction::collect_contexts(
12683 tls
, &on_applied
, &on_commit
, &on_applied_sync
);
12685 auto start
= mono_clock::now();
12687 Collection
*c
= static_cast<Collection
*>(ch
.get());
12688 OpSequencer
*osr
= c
->osr
.get();
12689 dout(10) << __func__
<< " ch " << c
<< " " << c
->cid
<< dendl
;
12692 TransContext
*txc
= _txc_create(static_cast<Collection
*>(ch
.get()), osr
,
12695 // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
12696 // submission to happen atomically because if I/O submission happens in a
12697 // different order than I/O allocation, we end up issuing non-sequential
12698 // writes to the drive. This is a temporary solution until ZONE APPEND
12699 // support matures in the kernel. For more information please see:
12700 // https://www.usenix.org/conference/vault20/presentation/bjorling
12701 if (bdev
->is_smr()) {
12702 atomic_alloc_and_submit_lock
.lock();
12704 for (vector
<Transaction
>::iterator p
= tls
.begin(); p
!= tls
.end(); ++p
) {
12705 txc
->bytes
+= (*p
).get_num_bytes();
12706 _txc_add_transaction(txc
, &(*p
));
12708 _txc_calc_cost(txc
);
12710 _txc_write_nodes(txc
, txc
->t
);
12712 // journal deferred items
12713 if (txc
->deferred_txn
) {
12714 txc
->deferred_txn
->seq
= ++deferred_seq
;
12716 encode(*txc
->deferred_txn
, bl
);
12718 get_deferred_key(txc
->deferred_txn
->seq
, &key
);
12719 txc
->t
->set(PREFIX_DEFERRED
, key
, bl
);
12722 _txc_finalize_kv(txc
, txc
->t
);
12726 txc
->trace
.event("txc encode finished");
12731 handle
->suspend_tp_timeout();
12733 auto tstart
= mono_clock::now();
12735 if (!throttle
.try_start_transaction(
12739 // ensure we do not block here because of deferred writes
12740 dout(10) << __func__
<< " failed get throttle_deferred_bytes, aggressive"
12742 ++deferred_aggressive
;
12743 deferred_try_submit();
12745 // wake up any previously finished deferred events
12746 std::lock_guard
l(kv_lock
);
12747 if (!kv_sync_in_progress
) {
12748 kv_sync_in_progress
= true;
12749 kv_cond
.notify_one();
12752 throttle
.finish_start_transaction(*db
, *txc
, tstart
);
12753 --deferred_aggressive
;
12755 auto tend
= mono_clock::now();
12758 handle
->reset_tp_timeout();
12760 logger
->inc(l_bluestore_txc
);
12763 _txc_state_proc(txc
);
12765 if (bdev
->is_smr()) {
12766 atomic_alloc_and_submit_lock
.unlock();
12769 // we're immediately readable (unlike FileStore)
12770 for (auto c
: on_applied_sync
) {
12773 if (!on_applied
.empty()) {
12774 if (c
->commit_queue
) {
12775 c
->commit_queue
->queue(on_applied
);
12777 finisher
.queue(on_applied
);
12783 txc
->trace
.event("txc applied");
12787 log_latency("submit_transact",
12788 l_bluestore_submit_lat
,
12789 mono_clock::now() - start
,
12790 cct
->_conf
->bluestore_log_op_age
);
12791 log_latency("throttle_transact",
12792 l_bluestore_throttle_lat
,
12794 cct
->_conf
->bluestore_log_op_age
);
12798 void BlueStore::_txc_aio_submit(TransContext
*txc
)
12800 dout(10) << __func__
<< " txc " << txc
<< dendl
;
12801 bdev
->aio_submit(&txc
->ioc
);
12804 void BlueStore::_txc_add_transaction(TransContext
*txc
, Transaction
*t
)
12806 Transaction::iterator i
= t
->begin();
12808 _dump_transaction
<30>(cct
, t
);
12810 vector
<CollectionRef
> cvec(i
.colls
.size());
12812 for (vector
<coll_t
>::iterator p
= i
.colls
.begin(); p
!= i
.colls
.end();
12814 cvec
[j
] = _get_collection(*p
);
12817 vector
<OnodeRef
> ovec(i
.objects
.size());
12819 for (int pos
= 0; i
.have_op(); ++pos
) {
12820 Transaction::Op
*op
= i
.decode_op();
12824 if (op
->op
== Transaction::OP_NOP
)
12828 // collection operations
12829 CollectionRef
&c
= cvec
[op
->cid
];
12831 // initialize osd_pool_id and do a smoke test that all collections belong
12832 // to the same pool
12834 if (!!c
? c
->cid
.is_pg(&pgid
) : false) {
12835 ceph_assert(txc
->osd_pool_id
== META_POOL_ID
||
12836 txc
->osd_pool_id
== pgid
.pool());
12837 txc
->osd_pool_id
= pgid
.pool();
12841 case Transaction::OP_RMCOLL
:
12843 const coll_t
&cid
= i
.get_cid(op
->cid
);
12844 r
= _remove_collection(txc
, cid
, &c
);
12850 case Transaction::OP_MKCOLL
:
12853 const coll_t
&cid
= i
.get_cid(op
->cid
);
12854 r
= _create_collection(txc
, cid
, op
->split_bits
, &c
);
12860 case Transaction::OP_SPLIT_COLLECTION
:
12861 ceph_abort_msg("deprecated");
12864 case Transaction::OP_SPLIT_COLLECTION2
:
12866 uint32_t bits
= op
->split_bits
;
12867 uint32_t rem
= op
->split_rem
;
12868 r
= _split_collection(txc
, c
, cvec
[op
->dest_cid
], bits
, rem
);
12874 case Transaction::OP_MERGE_COLLECTION
:
12876 uint32_t bits
= op
->split_bits
;
12877 r
= _merge_collection(txc
, &c
, cvec
[op
->dest_cid
], bits
);
12883 case Transaction::OP_COLL_HINT
:
12885 uint32_t type
= op
->hint
;
12888 auto hiter
= hint
.cbegin();
12889 if (type
== Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS
) {
12892 decode(pg_num
, hiter
);
12893 decode(num_objs
, hiter
);
12894 dout(10) << __func__
<< " collection hint objects is a no-op, "
12895 << " pg_num " << pg_num
<< " num_objects " << num_objs
12899 dout(10) << __func__
<< " unknown collection hint " << type
<< dendl
;
12905 case Transaction::OP_COLL_SETATTR
:
12909 case Transaction::OP_COLL_RMATTR
:
12913 case Transaction::OP_COLL_RENAME
:
12914 ceph_abort_msg("not implemented");
12918 derr
<< __func__
<< " error " << cpp_strerror(r
)
12919 << " not handled on operation " << op
->op
12920 << " (op " << pos
<< ", counting from 0)" << dendl
;
12921 _dump_transaction
<0>(cct
, t
);
12922 ceph_abort_msg("unexpected error");
12925 // these operations implicity create the object
12926 bool create
= false;
12927 if (op
->op
== Transaction::OP_TOUCH
||
12928 op
->op
== Transaction::OP_CREATE
||
12929 op
->op
== Transaction::OP_WRITE
||
12930 op
->op
== Transaction::OP_ZERO
) {
12934 // object operations
12935 std::unique_lock
l(c
->lock
);
12936 OnodeRef
&o
= ovec
[op
->oid
];
12938 ghobject_t oid
= i
.get_oid(op
->oid
);
12939 o
= c
->get_onode(oid
, create
, op
->op
== Transaction::OP_CREATE
);
12941 if (!create
&& (!o
|| !o
->exists
)) {
12942 dout(10) << __func__
<< " op " << op
->op
<< " got ENOENT on "
12943 << i
.get_oid(op
->oid
) << dendl
;
12949 case Transaction::OP_CREATE
:
12950 case Transaction::OP_TOUCH
:
12951 r
= _touch(txc
, c
, o
);
12954 case Transaction::OP_WRITE
:
12956 uint64_t off
= op
->off
;
12957 uint64_t len
= op
->len
;
12958 uint32_t fadvise_flags
= i
.get_fadvise_flags();
12961 r
= _write(txc
, c
, o
, off
, len
, bl
, fadvise_flags
);
12965 case Transaction::OP_ZERO
:
12967 uint64_t off
= op
->off
;
12968 uint64_t len
= op
->len
;
12969 r
= _zero(txc
, c
, o
, off
, len
);
12973 case Transaction::OP_TRIMCACHE
:
12975 // deprecated, no-op
12979 case Transaction::OP_TRUNCATE
:
12981 uint64_t off
= op
->off
;
12982 r
= _truncate(txc
, c
, o
, off
);
12986 case Transaction::OP_REMOVE
:
12988 r
= _remove(txc
, c
, o
);
12992 case Transaction::OP_SETATTR
:
12994 string name
= i
.decode_string();
12997 r
= _setattr(txc
, c
, o
, name
, bp
);
13001 case Transaction::OP_SETATTRS
:
13003 map
<string
, bufferptr
> aset
;
13004 i
.decode_attrset(aset
);
13005 r
= _setattrs(txc
, c
, o
, aset
);
13009 case Transaction::OP_RMATTR
:
13011 string name
= i
.decode_string();
13012 r
= _rmattr(txc
, c
, o
, name
);
13016 case Transaction::OP_RMATTRS
:
13018 r
= _rmattrs(txc
, c
, o
);
13022 case Transaction::OP_CLONE
:
13024 OnodeRef
& no
= ovec
[op
->dest_oid
];
13026 const ghobject_t
& noid
= i
.get_oid(op
->dest_oid
);
13027 no
= c
->get_onode(noid
, true);
13029 r
= _clone(txc
, c
, o
, no
);
13033 case Transaction::OP_CLONERANGE
:
13034 ceph_abort_msg("deprecated");
13037 case Transaction::OP_CLONERANGE2
:
13039 OnodeRef
& no
= ovec
[op
->dest_oid
];
13041 const ghobject_t
& noid
= i
.get_oid(op
->dest_oid
);
13042 no
= c
->get_onode(noid
, true);
13044 uint64_t srcoff
= op
->off
;
13045 uint64_t len
= op
->len
;
13046 uint64_t dstoff
= op
->dest_off
;
13047 r
= _clone_range(txc
, c
, o
, no
, srcoff
, len
, dstoff
);
13051 case Transaction::OP_COLL_ADD
:
13052 ceph_abort_msg("not implemented");
13055 case Transaction::OP_COLL_REMOVE
:
13056 ceph_abort_msg("not implemented");
13059 case Transaction::OP_COLL_MOVE
:
13060 ceph_abort_msg("deprecated");
13063 case Transaction::OP_COLL_MOVE_RENAME
:
13064 case Transaction::OP_TRY_RENAME
:
13066 ceph_assert(op
->cid
== op
->dest_cid
);
13067 const ghobject_t
& noid
= i
.get_oid(op
->dest_oid
);
13068 OnodeRef
& no
= ovec
[op
->dest_oid
];
13070 no
= c
->get_onode(noid
, false);
13072 r
= _rename(txc
, c
, o
, no
, noid
);
13076 case Transaction::OP_OMAP_CLEAR
:
13078 r
= _omap_clear(txc
, c
, o
);
13081 case Transaction::OP_OMAP_SETKEYS
:
13083 bufferlist aset_bl
;
13084 i
.decode_attrset_bl(&aset_bl
);
13085 r
= _omap_setkeys(txc
, c
, o
, aset_bl
);
13088 case Transaction::OP_OMAP_RMKEYS
:
13090 bufferlist keys_bl
;
13091 i
.decode_keyset_bl(&keys_bl
);
13092 r
= _omap_rmkeys(txc
, c
, o
, keys_bl
);
13095 case Transaction::OP_OMAP_RMKEYRANGE
:
13097 string first
, last
;
13098 first
= i
.decode_string();
13099 last
= i
.decode_string();
13100 r
= _omap_rmkey_range(txc
, c
, o
, first
, last
);
13103 case Transaction::OP_OMAP_SETHEADER
:
13107 r
= _omap_setheader(txc
, c
, o
, bl
);
13111 case Transaction::OP_SETALLOCHINT
:
13113 r
= _set_alloc_hint(txc
, c
, o
,
13114 op
->expected_object_size
,
13115 op
->expected_write_size
,
13121 derr
<< __func__
<< " bad op " << op
->op
<< dendl
;
13129 if (r
== -ENOENT
&& !(op
->op
== Transaction::OP_CLONERANGE
||
13130 op
->op
== Transaction::OP_CLONE
||
13131 op
->op
== Transaction::OP_CLONERANGE2
||
13132 op
->op
== Transaction::OP_COLL_ADD
||
13133 op
->op
== Transaction::OP_SETATTR
||
13134 op
->op
== Transaction::OP_SETATTRS
||
13135 op
->op
== Transaction::OP_RMATTR
||
13136 op
->op
== Transaction::OP_OMAP_SETKEYS
||
13137 op
->op
== Transaction::OP_OMAP_RMKEYS
||
13138 op
->op
== Transaction::OP_OMAP_RMKEYRANGE
||
13139 op
->op
== Transaction::OP_OMAP_SETHEADER
))
13140 // -ENOENT is usually okay
13146 const char *msg
= "unexpected error code";
13148 if (r
== -ENOENT
&& (op
->op
== Transaction::OP_CLONERANGE
||
13149 op
->op
== Transaction::OP_CLONE
||
13150 op
->op
== Transaction::OP_CLONERANGE2
))
13151 msg
= "ENOENT on clone suggests osd bug";
13154 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
13155 // by partially applying transactions.
13156 msg
= "ENOSPC from bluestore, misconfigured cluster";
13158 if (r
== -ENOTEMPTY
) {
13159 msg
= "ENOTEMPTY suggests garbage data in osd data dir";
13162 derr
<< __func__
<< " error " << cpp_strerror(r
)
13163 << " not handled on operation " << op
->op
13164 << " (op " << pos
<< ", counting from 0)"
13166 derr
<< msg
<< dendl
;
13167 _dump_transaction
<0>(cct
, t
);
13168 ceph_abort_msg("unexpected error");
13176 // -----------------
13177 // write operations
13179 int BlueStore::_touch(TransContext
*txc
,
13183 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
13185 _assign_nid(txc
, o
);
13186 txc
->write_onode(o
);
13187 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
13191 void BlueStore::_pad_zeros(
13192 bufferlist
*bl
, uint64_t *offset
,
13193 uint64_t chunk_size
)
13195 auto length
= bl
->length();
13196 dout(30) << __func__
<< " 0x" << std::hex
<< *offset
<< "~" << length
13197 << " chunk_size 0x" << chunk_size
<< std::dec
<< dendl
;
13198 dout(40) << "before:\n";
13199 bl
->hexdump(*_dout
);
13202 size_t front_pad
= *offset
% chunk_size
;
13203 size_t back_pad
= 0;
13204 size_t pad_count
= 0;
13206 size_t front_copy
= std::min
<uint64_t>(chunk_size
- front_pad
, length
);
13207 bufferptr z
= ceph::buffer::create_small_page_aligned(chunk_size
);
13208 z
.zero(0, front_pad
, false);
13209 pad_count
+= front_pad
;
13210 bl
->begin().copy(front_copy
, z
.c_str() + front_pad
);
13211 if (front_copy
+ front_pad
< chunk_size
) {
13212 back_pad
= chunk_size
- (length
+ front_pad
);
13213 z
.zero(front_pad
+ length
, back_pad
, false);
13214 pad_count
+= back_pad
;
13218 t
.substr_of(old
, front_copy
, length
- front_copy
);
13220 bl
->claim_append(t
);
13221 *offset
-= front_pad
;
13222 length
+= pad_count
;
13226 uint64_t end
= *offset
+ length
;
13227 unsigned back_copy
= end
% chunk_size
;
13229 ceph_assert(back_pad
== 0);
13230 back_pad
= chunk_size
- back_copy
;
13231 ceph_assert(back_copy
<= length
);
13232 bufferptr
tail(chunk_size
);
13233 bl
->begin(length
- back_copy
).copy(back_copy
, tail
.c_str());
13234 tail
.zero(back_copy
, back_pad
, false);
13237 bl
->substr_of(old
, 0, length
- back_copy
);
13239 length
+= back_pad
;
13240 pad_count
+= back_pad
;
13242 dout(20) << __func__
<< " pad 0x" << std::hex
<< front_pad
<< " + 0x"
13243 << back_pad
<< " on front/back, now 0x" << *offset
<< "~"
13244 << length
<< std::dec
<< dendl
;
13245 dout(40) << "after:\n";
13246 bl
->hexdump(*_dout
);
13249 logger
->inc(l_bluestore_write_pad_bytes
, pad_count
);
13250 ceph_assert(bl
->length() == length
);
13253 void BlueStore::_do_write_small(
13257 uint64_t offset
, uint64_t length
,
13258 bufferlist::iterator
& blp
,
13259 WriteContext
*wctx
)
13261 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
13262 << std::dec
<< dendl
;
13263 ceph_assert(length
< min_alloc_size
);
13265 uint64_t end_offs
= offset
+ length
;
13267 logger
->inc(l_bluestore_write_small
);
13268 logger
->inc(l_bluestore_write_small_bytes
, length
);
13271 blp
.copy(length
, bl
);
13273 auto max_bsize
= std::max(wctx
->target_blob_size
, min_alloc_size
);
13274 auto min_off
= offset
>= max_bsize
? offset
- max_bsize
: 0;
13275 uint32_t alloc_len
= min_alloc_size
;
13276 auto offset0
= p2align
<uint64_t>(offset
, alloc_len
);
13280 // search suitable extent in both forward and reverse direction in
13281 // [offset - target_max_blob_size, offset + target_max_blob_size] range
13282 // then check if blob can be reused via can_reuse_blob func or apply
13283 // direct/deferred write (the latter for extents including or higher
13284 // than 'offset' only).
13285 o
->extent_map
.fault_range(db
, min_off
, offset
+ max_bsize
- min_off
);
13287 // On zoned devices, the first goal is to support non-overwrite workloads,
13288 // such as RGW, with large, aligned objects. Therefore, for user writes
13289 // _do_write_small should not trigger. OSDs, however, write and update a tiny
13290 // amount of metadata, such as OSD maps, to disk. For those cases, we
13291 // temporarily just pad them to min_alloc_size and write them to a new place
13292 // on every update.
13293 if (bdev
->is_smr()) {
13294 BlobRef b
= c
->new_blob();
13295 uint64_t b_off
= p2phase
<uint64_t>(offset
, alloc_len
);
13296 uint64_t b_off0
= b_off
;
13297 _pad_zeros(&bl
, &b_off0
, min_alloc_size
);
13298 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
13299 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
, false, true);
13303 // Look for an existing mutable blob we can use.
13304 auto begin
= o
->extent_map
.extent_map
.begin();
13305 auto end
= o
->extent_map
.extent_map
.end();
13306 auto ep
= o
->extent_map
.seek_lextent(offset
);
13309 if (ep
->blob_end() <= offset
) {
13313 auto prev_ep
= end
;
13319 boost::container::flat_set
<const bluestore_blob_t
*> inspected_blobs
;
13320 // We don't want to have more blobs than min alloc units fit
13321 // into 2 max blobs
13322 size_t blob_threshold
= max_blob_size
/ min_alloc_size
* 2 + 1;
13323 bool above_blob_threshold
= false;
13325 inspected_blobs
.reserve(blob_threshold
);
13327 uint64_t max_off
= 0;
13328 auto start_ep
= ep
;
13329 auto end_ep
= ep
; // exclusively
13331 any_change
= false;
13333 if (ep
!= end
&& ep
->logical_offset
< offset
+ max_bsize
) {
13334 BlobRef b
= ep
->blob
;
13335 if (!above_blob_threshold
) {
13336 inspected_blobs
.insert(&b
->get_blob());
13337 above_blob_threshold
= inspected_blobs
.size() >= blob_threshold
;
13339 max_off
= ep
->logical_end();
13340 auto bstart
= ep
->blob_start();
13342 dout(20) << __func__
<< " considering " << *b
13343 << " bstart 0x" << std::hex
<< bstart
<< std::dec
<< dendl
;
13344 if (bstart
>= end_offs
) {
13345 dout(20) << __func__
<< " ignoring distant " << *b
<< dendl
;
13346 } else if (!b
->get_blob().is_mutable()) {
13347 dout(20) << __func__
<< " ignoring immutable " << *b
<< dendl
;
13348 } else if (ep
->logical_offset
% min_alloc_size
!=
13349 ep
->blob_offset
% min_alloc_size
) {
13350 dout(20) << __func__
<< " ignoring offset-skewed " << *b
<< dendl
;
13352 uint64_t chunk_size
= b
->get_blob().get_chunk_size(block_size
);
13353 // can we pad our head/tail out with zeros?
13354 uint64_t head_pad
, tail_pad
;
13355 head_pad
= p2phase(offset
, chunk_size
);
13356 tail_pad
= p2nphase(end_offs
, chunk_size
);
13357 if (head_pad
|| tail_pad
) {
13358 o
->extent_map
.fault_range(db
, offset
- head_pad
,
13359 end_offs
- offset
+ head_pad
+ tail_pad
);
13362 o
->extent_map
.has_any_lextents(offset
- head_pad
, chunk_size
)) {
13365 if (tail_pad
&& o
->extent_map
.has_any_lextents(end_offs
, tail_pad
)) {
13369 uint64_t b_off
= offset
- head_pad
- bstart
;
13370 uint64_t b_len
= length
+ head_pad
+ tail_pad
;
13372 // direct write into unused blocks of an existing mutable blob?
13373 if ((b_off
% chunk_size
== 0 && b_len
% chunk_size
== 0) &&
13374 b
->get_blob().get_ondisk_length() >= b_off
+ b_len
&&
13375 b
->get_blob().is_unused(b_off
, b_len
) &&
13376 b
->get_blob().is_allocated(b_off
, b_len
)) {
13377 _apply_padding(head_pad
, tail_pad
, bl
);
13379 dout(20) << __func__
<< " write to unused 0x" << std::hex
13380 << b_off
<< "~" << b_len
13381 << " pad 0x" << head_pad
<< " + 0x" << tail_pad
13382 << std::dec
<< " of mutable " << *b
<< dendl
;
13383 _buffer_cache_write(txc
, b
, b_off
, bl
,
13384 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
13386 if (!g_conf()->bluestore_debug_omit_block_device_write
) {
13387 if (b_len
< prefer_deferred_size
) {
13388 dout(20) << __func__
<< " deferring small 0x" << std::hex
13389 << b_len
<< std::dec
<< " unused write via deferred" << dendl
;
13390 bluestore_deferred_op_t
*op
= _get_deferred_op(txc
, bl
.length());
13391 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
13394 [&](uint64_t offset
, uint64_t length
) {
13395 op
->extents
.emplace_back(bluestore_pextent_t(offset
, length
));
13400 b
->get_blob().map_bl(
13402 [&](uint64_t offset
, bufferlist
& t
) {
13403 bdev
->aio_write(offset
, t
,
13404 &txc
->ioc
, wctx
->buffered
);
13408 b
->dirty_blob().calc_csum(b_off
, bl
);
13409 dout(20) << __func__
<< " lex old " << *ep
<< dendl
;
13410 Extent
*le
= o
->extent_map
.set_lextent(c
, offset
, b_off
+ head_pad
, length
,
13412 &wctx
->old_extents
);
13413 b
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
13415 txc
->statfs_delta
.stored() += le
->length
;
13416 dout(20) << __func__
<< " lex " << *le
<< dendl
;
13417 logger
->inc(l_bluestore_write_small_unused
);
13420 // read some data to fill out the chunk?
13421 uint64_t head_read
= p2phase(b_off
, chunk_size
);
13422 uint64_t tail_read
= p2nphase(b_off
+ b_len
, chunk_size
);
13423 if ((head_read
|| tail_read
) &&
13424 (b
->get_blob().get_ondisk_length() >= b_off
+ b_len
+ tail_read
) &&
13425 head_read
+ tail_read
< min_alloc_size
) {
13426 b_off
-= head_read
;
13427 b_len
+= head_read
+ tail_read
;
13430 head_read
= tail_read
= 0;
13433 // chunk-aligned deferred overwrite?
13434 if (b
->get_blob().get_ondisk_length() >= b_off
+ b_len
&&
13435 b_off
% chunk_size
== 0 &&
13436 b_len
% chunk_size
== 0 &&
13437 b
->get_blob().is_allocated(b_off
, b_len
)) {
13439 _apply_padding(head_pad
, tail_pad
, bl
);
13441 dout(20) << __func__
<< " reading head 0x" << std::hex
<< head_read
13442 << " and tail 0x" << tail_read
<< std::dec
<< dendl
;
13444 bufferlist head_bl
;
13445 int r
= _do_read(c
.get(), o
, offset
- head_pad
- head_read
, head_read
,
13447 ceph_assert(r
>= 0 && r
<= (int)head_read
);
13448 size_t zlen
= head_read
- r
;
13450 head_bl
.append_zero(zlen
);
13451 logger
->inc(l_bluestore_write_pad_bytes
, zlen
);
13453 head_bl
.claim_append(bl
);
13455 logger
->inc(l_bluestore_write_penalty_read_ops
);
13458 bufferlist tail_bl
;
13459 int r
= _do_read(c
.get(), o
, offset
+ length
+ tail_pad
, tail_read
,
13461 ceph_assert(r
>= 0 && r
<= (int)tail_read
);
13462 size_t zlen
= tail_read
- r
;
13464 tail_bl
.append_zero(zlen
);
13465 logger
->inc(l_bluestore_write_pad_bytes
, zlen
);
13467 bl
.claim_append(tail_bl
);
13468 logger
->inc(l_bluestore_write_penalty_read_ops
);
13470 logger
->inc(l_bluestore_write_small_pre_read
);
13472 _buffer_cache_write(txc
, b
, b_off
, bl
,
13473 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
13475 b
->dirty_blob().calc_csum(b_off
, bl
);
13477 if (!g_conf()->bluestore_debug_omit_block_device_write
) {
13478 bluestore_deferred_op_t
*op
= _get_deferred_op(txc
, bl
.length());
13479 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
13480 int r
= b
->get_blob().map(
13482 [&](uint64_t offset
, uint64_t length
) {
13483 op
->extents
.emplace_back(bluestore_pextent_t(offset
, length
));
13486 ceph_assert(r
== 0);
13487 op
->data
= std::move(bl
);
13488 dout(20) << __func__
<< " deferred write 0x" << std::hex
<< b_off
<< "~"
13489 << b_len
<< std::dec
<< " of mutable " << *b
13490 << " at " << op
->extents
<< dendl
;
13493 Extent
*le
= o
->extent_map
.set_lextent(c
, offset
, offset
- bstart
, length
,
13494 b
, &wctx
->old_extents
);
13495 b
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
13496 txc
->statfs_delta
.stored() += le
->length
;
13497 dout(20) << __func__
<< " lex " << *le
<< dendl
;
13500 // try to reuse blob if we can
13501 if (b
->can_reuse_blob(min_alloc_size
,
13505 ceph_assert(alloc_len
== min_alloc_size
); // expecting data always
13506 // fit into reused blob
13507 // Need to check for pending writes desiring to
13508 // reuse the same pextent. The rationale is that during GC two chunks
13509 // from garbage blobs(compressed?) can share logical space within the same
13510 // AU. That's in turn might be caused by unaligned len in clone_range2.
13511 // Hence the second write will fail in an attempt to reuse blob at
13512 // do_alloc_write().
13513 if (!wctx
->has_conflict(b
,
13515 offset0
+ alloc_len
,
13518 // we can't reuse pad_head/pad_tail since they might be truncated
13519 // due to existent extents
13520 uint64_t b_off
= offset
- bstart
;
13521 uint64_t b_off0
= b_off
;
13522 _pad_zeros(&bl
, &b_off0
, chunk_size
);
13524 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
13525 << " (0x" << b_off0
<< "~" << bl
.length() << ")"
13526 << " (0x" << b_off
<< "~" << length
<< ")"
13527 << std::dec
<< dendl
;
13529 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
13530 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
,
13532 logger
->inc(l_bluestore_write_small_unused
);
13540 } // if (ep != end && ep->logical_offset < offset + max_bsize)
13542 // check extent for reuse in reverse order
13543 if (prev_ep
!= end
&& prev_ep
->logical_offset
>= min_off
) {
13544 BlobRef b
= prev_ep
->blob
;
13545 if (!above_blob_threshold
) {
13546 inspected_blobs
.insert(&b
->get_blob());
13547 above_blob_threshold
= inspected_blobs
.size() >= blob_threshold
;
13549 start_ep
= prev_ep
;
13550 auto bstart
= prev_ep
->blob_start();
13551 dout(20) << __func__
<< " considering " << *b
13552 << " bstart 0x" << std::hex
<< bstart
<< std::dec
<< dendl
;
13553 if (b
->can_reuse_blob(min_alloc_size
,
13557 ceph_assert(alloc_len
== min_alloc_size
); // expecting data always
13558 // fit into reused blob
13559 // Need to check for pending writes desiring to
13560 // reuse the same pextent. The rationale is that during GC two chunks
13561 // from garbage blobs(compressed?) can share logical space within the same
13562 // AU. That's in turn might be caused by unaligned len in clone_range2.
13563 // Hence the second write will fail in an attempt to reuse blob at
13564 // do_alloc_write().
13565 if (!wctx
->has_conflict(b
,
13567 offset0
+ alloc_len
,
13570 uint64_t chunk_size
= b
->get_blob().get_chunk_size(block_size
);
13571 uint64_t b_off
= offset
- bstart
;
13572 uint64_t b_off0
= b_off
;
13573 _pad_zeros(&bl
, &b_off0
, chunk_size
);
13575 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
13576 << " (0x" << b_off0
<< "~" << bl
.length() << ")"
13577 << " (0x" << b_off
<< "~" << length
<< ")"
13578 << std::dec
<< dendl
;
13580 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
13581 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
,
13583 logger
->inc(l_bluestore_write_small_unused
);
13587 if (prev_ep
!= begin
) {
13591 prev_ep
= end
; // to avoid useless first extent re-check
13593 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
13594 } while (any_change
);
13596 if (above_blob_threshold
) {
13597 dout(10) << __func__
<< " request GC, blobs >= " << inspected_blobs
.size()
13598 << " " << std::hex
<< min_off
<< "~" << max_off
<< std::dec
13600 ceph_assert(start_ep
!= end_ep
);
13601 for (auto ep
= start_ep
; ep
!= end_ep
; ++ep
) {
13602 dout(20) << __func__
<< " inserting for GC "
13603 << std::hex
<< ep
->logical_offset
<< "~" << ep
->length
13604 << std::dec
<< dendl
;
13606 wctx
->extents_to_gc
.union_insert(ep
->logical_offset
, ep
->length
);
13608 // insert newly written extent to GC
13609 wctx
->extents_to_gc
.union_insert(offset
, length
);
13610 dout(20) << __func__
<< " inserting (last) for GC "
13611 << std::hex
<< offset
<< "~" << length
13612 << std::dec
<< dendl
;
13615 BlobRef b
= c
->new_blob();
13616 uint64_t b_off
= p2phase
<uint64_t>(offset
, alloc_len
);
13617 uint64_t b_off0
= b_off
;
13618 _pad_zeros(&bl
, &b_off0
, block_size
);
13619 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
13620 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
,
13621 min_alloc_size
!= block_size
, // use 'unused' bitmap when alloc granularity
13622 // doesn't match disk one only
13628 bool BlueStore::BigDeferredWriteContext::can_defer(
13629 BlueStore::extent_map_t::iterator ep
,
13630 uint64_t prefer_deferred_size
,
13631 uint64_t block_size
,
13636 auto& blob
= ep
->blob
->get_blob();
13637 if (offset
>= ep
->blob_start() &&
13638 blob
.is_mutable()) {
13640 b_off
= offset
- ep
->blob_start();
13641 uint64_t chunk_size
= blob
.get_chunk_size(block_size
);
13642 uint64_t ondisk
= blob
.get_ondisk_length();
13643 used
= std::min(l
, ondisk
- b_off
);
13645 // will read some data to fill out the chunk?
13646 head_read
= p2phase
<uint64_t>(b_off
, chunk_size
);
13647 tail_read
= p2nphase
<uint64_t>(b_off
+ used
, chunk_size
);
13648 b_off
-= head_read
;
13650 ceph_assert(b_off
% chunk_size
== 0);
13651 ceph_assert(blob_aligned_len() % chunk_size
== 0);
13653 res
= blob_aligned_len() < prefer_deferred_size
&&
13654 blob_aligned_len() <= ondisk
&&
13655 blob
.is_allocated(b_off
, blob_aligned_len());
13657 blob_ref
= ep
->blob
;
13658 blob_start
= ep
->blob_start();
13664 bool BlueStore::BigDeferredWriteContext::apply_defer()
13666 int r
= blob_ref
->get_blob().map(
13667 b_off
, blob_aligned_len(),
13668 [&](const bluestore_pextent_t
& pext
,
13671 // apply deferred if overwrite breaks blob continuity only.
13672 // if it totally overlaps some pextent - fallback to regular write
13673 if (pext
.offset
< offset
||
13674 pext
.end() > offset
+ length
) {
13675 res_extents
.emplace_back(bluestore_pextent_t(offset
, length
));
13683 void BlueStore::_do_write_big_apply_deferred(
13687 BlueStore::BigDeferredWriteContext
& dctx
,
13688 bufferlist::iterator
& blp
,
13689 WriteContext
* wctx
)
13692 dout(20) << __func__
<< " reading head 0x" << std::hex
<< dctx
.head_read
13693 << " and tail 0x" << dctx
.tail_read
<< std::dec
<< dendl
;
13694 if (dctx
.head_read
) {
13695 int r
= _do_read(c
.get(), o
,
13696 dctx
.off
- dctx
.head_read
,
13700 ceph_assert(r
>= 0 && r
<= (int)dctx
.head_read
);
13701 size_t zlen
= dctx
.head_read
- r
;
13703 bl
.append_zero(zlen
);
13704 logger
->inc(l_bluestore_write_pad_bytes
, zlen
);
13706 logger
->inc(l_bluestore_write_penalty_read_ops
);
13708 blp
.copy(dctx
.used
, bl
);
13710 if (dctx
.tail_read
) {
13711 bufferlist tail_bl
;
13712 int r
= _do_read(c
.get(), o
,
13713 dctx
.off
+ dctx
.used
, dctx
.tail_read
,
13715 ceph_assert(r
>= 0 && r
<= (int)dctx
.tail_read
);
13716 size_t zlen
= dctx
.tail_read
- r
;
13718 tail_bl
.append_zero(zlen
);
13719 logger
->inc(l_bluestore_write_pad_bytes
, zlen
);
13721 bl
.claim_append(tail_bl
);
13722 logger
->inc(l_bluestore_write_penalty_read_ops
);
13724 auto& b0
= dctx
.blob_ref
;
13725 _buffer_cache_write(txc
, b0
, dctx
.b_off
, bl
,
13726 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
13728 b0
->dirty_blob().calc_csum(dctx
.b_off
, bl
);
13730 Extent
* le
= o
->extent_map
.set_lextent(c
, dctx
.off
,
13731 dctx
.off
- dctx
.blob_start
, dctx
.used
, b0
, &wctx
->old_extents
);
13733 // in fact this is a no-op for big writes but left here to maintain
13734 // uniformity and avoid missing after some refactor.
13735 b0
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
13736 txc
->statfs_delta
.stored() += le
->length
;
13738 if (!g_conf()->bluestore_debug_omit_block_device_write
) {
13739 bluestore_deferred_op_t
* op
= _get_deferred_op(txc
, bl
.length());
13740 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
13741 op
->extents
.swap(dctx
.res_extents
);
13742 op
->data
= std::move(bl
);
13746 void BlueStore::_do_write_big(
13750 uint64_t offset
, uint64_t length
,
13751 bufferlist::iterator
& blp
,
13752 WriteContext
*wctx
)
13754 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
13755 << " target_blob_size 0x" << wctx
->target_blob_size
<< std::dec
13756 << " compress " << (int)wctx
->compress
13758 logger
->inc(l_bluestore_write_big
);
13759 logger
->inc(l_bluestore_write_big_bytes
, length
);
13760 auto max_bsize
= std::max(wctx
->target_blob_size
, min_alloc_size
);
13761 uint64_t prefer_deferred_size_snapshot
= prefer_deferred_size
.load();
13762 while (length
> 0) {
13763 bool new_blob
= false;
13765 uint32_t b_off
= 0;
13768 //attempting to reuse existing blob
13769 if (!wctx
->compress
) {
13770 // enforce target blob alignment with max_bsize
13771 l
= max_bsize
- p2phase(offset
, max_bsize
);
13772 l
= std::min(uint64_t(l
), length
);
13774 auto end
= o
->extent_map
.extent_map
.end();
13776 dout(20) << __func__
<< " may be defer: 0x" << std::hex
13777 << offset
<< "~" << l
13778 << std::dec
<< dendl
;
13780 if (prefer_deferred_size_snapshot
&&
13781 l
<= prefer_deferred_size_snapshot
* 2) {
13782 // Single write that spans two adjusted existing blobs can result
13783 // in up to two deferred blocks of 'prefer_deferred_size'
13784 // So we're trying to minimize the amount of resulting blobs
13785 // and preserve 2 blobs rather than inserting one more in between
13786 // E.g. write 0x10000~20000 over existing blobs
13787 // (0x0~20000 and 0x20000~20000) is better (from subsequent reading
13788 // performance point of view) to result in two deferred writes to
13789 // existing blobs than having 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
13791 // look for an existing mutable blob we can write into
13792 auto ep
= o
->extent_map
.seek_lextent(offset
);
13793 auto ep_next
= end
;
13794 BigDeferredWriteContext head_info
, tail_info
;
13796 bool will_defer
= ep
!= end
?
13797 head_info
.can_defer(ep
,
13798 prefer_deferred_size_snapshot
,
13803 auto offset_next
= offset
+ head_info
.used
;
13804 auto remaining
= l
- head_info
.used
;
13805 if (will_defer
&& remaining
) {
13806 will_defer
= false;
13807 if (remaining
<= prefer_deferred_size_snapshot
) {
13808 ep_next
= o
->extent_map
.seek_lextent(offset_next
);
13809 // check if we can defer remaining totally
13810 will_defer
= ep_next
== end
?
13812 tail_info
.can_defer(ep_next
,
13813 prefer_deferred_size_snapshot
,
13817 will_defer
= will_defer
&& remaining
== tail_info
.used
;
13821 dout(20) << __func__
<< " " << *(head_info
.blob_ref
)
13822 << " deferring big " << std::hex
13823 << " (0x" << head_info
.b_off
<< "~" << head_info
.blob_aligned_len() << ")"
13824 << std::dec
<< " write via deferred"
13827 dout(20) << __func__
<< " " << *(tail_info
.blob_ref
)
13828 << " deferring big " << std::hex
13829 << " (0x" << tail_info
.b_off
<< "~" << tail_info
.blob_aligned_len() << ")"
13830 << std::dec
<< " write via deferred"
13834 will_defer
= head_info
.apply_defer();
13836 dout(20) << __func__
13837 << " deferring big fell back, head isn't continuous"
13839 } else if (remaining
) {
13840 will_defer
= tail_info
.apply_defer();
13842 dout(20) << __func__
13843 << " deferring big fell back, tail isn't continuous"
13849 _do_write_big_apply_deferred(txc
, c
, o
, head_info
, blp
, wctx
);
13851 _do_write_big_apply_deferred(txc
, c
, o
, tail_info
,
13854 dout(20) << __func__
<< " defer big: 0x" << std::hex
13855 << offset
<< "~" << l
13856 << std::dec
<< dendl
;
13859 logger
->inc(l_bluestore_write_big_blobs
, remaining
? 2 : 1);
13860 logger
->inc(l_bluestore_write_big_deferred
, remaining
? 2 : 1);
13864 dout(20) << __func__
<< " lookup for blocks to reuse..." << dendl
;
13866 o
->extent_map
.punch_hole(c
, offset
, l
, &wctx
->old_extents
);
13868 // seek again as punch_hole could invalidate ep
13869 auto ep
= o
->extent_map
.seek_lextent(offset
);
13870 auto begin
= o
->extent_map
.extent_map
.begin();
13871 auto prev_ep
= end
;
13877 auto min_off
= offset
>= max_bsize
? offset
- max_bsize
: 0;
13878 // search suitable extent in both forward and reverse direction in
13879 // [offset - target_max_blob_size, offset + target_max_blob_size] range
13880 // then check if blob can be reused via can_reuse_blob func.
13883 any_change
= false;
13884 if (ep
!= end
&& ep
->logical_offset
< offset
+ max_bsize
) {
13885 dout(20) << __func__
<< " considering " << *ep
13886 << " bstart 0x" << std::hex
<< ep
->blob_start() << std::dec
<< dendl
;
13888 if (offset
>= ep
->blob_start() &&
13889 ep
->blob
->can_reuse_blob(min_alloc_size
, max_bsize
,
13890 offset
- ep
->blob_start(),
13893 b_off
= offset
- ep
->blob_start();
13894 prev_ep
= end
; // to avoid check below
13895 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
13896 << " (0x" << b_off
<< "~" << l
<< ")" << std::dec
<< dendl
;
13903 if (prev_ep
!= end
&& prev_ep
->logical_offset
>= min_off
) {
13904 dout(20) << __func__
<< " considering rev " << *prev_ep
13905 << " bstart 0x" << std::hex
<< prev_ep
->blob_start() << std::dec
<< dendl
;
13906 if (prev_ep
->blob
->can_reuse_blob(min_alloc_size
, max_bsize
,
13907 offset
- prev_ep
->blob_start(),
13910 b_off
= offset
- prev_ep
->blob_start();
13911 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
13912 << " (0x" << b_off
<< "~" << l
<< ")" << std::dec
<< dendl
;
13913 } else if (prev_ep
!= begin
) {
13917 prev_ep
= end
; // to avoid useless first extent re-check
13920 } while (b
== nullptr && any_change
);
13922 // trying to utilize as longer chunk as permitted in case of compression.
13923 l
= std::min(max_bsize
, length
);
13924 o
->extent_map
.punch_hole(c
, offset
, l
, &wctx
->old_extents
);
13925 } // if (!wctx->compress)
13927 if (b
== nullptr) {
13934 wctx
->write(offset
, b
, l
, b_off
, t
, b_off
, l
, false, new_blob
);
13935 dout(20) << __func__
<< " schedule write big: 0x"
13936 << std::hex
<< offset
<< "~" << l
<< std::dec
13937 << (new_blob
? " new " : " reuse ")
13941 logger
->inc(l_bluestore_write_big_blobs
);
13945 int BlueStore::_do_alloc_write(
13947 CollectionRef coll
,
13949 WriteContext
*wctx
)
13951 dout(20) << __func__
<< " txc " << txc
13952 << " " << wctx
->writes
.size() << " blobs"
13954 if (wctx
->writes
.empty()) {
13960 if (wctx
->compress
) {
13962 "compression_algorithm",
13966 if (coll
->pool_opts
.get(pool_opts_t::COMPRESSION_ALGORITHM
, &val
)) {
13967 CompressorRef cp
= compressor
;
13968 if (!cp
|| cp
->get_type_name() != val
) {
13969 cp
= Compressor::create(cct
, val
);
13971 if (_set_compression_alert(false, val
.c_str())) {
13972 derr
<< __func__
<< " unable to initialize " << val
.c_str()
13973 << " compressor" << dendl
;
13977 return boost::optional
<CompressorRef
>(cp
);
13979 return boost::optional
<CompressorRef
>();
13983 crr
= select_option(
13984 "compression_required_ratio",
13985 cct
->_conf
->bluestore_compression_required_ratio
,
13988 if (coll
->pool_opts
.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO
, &val
)) {
13989 return boost::optional
<double>(val
);
13991 return boost::optional
<double>();
13997 int64_t csum
= csum_type
.load();
13998 csum
= select_option(
14003 if (coll
->pool_opts
.get(pool_opts_t::CSUM_TYPE
, &val
)) {
14004 return boost::optional
<int64_t>(val
);
14006 return boost::optional
<int64_t>();
14010 // compress (as needed) and calc needed space
14012 auto max_bsize
= std::max(wctx
->target_blob_size
, min_alloc_size
);
14013 for (auto& wi
: wctx
->writes
) {
14014 if (c
&& wi
.blob_length
> min_alloc_size
) {
14015 auto start
= mono_clock::now();
14018 ceph_assert(wi
.b_off
== 0);
14019 ceph_assert(wi
.blob_length
== wi
.bl
.length());
14021 // FIXME: memory alignment here is bad
14023 boost::optional
<int32_t> compressor_message
;
14024 int r
= c
->compress(wi
.bl
, t
, compressor_message
);
14025 uint64_t want_len_raw
= wi
.blob_length
* crr
;
14026 uint64_t want_len
= p2roundup(want_len_raw
, min_alloc_size
);
14027 bool rejected
= false;
14028 uint64_t compressed_len
= t
.length();
14029 // do an approximate (fast) estimation for resulting blob size
14030 // that doesn't take header overhead into account
14031 uint64_t result_len
= p2roundup(compressed_len
, min_alloc_size
);
14032 if (r
== 0 && result_len
<= want_len
&& result_len
< wi
.blob_length
) {
14033 bluestore_compression_header_t chdr
;
14034 chdr
.type
= c
->get_type();
14035 chdr
.length
= t
.length();
14036 chdr
.compressor_message
= compressor_message
;
14037 encode(chdr
, wi
.compressed_bl
);
14038 wi
.compressed_bl
.claim_append(t
);
14040 compressed_len
= wi
.compressed_bl
.length();
14041 result_len
= p2roundup(compressed_len
, min_alloc_size
);
14042 if (result_len
<= want_len
&& result_len
< wi
.blob_length
) {
14043 // Cool. We compressed at least as much as we were hoping to.
14044 // pad out to min_alloc_size
14045 wi
.compressed_bl
.append_zero(result_len
- compressed_len
);
14046 wi
.compressed_len
= compressed_len
;
14047 wi
.compressed
= true;
14048 logger
->inc(l_bluestore_write_pad_bytes
, result_len
- compressed_len
);
14049 dout(20) << __func__
<< std::hex
<< " compressed 0x" << wi
.blob_length
14050 << " -> 0x" << compressed_len
<< " => 0x" << result_len
14051 << " with " << c
->get_type()
14052 << std::dec
<< dendl
;
14053 txc
->statfs_delta
.compressed() += compressed_len
;
14054 txc
->statfs_delta
.compressed_original() += wi
.blob_length
;
14055 txc
->statfs_delta
.compressed_allocated() += result_len
;
14056 logger
->inc(l_bluestore_compress_success_count
);
14057 need
+= result_len
;
14061 } else if (r
!= 0) {
14062 dout(5) << __func__
<< std::hex
<< " 0x" << wi
.blob_length
14063 << " bytes compressed using " << c
->get_type_name()
14065 << " failed with errcode = " << r
14066 << ", leaving uncompressed"
14068 logger
->inc(l_bluestore_compress_rejected_count
);
14069 need
+= wi
.blob_length
;
14075 dout(20) << __func__
<< std::hex
<< " 0x" << wi
.blob_length
14076 << " compressed to 0x" << compressed_len
<< " -> 0x" << result_len
14077 << " with " << c
->get_type()
14078 << ", which is more than required 0x" << want_len_raw
14079 << " -> 0x" << want_len
14080 << ", leaving uncompressed"
14081 << std::dec
<< dendl
;
14082 logger
->inc(l_bluestore_compress_rejected_count
);
14083 need
+= wi
.blob_length
;
14085 log_latency("compress@_do_alloc_write",
14086 l_bluestore_compress_lat
,
14087 mono_clock::now() - start
,
14088 cct
->_conf
->bluestore_log_op_age
);
14090 need
+= wi
.blob_length
;
14093 PExtentVector prealloc
;
14094 prealloc
.reserve(2 * wctx
->writes
.size());;
14095 int64_t prealloc_left
= 0;
14096 prealloc_left
= shared_alloc
.a
->allocate(
14097 need
, min_alloc_size
, need
,
14099 if (prealloc_left
< 0 || prealloc_left
< (int64_t)need
) {
14100 derr
<< __func__
<< " failed to allocate 0x" << std::hex
<< need
14101 << " allocated 0x " << (prealloc_left
< 0 ? 0 : prealloc_left
)
14102 << " min_alloc_size 0x" << min_alloc_size
14103 << " available 0x " << shared_alloc
.a
->get_free()
14104 << std::dec
<< dendl
;
14105 if (prealloc
.size()) {
14106 shared_alloc
.a
->release(prealloc
);
14110 _collect_allocation_stats(need
, min_alloc_size
, prealloc
.size());
14112 if (bdev
->is_smr()) {
14113 std::deque
<uint64_t> zones_to_clean
;
14114 if (shared_alloc
.a
->zoned_get_zones_to_clean(&zones_to_clean
)) {
14115 std::lock_guard l
{zoned_cleaner_lock
};
14116 zoned_cleaner_queue
.swap(zones_to_clean
);
14117 zoned_cleaner_cond
.notify_one();
14121 dout(20) << __func__
<< " prealloc " << prealloc
<< dendl
;
14122 auto prealloc_pos
= prealloc
.begin();
14123 ceph_assert(prealloc_pos
!= prealloc
.end());
14124 uint64_t prealloc_pos_length
= prealloc_pos
->length
;
14126 for (auto& wi
: wctx
->writes
) {
14127 bluestore_blob_t
& dblob
= wi
.b
->dirty_blob();
14128 uint64_t b_off
= wi
.b_off
;
14129 bufferlist
*l
= &wi
.bl
;
14130 uint64_t final_length
= wi
.blob_length
;
14131 uint64_t csum_length
= wi
.blob_length
;
14132 if (wi
.compressed
) {
14133 final_length
= wi
.compressed_bl
.length();
14134 csum_length
= final_length
;
14135 unsigned csum_order
= ctz(csum_length
);
14136 l
= &wi
.compressed_bl
;
14137 dblob
.set_compressed(wi
.blob_length
, wi
.compressed_len
);
14138 if (csum
!= Checksummer::CSUM_NONE
) {
14139 dout(20) << __func__
14140 << " initialize csum setting for compressed blob " << *wi
.b
14141 << " csum_type " << Checksummer::get_csum_type_string(csum
)
14142 << " csum_order " << csum_order
14143 << " csum_length 0x" << std::hex
<< csum_length
14144 << " blob_length 0x" << wi
.blob_length
14145 << " compressed_length 0x" << wi
.compressed_len
<< std::dec
14147 dblob
.init_csum(csum
, csum_order
, csum_length
);
14149 } else if (wi
.new_blob
) {
14150 unsigned csum_order
;
14151 // initialize newly created blob only
14152 ceph_assert(dblob
.is_mutable());
14153 if (l
->length() != wi
.blob_length
) {
14154 // hrm, maybe we could do better here, but let's not bother.
14155 dout(20) << __func__
<< " forcing csum_order to block_size_order "
14156 << block_size_order
<< dendl
;
14157 csum_order
= block_size_order
;
14159 csum_order
= std::min(wctx
->csum_order
, ctz(l
->length()));
14161 // try to align blob with max_blob_size to improve
14162 // its reuse ratio, e.g. in case of reverse write
14163 uint32_t suggested_boff
=
14164 (wi
.logical_offset
- (wi
.b_off0
- wi
.b_off
)) % max_bsize
;
14165 if ((suggested_boff
% (1 << csum_order
)) == 0 &&
14166 suggested_boff
+ final_length
<= max_bsize
&&
14167 suggested_boff
> b_off
) {
14168 dout(20) << __func__
<< " forcing blob_offset to 0x"
14169 << std::hex
<< suggested_boff
<< std::dec
<< dendl
;
14170 ceph_assert(suggested_boff
>= b_off
);
14171 csum_length
+= suggested_boff
- b_off
;
14172 b_off
= suggested_boff
;
14174 if (csum
!= Checksummer::CSUM_NONE
) {
14175 dout(20) << __func__
14176 << " initialize csum setting for new blob " << *wi
.b
14177 << " csum_type " << Checksummer::get_csum_type_string(csum
)
14178 << " csum_order " << csum_order
14179 << " csum_length 0x" << std::hex
<< csum_length
<< std::dec
14181 dblob
.init_csum(csum
, csum_order
, csum_length
);
14185 PExtentVector extents
;
14186 int64_t left
= final_length
;
14187 bool has_chunk2defer
= false;
14188 auto prefer_deferred_size_snapshot
= prefer_deferred_size
.load();
14190 ceph_assert(prealloc_left
> 0);
14191 has_chunk2defer
|= (prealloc_pos_length
< prefer_deferred_size_snapshot
);
14192 if (prealloc_pos
->length
<= left
) {
14193 prealloc_left
-= prealloc_pos
->length
;
14194 left
-= prealloc_pos
->length
;
14195 txc
->statfs_delta
.allocated() += prealloc_pos
->length
;
14196 extents
.push_back(*prealloc_pos
);
14198 if (prealloc_pos
!= prealloc
.end()) {
14199 prealloc_pos_length
= prealloc_pos
->length
;
14202 extents
.emplace_back(prealloc_pos
->offset
, left
);
14203 prealloc_pos
->offset
+= left
;
14204 prealloc_pos
->length
-= left
;
14205 prealloc_left
-= left
;
14206 txc
->statfs_delta
.allocated() += left
;
14211 for (auto& p
: extents
) {
14212 txc
->allocated
.insert(p
.offset
, p
.length
);
14214 dblob
.allocated(p2align(b_off
, min_alloc_size
), final_length
, extents
);
14216 dout(20) << __func__
<< " blob " << *wi
.b
<< dendl
;
14217 if (dblob
.has_csum()) {
14218 dblob
.calc_csum(b_off
, *l
);
14221 if (wi
.mark_unused
) {
14222 ceph_assert(!dblob
.is_compressed());
14223 auto b_end
= b_off
+ wi
.bl
.length();
14225 dblob
.add_unused(0, b_off
);
14227 uint64_t llen
= dblob
.get_logical_length();
14228 if (b_end
< llen
) {
14229 dblob
.add_unused(b_end
, llen
- b_end
);
14233 Extent
*le
= o
->extent_map
.set_lextent(coll
, wi
.logical_offset
,
14234 b_off
+ (wi
.b_off0
- wi
.b_off
),
14238 wi
.b
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
14239 txc
->statfs_delta
.stored() += le
->length
;
14240 dout(20) << __func__
<< " lex " << *le
<< dendl
;
14241 _buffer_cache_write(txc
, wi
.b
, b_off
, wi
.bl
,
14242 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
14245 if (!g_conf()->bluestore_debug_omit_block_device_write
) {
14246 if (has_chunk2defer
&& l
->length() < prefer_deferred_size_snapshot
) {
14247 dout(20) << __func__
<< " deferring 0x" << std::hex
14248 << l
->length() << std::dec
<< " write via deferred" << dendl
;
14249 bluestore_deferred_op_t
*op
= _get_deferred_op(txc
, l
->length());
14250 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
14251 int r
= wi
.b
->get_blob().map(
14252 b_off
, l
->length(),
14253 [&](uint64_t offset
, uint64_t length
) {
14254 op
->extents
.emplace_back(bluestore_pextent_t(offset
, length
));
14257 ceph_assert(r
== 0);
14260 wi
.b
->get_blob().map_bl(
14262 [&](uint64_t offset
, bufferlist
& t
) {
14263 bdev
->aio_write(offset
, t
, &txc
->ioc
, false);
14265 logger
->inc(l_bluestore_write_new
);
14269 ceph_assert(prealloc_pos
== prealloc
.end());
14270 ceph_assert(prealloc_left
== 0);
14274 void BlueStore::_wctx_finish(
14278 WriteContext
*wctx
,
14279 set
<SharedBlob
*> *maybe_unshared_blobs
)
14281 auto oep
= wctx
->old_extents
.begin();
14282 while (oep
!= wctx
->old_extents
.end()) {
14284 oep
= wctx
->old_extents
.erase(oep
);
14285 dout(20) << __func__
<< " lex_old " << lo
.e
<< dendl
;
14286 BlobRef b
= lo
.e
.blob
;
14287 const bluestore_blob_t
& blob
= b
->get_blob();
14288 if (blob
.is_compressed()) {
14289 if (lo
.blob_empty
) {
14290 txc
->statfs_delta
.compressed() -= blob
.get_compressed_payload_length();
14292 txc
->statfs_delta
.compressed_original() -= lo
.e
.length
;
14295 txc
->statfs_delta
.stored() -= lo
.e
.length
;
14297 dout(20) << __func__
<< " blob " << *b
<< " release " << r
<< dendl
;
14298 if (blob
.is_shared()) {
14299 PExtentVector final
;
14300 c
->load_shared_blob(b
->shared_blob
);
14301 bool unshare
= false;
14302 bool* unshare_ptr
=
14303 !maybe_unshared_blobs
|| b
->is_referenced() ? nullptr : &unshare
;
14305 b
->shared_blob
->put_ref(
14306 e
.offset
, e
.length
, &final
,
14310 ceph_assert(maybe_unshared_blobs
);
14311 maybe_unshared_blobs
->insert(b
->shared_blob
.get());
14313 dout(20) << __func__
<< " shared_blob release " << final
14314 << " from " << *b
->shared_blob
<< dendl
;
14315 txc
->write_shared_blob(b
->shared_blob
);
14320 // we can't invalidate our logical extents as we drop them because
14321 // other lextents (either in our onode or others) may still
14322 // reference them. but we can throw out anything that is no
14323 // longer allocated. Note that this will leave behind edge bits
14324 // that are no longer referenced but not deallocated (until they
14325 // age out of the cache naturally).
14326 b
->discard_unallocated(c
.get());
14328 dout(20) << __func__
<< " release " << e
<< dendl
;
14329 txc
->released
.insert(e
.offset
, e
.length
);
14330 txc
->statfs_delta
.allocated() -= e
.length
;
14331 if (blob
.is_compressed()) {
14332 txc
->statfs_delta
.compressed_allocated() -= e
.length
;
14336 if (b
->is_spanning() && !b
->is_referenced() && lo
.blob_empty
) {
14337 dout(20) << __func__
<< " spanning_blob_map removing empty " << *b
14339 o
->extent_map
.spanning_blob_map
.erase(b
->id
);
14345 void BlueStore::_do_write_data(
14352 WriteContext
*wctx
)
14354 uint64_t end
= offset
+ length
;
14355 bufferlist::iterator p
= bl
.begin();
14357 if (offset
/ min_alloc_size
== (end
- 1) / min_alloc_size
&&
14358 (length
!= min_alloc_size
)) {
14359 // we fall within the same block
14360 _do_write_small(txc
, c
, o
, offset
, length
, p
, wctx
);
14362 uint64_t head_offset
, head_length
;
14363 uint64_t middle_offset
, middle_length
;
14364 uint64_t tail_offset
, tail_length
;
14366 head_offset
= offset
;
14367 head_length
= p2nphase(offset
, min_alloc_size
);
14369 tail_offset
= p2align(end
, min_alloc_size
);
14370 tail_length
= p2phase(end
, min_alloc_size
);
14372 middle_offset
= head_offset
+ head_length
;
14373 middle_length
= length
- head_length
- tail_length
;
14376 _do_write_small(txc
, c
, o
, head_offset
, head_length
, p
, wctx
);
14379 _do_write_big(txc
, c
, o
, middle_offset
, middle_length
, p
, wctx
);
14382 _do_write_small(txc
, c
, o
, tail_offset
, tail_length
, p
, wctx
);
14387 void BlueStore::_choose_write_options(
14390 uint32_t fadvise_flags
,
14391 WriteContext
*wctx
)
14393 if (fadvise_flags
& CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) {
14394 dout(20) << __func__
<< " will do buffered write" << dendl
;
14395 wctx
->buffered
= true;
14396 } else if (cct
->_conf
->bluestore_default_buffered_write
&&
14397 (fadvise_flags
& (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
|
14398 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
)) == 0) {
14399 dout(20) << __func__
<< " defaulting to buffered write" << dendl
;
14400 wctx
->buffered
= true;
14403 // apply basic csum block size
14404 wctx
->csum_order
= block_size_order
;
14406 // compression parameters
14407 unsigned alloc_hints
= o
->onode
.alloc_hint_flags
;
14408 auto cm
= select_option(
14409 "compression_mode",
14413 if (c
->pool_opts
.get(pool_opts_t::COMPRESSION_MODE
, &val
)) {
14414 return boost::optional
<Compressor::CompressionMode
>(
14415 Compressor::get_comp_mode_type(val
));
14417 return boost::optional
<Compressor::CompressionMode
>();
14421 wctx
->compress
= (cm
!= Compressor::COMP_NONE
) &&
14422 ((cm
== Compressor::COMP_FORCE
) ||
14423 (cm
== Compressor::COMP_AGGRESSIVE
&&
14424 (alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE
) == 0) ||
14425 (cm
== Compressor::COMP_PASSIVE
&&
14426 (alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE
)));
14428 if ((alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ
) &&
14429 (alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ
) == 0 &&
14430 (alloc_hints
& (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE
|
14431 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY
)) &&
14432 (alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE
) == 0) {
14434 dout(20) << __func__
<< " will prefer large blob and csum sizes" << dendl
;
14436 if (o
->onode
.expected_write_size
) {
14437 wctx
->csum_order
= std::max(min_alloc_size_order
,
14438 (uint8_t)ctz(o
->onode
.expected_write_size
));
14440 wctx
->csum_order
= min_alloc_size_order
;
14443 if (wctx
->compress
) {
14444 wctx
->target_blob_size
= select_option(
14445 "compression_max_blob_size",
14446 comp_max_blob_size
.load(),
14449 if (c
->pool_opts
.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE
, &val
)) {
14450 return boost::optional
<uint64_t>((uint64_t)val
);
14452 return boost::optional
<uint64_t>();
14457 if (wctx
->compress
) {
14458 wctx
->target_blob_size
= select_option(
14459 "compression_min_blob_size",
14460 comp_min_blob_size
.load(),
14463 if (c
->pool_opts
.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE
, &val
)) {
14464 return boost::optional
<uint64_t>((uint64_t)val
);
14466 return boost::optional
<uint64_t>();
14472 uint64_t max_bsize
= max_blob_size
.load();
14473 if (wctx
->target_blob_size
== 0 || wctx
->target_blob_size
> max_bsize
) {
14474 wctx
->target_blob_size
= max_bsize
;
14477 // set the min blob size floor at 2x the min_alloc_size, or else we
14478 // won't be able to allocate a smaller extent for the compressed
14480 if (wctx
->compress
&&
14481 wctx
->target_blob_size
< min_alloc_size
* 2) {
14482 wctx
->target_blob_size
= min_alloc_size
* 2;
14485 dout(20) << __func__
<< " prefer csum_order " << wctx
->csum_order
14486 << " target_blob_size 0x" << std::hex
<< wctx
->target_blob_size
14487 << " compress=" << (int)wctx
->compress
14488 << " buffered=" << (int)wctx
->buffered
14489 << std::dec
<< dendl
;
14492 int BlueStore::_do_gc(
14496 const WriteContext
& wctx
,
14497 uint64_t *dirty_start
,
14498 uint64_t *dirty_end
)
14501 bool dirty_range_updated
= false;
14502 WriteContext wctx_gc
;
14503 wctx_gc
.fork(wctx
); // make a clone for garbage collection
14505 auto & extents_to_collect
= wctx
.extents_to_gc
;
14506 for (auto it
= extents_to_collect
.begin();
14507 it
!= extents_to_collect
.end();
14510 auto offset
= (*it
).first
;
14511 auto length
= (*it
).second
;
14512 dout(20) << __func__
<< " processing " << std::hex
14513 << offset
<< "~" << length
<< std::dec
14515 int r
= _do_read(c
.get(), o
, offset
, length
, bl
, 0);
14516 ceph_assert(r
== (int)length
);
14518 _do_write_data(txc
, c
, o
, offset
, length
, bl
, &wctx_gc
);
14519 logger
->inc(l_bluestore_gc_merged
, length
);
14521 if (*dirty_start
> offset
) {
14522 *dirty_start
= offset
;
14523 dirty_range_updated
= true;
14526 if (*dirty_end
< offset
+ length
) {
14527 *dirty_end
= offset
+ length
;
14528 dirty_range_updated
= true;
14531 if (dirty_range_updated
) {
14532 o
->extent_map
.fault_range(db
, *dirty_start
, *dirty_end
);
14535 dout(30) << __func__
<< " alloc write" << dendl
;
14536 int r
= _do_alloc_write(txc
, c
, o
, &wctx_gc
);
14538 derr
<< __func__
<< " _do_alloc_write failed with " << cpp_strerror(r
)
14543 _wctx_finish(txc
, c
, o
, &wctx_gc
);
14547 int BlueStore::_do_write(
14554 uint32_t fadvise_flags
)
14558 dout(20) << __func__
14560 << " 0x" << std::hex
<< offset
<< "~" << length
14561 << " - have 0x" << o
->onode
.size
14562 << " (" << std::dec
<< o
->onode
.size
<< ")"
14563 << " bytes" << std::hex
14564 << " fadvise_flags 0x" << fadvise_flags
14565 << " alloc_hint 0x" << o
->onode
.alloc_hint_flags
14566 << " expected_object_size " << o
->onode
.expected_object_size
14567 << " expected_write_size " << o
->onode
.expected_write_size
14570 _dump_onode
<30>(cct
, *o
);
14576 uint64_t end
= offset
+ length
;
14578 GarbageCollector
gc(c
->store
->cct
);
14579 int64_t benefit
= 0;
14580 auto dirty_start
= offset
;
14581 auto dirty_end
= end
;
14584 _choose_write_options(c
, o
, fadvise_flags
, &wctx
);
14585 o
->extent_map
.fault_range(db
, offset
, length
);
14586 _do_write_data(txc
, c
, o
, offset
, length
, bl
, &wctx
);
14587 r
= _do_alloc_write(txc
, c
, o
, &wctx
);
14589 derr
<< __func__
<< " _do_alloc_write failed with " << cpp_strerror(r
)
14594 if (wctx
.extents_to_gc
.empty() ||
14595 wctx
.extents_to_gc
.range_start() > offset
||
14596 wctx
.extents_to_gc
.range_end() < offset
+ length
) {
14597 benefit
= gc
.estimate(offset
,
14604 if (bdev
->is_smr()) {
14605 if (wctx
.old_extents
.empty()) {
14606 txc
->zoned_note_new_object(o
);
14608 int64_t old_ondisk_offset
= wctx
.old_extents
.begin()->r
.begin()->offset
;
14609 txc
->zoned_note_updated_object(o
, old_ondisk_offset
);
14613 // NB: _wctx_finish() will empty old_extents
14614 // so we must do gc estimation before that
14615 _wctx_finish(txc
, c
, o
, &wctx
);
14616 if (end
> o
->onode
.size
) {
14617 dout(20) << __func__
<< " extending size to 0x" << std::hex
<< end
14618 << std::dec
<< dendl
;
14619 o
->onode
.size
= end
;
14622 if (benefit
>= g_conf()->bluestore_gc_enable_total_threshold
) {
14623 wctx
.extents_to_gc
.union_of(gc
.get_extents_to_collect());
14624 dout(20) << __func__
14625 << " perform garbage collection for compressed extents, "
14626 << "expected benefit = " << benefit
<< " AUs" << dendl
;
14628 if (!wctx
.extents_to_gc
.empty()) {
14629 dout(20) << __func__
<< " perform garbage collection" << dendl
;
14631 r
= _do_gc(txc
, c
, o
,
14633 &dirty_start
, &dirty_end
);
14635 derr
<< __func__
<< " _do_gc failed with " << cpp_strerror(r
)
14639 dout(20)<<__func__
<<" gc range is " << std::hex
<< dirty_start
14640 << "~" << dirty_end
- dirty_start
<< std::dec
<< dendl
;
14642 o
->extent_map
.compress_extent_map(dirty_start
, dirty_end
- dirty_start
);
14643 o
->extent_map
.dirty_range(dirty_start
, dirty_end
- dirty_start
);
14651 int BlueStore::_write(TransContext
*txc
,
14654 uint64_t offset
, size_t length
,
14656 uint32_t fadvise_flags
)
14658 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14659 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
14662 if (offset
+ length
>= OBJECT_MAX_SIZE
) {
14665 _assign_nid(txc
, o
);
14666 r
= _do_write(txc
, c
, o
, offset
, length
, bl
, fadvise_flags
);
14667 txc
->write_onode(o
);
14669 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
14670 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
14671 << " = " << r
<< dendl
;
14675 int BlueStore::_zero(TransContext
*txc
,
14678 uint64_t offset
, size_t length
)
14680 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14681 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
14684 if (offset
+ length
>= OBJECT_MAX_SIZE
) {
14687 _assign_nid(txc
, o
);
14688 r
= _do_zero(txc
, c
, o
, offset
, length
);
14690 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
14691 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
14692 << " = " << r
<< dendl
;
14696 int BlueStore::_do_zero(TransContext
*txc
,
14699 uint64_t offset
, size_t length
)
14701 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14702 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
14706 _dump_onode
<30>(cct
, *o
);
14709 o
->extent_map
.fault_range(db
, offset
, length
);
14710 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
.old_extents
);
14711 o
->extent_map
.dirty_range(offset
, length
);
14712 _wctx_finish(txc
, c
, o
, &wctx
);
14714 if (length
> 0 && offset
+ length
> o
->onode
.size
) {
14715 o
->onode
.size
= offset
+ length
;
14716 dout(20) << __func__
<< " extending size to " << offset
+ length
14719 txc
->write_onode(o
);
14721 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
14722 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
14723 << " = " << r
<< dendl
;
14727 void BlueStore::_do_truncate(
14728 TransContext
*txc
, CollectionRef
& c
, OnodeRef o
, uint64_t offset
,
14729 set
<SharedBlob
*> *maybe_unshared_blobs
)
14731 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14732 << " 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
14734 _dump_onode
<30>(cct
, *o
);
14736 if (offset
== o
->onode
.size
)
14740 if (offset
< o
->onode
.size
) {
14741 uint64_t length
= o
->onode
.size
- offset
;
14742 o
->extent_map
.fault_range(db
, offset
, length
);
14743 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
.old_extents
);
14744 o
->extent_map
.dirty_range(offset
, length
);
14745 _wctx_finish(txc
, c
, o
, &wctx
, maybe_unshared_blobs
);
14747 // if we have shards past EOF, ask for a reshard
14748 if (!o
->onode
.extent_map_shards
.empty() &&
14749 o
->onode
.extent_map_shards
.back().offset
>= offset
) {
14750 dout(10) << __func__
<< " request reshard past EOF" << dendl
;
14752 o
->extent_map
.request_reshard(offset
- 1, offset
+ length
);
14754 o
->extent_map
.request_reshard(0, length
);
14759 o
->onode
.size
= offset
;
14761 if (bdev
->is_smr()) {
14762 // On zoned devices, we currently support only removing an object or
14763 // truncating it to zero size, both of which fall through this code path.
14764 ceph_assert(offset
== 0 && !wctx
.old_extents
.empty());
14765 int64_t ondisk_offset
= wctx
.old_extents
.begin()->r
.begin()->offset
;
14766 txc
->zoned_note_truncated_object(o
, ondisk_offset
);
14769 txc
->write_onode(o
);
14772 int BlueStore::_truncate(TransContext
*txc
,
14777 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14778 << " 0x" << std::hex
<< offset
<< std::dec
14781 if (offset
>= OBJECT_MAX_SIZE
) {
14784 _do_truncate(txc
, c
, o
, offset
);
14786 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
14787 << " 0x" << std::hex
<< offset
<< std::dec
14788 << " = " << r
<< dendl
;
14792 int BlueStore::_do_remove(
14797 set
<SharedBlob
*> maybe_unshared_blobs
;
14798 bool is_gen
= !o
->oid
.is_no_gen();
14799 _do_truncate(txc
, c
, o
, 0, is_gen
? &maybe_unshared_blobs
: nullptr);
14800 if (o
->onode
.has_omap()) {
14802 _do_omap_clear(txc
, o
);
14806 for (auto &s
: o
->extent_map
.shards
) {
14807 dout(20) << __func__
<< " removing shard 0x" << std::hex
14808 << s
.shard_info
->offset
<< std::dec
<< dendl
;
14809 generate_extent_shard_key_and_apply(o
->key
, s
.shard_info
->offset
, &key
,
14810 [&](const string
& final_key
) {
14811 txc
->t
->rmkey(PREFIX_OBJ
, final_key
);
14815 txc
->t
->rmkey(PREFIX_OBJ
, o
->key
.c_str(), o
->key
.size());
14816 txc
->note_removed_object(o
);
14817 o
->extent_map
.clear();
14818 o
->onode
= bluestore_onode_t();
14819 _debug_obj_on_delete(o
->oid
);
14821 if (!is_gen
|| maybe_unshared_blobs
.empty()) {
14825 // see if we can unshare blobs still referenced by the head
14826 dout(10) << __func__
<< " gen and maybe_unshared_blobs "
14827 << maybe_unshared_blobs
<< dendl
;
14828 ghobject_t nogen
= o
->oid
;
14829 nogen
.generation
= ghobject_t::NO_GEN
;
14830 OnodeRef h
= c
->get_onode(nogen
, false);
14832 if (!h
|| !h
->exists
) {
14836 dout(20) << __func__
<< " checking for unshareable blobs on " << h
14837 << " " << h
->oid
<< dendl
;
14838 map
<SharedBlob
*,bluestore_extent_ref_map_t
> expect
;
14839 for (auto& e
: h
->extent_map
.extent_map
) {
14840 const bluestore_blob_t
& b
= e
.blob
->get_blob();
14841 SharedBlob
*sb
= e
.blob
->shared_blob
.get();
14842 if (b
.is_shared() &&
14844 maybe_unshared_blobs
.count(sb
)) {
14845 if (b
.is_compressed()) {
14846 expect
[sb
].get(0, b
.get_ondisk_length());
14848 b
.map(e
.blob_offset
, e
.length
, [&](uint64_t off
, uint64_t len
) {
14849 expect
[sb
].get(off
, len
);
14856 vector
<SharedBlob
*> unshared_blobs
;
14857 unshared_blobs
.reserve(maybe_unshared_blobs
.size());
14858 for (auto& p
: expect
) {
14859 dout(20) << " ? " << *p
.first
<< " vs " << p
.second
<< dendl
;
14860 if (p
.first
->persistent
->ref_map
== p
.second
) {
14861 SharedBlob
*sb
= p
.first
;
14862 dout(20) << __func__
<< " unsharing " << *sb
<< dendl
;
14863 unshared_blobs
.push_back(sb
);
14864 txc
->unshare_blob(sb
);
14865 uint64_t sbid
= c
->make_blob_unshared(sb
);
14867 get_shared_blob_key(sbid
, &key
);
14868 txc
->t
->rmkey(PREFIX_SHARED_BLOB
, key
);
14872 if (unshared_blobs
.empty()) {
14876 for (auto& e
: h
->extent_map
.extent_map
) {
14877 const bluestore_blob_t
& b
= e
.blob
->get_blob();
14878 SharedBlob
*sb
= e
.blob
->shared_blob
.get();
14879 if (b
.is_shared() &&
14880 std::find(unshared_blobs
.begin(), unshared_blobs
.end(),
14881 sb
) != unshared_blobs
.end()) {
14882 dout(20) << __func__
<< " unsharing " << e
<< dendl
;
14883 bluestore_blob_t
& blob
= e
.blob
->dirty_blob();
14884 blob
.clear_flag(bluestore_blob_t::FLAG_SHARED
);
14885 h
->extent_map
.dirty_range(e
.logical_offset
, 1);
14888 txc
->write_onode(h
);
14893 int BlueStore::_remove(TransContext
*txc
,
14897 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14898 << " onode " << o
.get()
14899 << " txc "<< txc
<< dendl
;
14901 auto start_time
= mono_clock::now();
14902 int r
= _do_remove(txc
, c
, o
);
14905 l_bluestore_remove_lat
,
14906 mono_clock::now() - start_time
,
14907 cct
->_conf
->bluestore_log_op_age
,
14908 [&](const ceph::timespan
& lat
) {
14909 ostringstream ostr
;
14910 ostr
<< ", lat = " << timespan_str(lat
)
14911 << " cid =" << c
->cid
14912 << " oid =" << o
->oid
;
14917 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
14921 int BlueStore::_setattr(TransContext
*txc
,
14924 const string
& name
,
14927 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14928 << " " << name
<< " (" << val
.length() << " bytes)"
14931 if (val
.is_partial()) {
14932 auto& b
= o
->onode
.attrs
[name
.c_str()] = bufferptr(val
.c_str(),
14934 b
.reassign_to_mempool(mempool::mempool_bluestore_cache_meta
);
14936 auto& b
= o
->onode
.attrs
[name
.c_str()] = val
;
14937 b
.reassign_to_mempool(mempool::mempool_bluestore_cache_meta
);
14939 txc
->write_onode(o
);
14940 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
14941 << " " << name
<< " (" << val
.length() << " bytes)"
14942 << " = " << r
<< dendl
;
14946 int BlueStore::_setattrs(TransContext
*txc
,
14949 const map
<string
,bufferptr
>& aset
)
14951 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14952 << " " << aset
.size() << " keys"
14955 for (map
<string
,bufferptr
>::const_iterator p
= aset
.begin();
14956 p
!= aset
.end(); ++p
) {
14957 if (p
->second
.is_partial()) {
14958 auto& b
= o
->onode
.attrs
[p
->first
.c_str()] =
14959 bufferptr(p
->second
.c_str(), p
->second
.length());
14960 b
.reassign_to_mempool(mempool::mempool_bluestore_cache_meta
);
14962 auto& b
= o
->onode
.attrs
[p
->first
.c_str()] = p
->second
;
14963 b
.reassign_to_mempool(mempool::mempool_bluestore_cache_meta
);
14966 txc
->write_onode(o
);
14967 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
14968 << " " << aset
.size() << " keys"
14969 << " = " << r
<< dendl
;
14974 int BlueStore::_rmattr(TransContext
*txc
,
14977 const string
& name
)
14979 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
14980 << " " << name
<< dendl
;
14982 auto it
= o
->onode
.attrs
.find(name
.c_str());
14983 if (it
== o
->onode
.attrs
.end())
14986 o
->onode
.attrs
.erase(it
);
14987 txc
->write_onode(o
);
14990 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
14991 << " " << name
<< " = " << r
<< dendl
;
14995 int BlueStore::_rmattrs(TransContext
*txc
,
14999 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
15002 if (o
->onode
.attrs
.empty())
15005 o
->onode
.attrs
.clear();
15006 txc
->write_onode(o
);
15009 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
15013 void BlueStore::_do_omap_clear(TransContext
*txc
, OnodeRef
& o
)
15015 const string
& omap_prefix
= o
->get_omap_prefix();
15016 string prefix
, tail
;
15017 o
->get_omap_header(&prefix
);
15018 o
->get_omap_tail(&tail
);
15019 txc
->t
->rm_range_keys(omap_prefix
, prefix
, tail
);
15020 txc
->t
->rmkey(omap_prefix
, tail
);
15021 dout(20) << __func__
<< " remove range start: "
15022 << pretty_binary_string(prefix
) << " end: "
15023 << pretty_binary_string(tail
) << dendl
;
15026 int BlueStore::_omap_clear(TransContext
*txc
,
15030 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
15032 if (o
->onode
.has_omap()) {
15034 _do_omap_clear(txc
, o
);
15035 o
->onode
.clear_omap_flag();
15036 txc
->write_onode(o
);
15038 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
15042 int BlueStore::_omap_setkeys(TransContext
*txc
,
15047 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
15049 auto p
= bl
.cbegin();
15051 if (!o
->onode
.has_omap()) {
15052 if (o
->oid
.is_pgmeta()) {
15053 o
->onode
.set_omap_flags_pgmeta();
15055 o
->onode
.set_omap_flags(per_pool_omap
== OMAP_BULK
);
15057 txc
->write_onode(o
);
15059 const string
& prefix
= o
->get_omap_prefix();
15062 o
->get_omap_tail(&key_tail
);
15063 txc
->t
->set(prefix
, key_tail
, tail
);
15065 txc
->note_modified_object(o
);
15067 const string
& prefix
= o
->get_omap_prefix();
15069 o
->get_omap_key(string(), &final_key
);
15070 size_t base_key_len
= final_key
.size();
15077 final_key
.resize(base_key_len
); // keep prefix
15079 dout(20) << __func__
<< " " << pretty_binary_string(final_key
)
15080 << " <- " << key
<< dendl
;
15081 txc
->t
->set(prefix
, final_key
, value
);
15084 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
15088 int BlueStore::_omap_setheader(TransContext
*txc
,
15093 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
15096 if (!o
->onode
.has_omap()) {
15097 if (o
->oid
.is_pgmeta()) {
15098 o
->onode
.set_omap_flags_pgmeta();
15100 o
->onode
.set_omap_flags(per_pool_omap
== OMAP_BULK
);
15102 txc
->write_onode(o
);
15104 const string
& prefix
= o
->get_omap_prefix();
15107 o
->get_omap_tail(&key_tail
);
15108 txc
->t
->set(prefix
, key_tail
, tail
);
15110 txc
->note_modified_object(o
);
15112 const string
& prefix
= o
->get_omap_prefix();
15113 o
->get_omap_header(&key
);
15114 txc
->t
->set(prefix
, key
, bl
);
15116 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
15120 int BlueStore::_omap_rmkeys(TransContext
*txc
,
15125 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
15127 auto p
= bl
.cbegin();
15131 if (!o
->onode
.has_omap()) {
15135 const string
& prefix
= o
->get_omap_prefix();
15136 o
->get_omap_key(string(), &final_key
);
15137 size_t base_key_len
= final_key
.size();
15142 final_key
.resize(base_key_len
); // keep prefix
15144 dout(20) << __func__
<< " rm " << pretty_binary_string(final_key
)
15145 << " <- " << key
<< dendl
;
15146 txc
->t
->rmkey(prefix
, final_key
);
15149 txc
->note_modified_object(o
);
15152 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
15156 int BlueStore::_omap_rmkey_range(TransContext
*txc
,
15159 const string
& first
, const string
& last
)
15161 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
15162 string key_first
, key_last
;
15164 if (!o
->onode
.has_omap()) {
15168 const string
& prefix
= o
->get_omap_prefix();
15170 o
->get_omap_key(first
, &key_first
);
15171 o
->get_omap_key(last
, &key_last
);
15172 txc
->t
->rm_range_keys(prefix
, key_first
, key_last
);
15173 dout(20) << __func__
<< " remove range start: "
15174 << pretty_binary_string(key_first
) << " end: "
15175 << pretty_binary_string(key_last
) << dendl
;
15177 txc
->note_modified_object(o
);
15180 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
15184 int BlueStore::_set_alloc_hint(
15188 uint64_t expected_object_size
,
15189 uint64_t expected_write_size
,
15192 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
15193 << " object_size " << expected_object_size
15194 << " write_size " << expected_write_size
15195 << " flags " << ceph_osd_alloc_hint_flag_string(flags
)
15198 o
->onode
.expected_object_size
= expected_object_size
;
15199 o
->onode
.expected_write_size
= expected_write_size
;
15200 o
->onode
.alloc_hint_flags
= flags
;
15201 txc
->write_onode(o
);
15202 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
15203 << " object_size " << expected_object_size
15204 << " write_size " << expected_write_size
15205 << " flags " << ceph_osd_alloc_hint_flag_string(flags
)
15206 << " = " << r
<< dendl
;
15210 int BlueStore::_clone(TransContext
*txc
,
15215 dout(15) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
15216 << newo
->oid
<< dendl
;
15218 if (oldo
->oid
.hobj
.get_hash() != newo
->oid
.hobj
.get_hash()) {
15219 derr
<< __func__
<< " mismatched hash on " << oldo
->oid
15220 << " and " << newo
->oid
<< dendl
;
15224 _assign_nid(txc
, newo
);
15228 _do_truncate(txc
, c
, newo
, 0);
15229 if (cct
->_conf
->bluestore_clone_cow
) {
15230 _do_clone_range(txc
, c
, oldo
, newo
, 0, oldo
->onode
.size
, 0);
15233 r
= _do_read(c
.get(), oldo
, 0, oldo
->onode
.size
, bl
, 0);
15236 r
= _do_write(txc
, c
, newo
, 0, oldo
->onode
.size
, bl
, 0);
15242 newo
->onode
.attrs
= oldo
->onode
.attrs
;
15245 if (newo
->onode
.has_omap()) {
15246 dout(20) << __func__
<< " clearing old omap data" << dendl
;
15248 _do_omap_clear(txc
, newo
);
15249 newo
->onode
.clear_omap_flag();
15251 if (oldo
->onode
.has_omap()) {
15252 dout(20) << __func__
<< " copying omap data" << dendl
;
15253 if (newo
->oid
.is_pgmeta()) {
15254 newo
->onode
.set_omap_flags_pgmeta();
15256 newo
->onode
.set_omap_flags(per_pool_omap
== OMAP_BULK
);
15258 const string
& prefix
= newo
->get_omap_prefix();
15259 KeyValueDB::Iterator it
= db
->get_iterator(prefix
);
15261 oldo
->get_omap_header(&head
);
15262 oldo
->get_omap_tail(&tail
);
15263 it
->lower_bound(head
);
15264 while (it
->valid()) {
15265 if (it
->key() >= tail
) {
15266 dout(30) << __func__
<< " reached tail" << dendl
;
15269 dout(30) << __func__
<< " got header/data "
15270 << pretty_binary_string(it
->key()) << dendl
;
15272 newo
->rewrite_omap_key(it
->key(), &key
);
15273 txc
->t
->set(prefix
, key
, it
->value());
15278 bufferlist new_tail_value
;
15279 newo
->get_omap_tail(&new_tail
);
15280 txc
->t
->set(prefix
, new_tail
, new_tail_value
);
15283 txc
->write_onode(newo
);
15287 dout(10) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
15288 << newo
->oid
<< " = " << r
<< dendl
;
15292 int BlueStore::_do_clone_range(
15301 dout(15) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
15303 << " 0x" << std::hex
<< srcoff
<< "~" << length
<< " -> "
15304 << " 0x" << dstoff
<< "~" << length
<< std::dec
<< dendl
;
15305 oldo
->extent_map
.fault_range(db
, srcoff
, length
);
15306 newo
->extent_map
.fault_range(db
, dstoff
, length
);
15307 _dump_onode
<30>(cct
, *oldo
);
15308 _dump_onode
<30>(cct
, *newo
);
15310 oldo
->extent_map
.dup(this, txc
, c
, oldo
, newo
, srcoff
, length
, dstoff
);
15311 _dump_onode
<30>(cct
, *oldo
);
15312 _dump_onode
<30>(cct
, *newo
);
15316 int BlueStore::_clone_range(TransContext
*txc
,
15320 uint64_t srcoff
, uint64_t length
, uint64_t dstoff
)
15322 dout(15) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
15323 << newo
->oid
<< " from 0x" << std::hex
<< srcoff
<< "~" << length
15324 << " to offset 0x" << dstoff
<< std::dec
<< dendl
;
15327 if (srcoff
+ length
>= OBJECT_MAX_SIZE
||
15328 dstoff
+ length
>= OBJECT_MAX_SIZE
) {
15332 if (srcoff
+ length
> oldo
->onode
.size
) {
15337 _assign_nid(txc
, newo
);
15340 if (cct
->_conf
->bluestore_clone_cow
) {
15341 _do_zero(txc
, c
, newo
, dstoff
, length
);
15342 _do_clone_range(txc
, c
, oldo
, newo
, srcoff
, length
, dstoff
);
15345 r
= _do_read(c
.get(), oldo
, srcoff
, length
, bl
, 0);
15348 r
= _do_write(txc
, c
, newo
, dstoff
, bl
.length(), bl
, 0);
15354 txc
->write_onode(newo
);
15358 dout(10) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
15359 << newo
->oid
<< " from 0x" << std::hex
<< srcoff
<< "~" << length
15360 << " to offset 0x" << dstoff
<< std::dec
15361 << " = " << r
<< dendl
;
15365 int BlueStore::_rename(TransContext
*txc
,
15369 const ghobject_t
& new_oid
)
15371 dout(15) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
15372 << new_oid
<< dendl
;
15374 ghobject_t old_oid
= oldo
->oid
;
15375 mempool::bluestore_cache_meta::string new_okey
;
15378 if (newo
->exists
) {
15382 ceph_assert(txc
->onodes
.count(newo
) == 0);
15385 txc
->t
->rmkey(PREFIX_OBJ
, oldo
->key
.c_str(), oldo
->key
.size());
15389 oldo
->extent_map
.fault_range(db
, 0, oldo
->onode
.size
);
15390 get_object_key(cct
, new_oid
, &new_okey
);
15392 for (auto &s
: oldo
->extent_map
.shards
) {
15393 generate_extent_shard_key_and_apply(oldo
->key
, s
.shard_info
->offset
, &key
,
15394 [&](const string
& final_key
) {
15395 txc
->t
->rmkey(PREFIX_OBJ
, final_key
);
15403 txc
->write_onode(newo
);
15405 // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
15406 // Onode in the old slot
15407 c
->onode_map
.rename(oldo
, old_oid
, new_oid
, new_okey
);
15410 // hold a ref to new Onode in old name position, to ensure we don't drop
15411 // it from the cache before this txc commits (or else someone may come along
15412 // and read newo's metadata via the old name).
15413 txc
->note_modified_object(oldo
);
15416 dout(10) << __func__
<< " " << c
->cid
<< " " << old_oid
<< " -> "
15417 << new_oid
<< " = " << r
<< dendl
;
15423 int BlueStore::_create_collection(
15429 dout(15) << __func__
<< " " << cid
<< " bits " << bits
<< dendl
;
15434 std::unique_lock
l(coll_lock
);
15439 auto p
= new_coll_map
.find(cid
);
15440 ceph_assert(p
!= new_coll_map
.end());
15442 (*c
)->cnode
.bits
= bits
;
15443 coll_map
[cid
] = *c
;
15444 new_coll_map
.erase(p
);
15446 encode((*c
)->cnode
, bl
);
15447 txc
->t
->set(PREFIX_COLL
, stringify(cid
), bl
);
15451 dout(10) << __func__
<< " " << cid
<< " bits " << bits
<< " = " << r
<< dendl
;
15455 int BlueStore::_remove_collection(TransContext
*txc
, const coll_t
&cid
,
15458 dout(15) << __func__
<< " " << cid
<< dendl
;
15461 (*c
)->flush_all_but_last();
15463 std::unique_lock
l(coll_lock
);
15468 size_t nonexistent_count
= 0;
15469 ceph_assert((*c
)->exists
);
15470 if ((*c
)->onode_map
.map_any([&](Onode
* o
) {
15472 dout(1) << __func__
<< " " << o
->oid
<< " " << o
15473 << " exists in onode_map" << dendl
;
15476 ++nonexistent_count
;
15482 vector
<ghobject_t
> ls
;
15484 // Enumerate onodes in db, up to nonexistent_count + 1
15485 // then check if all of them are marked as non-existent.
15486 // Bypass the check if (next != ghobject_t::get_max())
15487 r
= _collection_list(c
->get(), ghobject_t(), ghobject_t::get_max(),
15488 nonexistent_count
+ 1, false, &ls
, &next
);
15490 // If true mean collecton has more objects than nonexistent_count,
15491 // so bypass check.
15492 bool exists
= (!next
.is_max());
15493 for (auto it
= ls
.begin(); !exists
&& it
< ls
.end(); ++it
) {
15494 dout(10) << __func__
<< " oid " << *it
<< dendl
;
15495 auto onode
= (*c
)->onode_map
.lookup(*it
);
15496 exists
= !onode
|| onode
->exists
;
15498 dout(1) << __func__
<< " " << *it
15499 << " exists in db, "
15500 << (!onode
? "not present in ram" : "present in ram")
15505 _do_remove_collection(txc
, c
);
15508 dout(10) << __func__
<< " " << cid
15509 << " is non-empty" << dendl
;
15515 dout(10) << __func__
<< " " << cid
<< " = " << r
<< dendl
;
15519 void BlueStore::_do_remove_collection(TransContext
*txc
,
15522 coll_map
.erase((*c
)->cid
);
15523 txc
->removed_collections
.push_back(*c
);
15524 (*c
)->exists
= false;
15525 _osr_register_zombie((*c
)->osr
.get());
15526 txc
->t
->rmkey(PREFIX_COLL
, stringify((*c
)->cid
));
15530 int BlueStore::_split_collection(TransContext
*txc
,
15533 unsigned bits
, int rem
)
15535 dout(15) << __func__
<< " " << c
->cid
<< " to " << d
->cid
<< " "
15536 << " bits " << bits
<< dendl
;
15537 std::unique_lock
l(c
->lock
);
15538 std::unique_lock
l2(d
->lock
);
15541 // flush all previous deferred writes on this sequencer. this is a bit
15542 // heavyweight, but we need to make sure all deferred writes complete
15543 // before we split as the new collection's sequencer may need to order
15544 // this after those writes, and we don't bother with the complexity of
15545 // moving those TransContexts over to the new osr.
15546 _osr_drain_preceding(txc
);
15548 // move any cached items (onodes and referenced shared blobs) that will
15549 // belong to the child collection post-split. leave everything else behind.
15550 // this may include things that don't strictly belong to the now-smaller
15551 // parent split, but the OSD will always send us a split for every new
15554 spg_t pgid
, dest_pgid
;
15555 bool is_pg
= c
->cid
.is_pg(&pgid
);
15556 ceph_assert(is_pg
);
15557 is_pg
= d
->cid
.is_pg(&dest_pgid
);
15558 ceph_assert(is_pg
);
15560 // the destination should initially be empty.
15561 ceph_assert(d
->onode_map
.empty());
15562 ceph_assert(d
->shared_blob_set
.empty());
15563 ceph_assert(d
->cnode
.bits
== bits
);
15565 c
->split_cache(d
.get());
15567 // adjust bits. note that this will be redundant for all but the first
15568 // split call for this parent (first child).
15569 c
->cnode
.bits
= bits
;
15570 ceph_assert(d
->cnode
.bits
== bits
);
15574 encode(c
->cnode
, bl
);
15575 txc
->t
->set(PREFIX_COLL
, stringify(c
->cid
), bl
);
15577 dout(10) << __func__
<< " " << c
->cid
<< " to " << d
->cid
<< " "
15578 << " bits " << bits
<< " = " << r
<< dendl
;
15582 int BlueStore::_merge_collection(
15588 dout(15) << __func__
<< " " << (*c
)->cid
<< " to " << d
->cid
15589 << " bits " << bits
<< dendl
;
15590 std::unique_lock
l((*c
)->lock
);
15591 std::unique_lock
l2(d
->lock
);
15594 coll_t cid
= (*c
)->cid
;
15596 // flush all previous deferred writes on the source collection to ensure
15597 // that all deferred writes complete before we merge as the target collection's
15598 // sequencer may need to order new ops after those writes.
15600 _osr_drain((*c
)->osr
.get());
15602 // move any cached items (onodes and referenced shared blobs) that will
15603 // belong to the child collection post-split. leave everything else behind.
15604 // this may include things that don't strictly belong to the now-smaller
15605 // parent split, but the OSD will always send us a split for every new
15608 spg_t pgid
, dest_pgid
;
15609 bool is_pg
= cid
.is_pg(&pgid
);
15610 ceph_assert(is_pg
);
15611 is_pg
= d
->cid
.is_pg(&dest_pgid
);
15612 ceph_assert(is_pg
);
15614 // adjust bits. note that this will be redundant for all but the first
15615 // merge call for the parent/target.
15616 d
->cnode
.bits
= bits
;
15618 // behavior depends on target (d) bits, so this after that is updated.
15619 (*c
)->split_cache(d
.get());
15621 // remove source collection
15623 std::unique_lock
l3(coll_lock
);
15624 _do_remove_collection(txc
, c
);
15630 encode(d
->cnode
, bl
);
15631 txc
->t
->set(PREFIX_COLL
, stringify(d
->cid
), bl
);
15633 dout(10) << __func__
<< " " << cid
<< " to " << d
->cid
<< " "
15634 << " bits " << bits
<< " = " << r
<< dendl
;
15638 void BlueStore::log_latency(
15641 const ceph::timespan
& l
,
15642 double lat_threshold
,
15643 const char* info
) const
15645 logger
->tinc(idx
, l
);
15646 if (lat_threshold
> 0.0 &&
15647 l
>= make_timespan(lat_threshold
)) {
15648 dout(0) << __func__
<< " slow operation observed for " << name
15649 << ", latency = " << l
15655 void BlueStore::log_latency_fn(
15658 const ceph::timespan
& l
,
15659 double lat_threshold
,
15660 std::function
<string (const ceph::timespan
& lat
)> fn
) const
15662 logger
->tinc(idx
, l
);
15663 if (lat_threshold
> 0.0 &&
15664 l
>= make_timespan(lat_threshold
)) {
15665 dout(0) << __func__
<< " slow operation observed for " << name
15666 << ", latency = " << l
#if defined(WITH_LTTNG)
// Account the transaction's I/Os in the pending counters and, when tracing is
// sampled on, snapshot throttle state and a set of rocksdb properties into
// the transaction_initial_state(_rocksdb) tracepoints.
void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  pending_kv_ios += txc.ios;
  if (txc.deferred_txn) {
    pending_deferred_ios += txc.ios;
  }

  uint64_t started = 0;
  uint64_t completed = 0;
  if (should_trace(&started, &completed)) {
    txc.tracing = true;
    uint64_t rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate;

    // NOTE(review): reconstructed property-fetch calls — confirm the exact
    // accessor used upstream for these rocksdb.* integer properties.
    db.get_property(
      "rocksdb.base-level",
      &rocksdb_base_level);
    db.get_property(
      "rocksdb.estimate-pending-compaction-bytes",
      &rocksdb_estimate_pending_compaction_bytes);
    db.get_property(
      "rocksdb.cur-size-all-mem-tables",
      &rocksdb_cur_size_all_mem_tables);
    db.get_property(
      "rocksdb.compaction-pending",
      &rocksdb_compaction_pending);
    db.get_property(
      "rocksdb.mem-table-flush-pending",
      &rocksdb_mem_table_flush_pending);
    db.get_property(
      "rocksdb.num-running-compactions",
      &rocksdb_num_running_compactions);
    db.get_property(
      "rocksdb.num-running-flushes",
      &rocksdb_num_running_flushes);
    db.get_property(
      "rocksdb.actual-delayed-write-rate",
      &rocksdb_actual_delayed_write_rate);

    tracepoint(
      bluestore,
      transaction_initial_state,
      txc.osr->get_sequencer_id(),
      txc.seq,
      throttle_bytes.get_current(),
      throttle_deferred_bytes.get_current(),
      pending_kv_ios,
      pending_deferred_ios,
      started,
      completed,
      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));

    tracepoint(
      bluestore,
      transaction_initial_state_rocksdb,
      txc.osr->get_sequencer_id(),
      txc.seq,
      rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate);
  }
}
#endif
15751 mono_clock::duration
BlueStore::BlueStoreThrottle::log_state_latency(
15752 TransContext
&txc
, PerfCounters
*logger
, int state
)
15754 mono_clock::time_point now
= mono_clock::now();
15755 mono_clock::duration lat
= now
- txc
.last_stamp
;
15756 logger
->tinc(state
, lat
);
15757 #if defined(WITH_LTTNG)
15759 state
>= l_bluestore_state_prepare_lat
&&
15760 state
<= l_bluestore_state_done_lat
) {
15761 OID_ELAPSED("", lat
.to_nsec() / 1000.0, txc
.get_state_latency_name(state
));
15764 transaction_state_duration
,
15765 txc
.osr
->get_sequencer_id(),
15768 ceph::to_seconds
<double>(lat
));
15771 txc
.last_stamp
= now
;
15775 bool BlueStore::BlueStoreThrottle::try_start_transaction(
15778 mono_clock::time_point start_throttle_acquire
)
15780 throttle_bytes
.get(txc
.cost
);
15782 if (!txc
.deferred_txn
|| throttle_deferred_bytes
.get_or_fail(txc
.cost
)) {
15783 emit_initial_tracepoint(db
, txc
, start_throttle_acquire
);
15790 void BlueStore::BlueStoreThrottle::finish_start_transaction(
15793 mono_clock::time_point start_throttle_acquire
)
15795 ceph_assert(txc
.deferred_txn
);
15796 throttle_deferred_bytes
.get(txc
.cost
);
15797 emit_initial_tracepoint(db
, txc
, start_throttle_acquire
);
#if defined(WITH_LTTNG)
// KV commit completed for `txc`: decrement pending KV I/Os and, when the txc
// is being traced, emit the commit-latency tracepoint.
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(mono_clock::now() - txc.start));
  }
}
#endif
#if defined(WITH_LTTNG)
// Transaction fully completed: decrement deferred-I/O accounting and, when
// traced, emit the total-duration tracepoint.
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(lat));
  }
}
#endif
15835 // DB key value Histogram
15836 #define KEY_SLAB 32
15837 #define VALUE_SLAB 64
15839 const string prefix_onode
= "o";
15840 const string prefix_onode_shard
= "x";
15841 const string prefix_other
= "Z";
15843 int BlueStore::DBHistogram::get_key_slab(size_t sz
)
15845 return (sz
/KEY_SLAB
);
15848 string
BlueStore::DBHistogram::get_key_slab_to_range(int slab
)
15850 int lower_bound
= slab
* KEY_SLAB
;
15851 int upper_bound
= (slab
+ 1) * KEY_SLAB
;
15852 string ret
= "[" + stringify(lower_bound
) + "," + stringify(upper_bound
) + ")";
15856 int BlueStore::DBHistogram::get_value_slab(size_t sz
)
15858 return (sz
/VALUE_SLAB
);
15861 string
BlueStore::DBHistogram::get_value_slab_to_range(int slab
)
15863 int lower_bound
= slab
* VALUE_SLAB
;
15864 int upper_bound
= (slab
+ 1) * VALUE_SLAB
;
15865 string ret
= "[" + stringify(lower_bound
) + "," + stringify(upper_bound
) + ")";
15869 void BlueStore::DBHistogram::update_hist_entry(map
<string
, map
<int, struct key_dist
> > &key_hist
,
15870 const string
&prefix
, size_t key_size
, size_t value_size
)
15872 uint32_t key_slab
= get_key_slab(key_size
);
15873 uint32_t value_slab
= get_value_slab(value_size
);
15874 key_hist
[prefix
][key_slab
].count
++;
15875 key_hist
[prefix
][key_slab
].max_len
=
15876 std::max
<size_t>(key_size
, key_hist
[prefix
][key_slab
].max_len
);
15877 key_hist
[prefix
][key_slab
].val_map
[value_slab
].count
++;
15878 key_hist
[prefix
][key_slab
].val_map
[value_slab
].max_len
=
15879 std::max
<size_t>(value_size
,
15880 key_hist
[prefix
][key_slab
].val_map
[value_slab
].max_len
);
15883 void BlueStore::DBHistogram::dump(Formatter
*f
)
15885 f
->open_object_section("rocksdb_value_distribution");
15886 for (auto i
: value_hist
) {
15887 f
->dump_unsigned(get_value_slab_to_range(i
.first
).data(), i
.second
);
15889 f
->close_section();
15891 f
->open_object_section("rocksdb_key_value_histogram");
15892 for (auto i
: key_hist
) {
15893 f
->dump_string("prefix", i
.first
);
15894 f
->open_object_section("key_hist");
15895 for ( auto k
: i
.second
) {
15896 f
->dump_unsigned(get_key_slab_to_range(k
.first
).data(), k
.second
.count
);
15897 f
->dump_unsigned("max_len", k
.second
.max_len
);
15898 f
->open_object_section("value_hist");
15899 for ( auto j
: k
.second
.val_map
) {
15900 f
->dump_unsigned(get_value_slab_to_range(j
.first
).data(), j
.second
.count
);
15901 f
->dump_unsigned("max_len", j
.second
.max_len
);
15903 f
->close_section();
15905 f
->close_section();
15907 f
->close_section();
// NOTE(review): this region is whitespace-mangled (one token per line) and
// several original lines appear to be elided (embedded numbering skips, e.g.
// the opening brace, the `hist` declaration, and the per-prefix counter
// increments such as num_super++ are not visible). Documenting in place only;
// do not restyle until the source is restored.
15910 // Iterates through the DB and collects the stats
// Walks every key/value pair via a whole-space iterator, bucketing key and
// value sizes per key prefix into `hist` (presumably a KeyValueHistogram
// declared on an elided line — TODO confirm), then dumps the aggregate
// counters through the Formatter `f`.
15911 void BlueStore::generate_db_histogram(Formatter
*f
)
// Per-prefix counters dumped at the end of the function. The increments for
// most of them are on elided lines — only num_shared_shards++ is visible below.
15914 uint64_t num_onodes
= 0;
15915 uint64_t num_shards
= 0;
15916 uint64_t num_super
= 0;
15917 uint64_t num_coll
= 0;
15918 uint64_t num_omap
= 0;
15919 uint64_t num_pgmeta_omap
= 0;
15920 uint64_t num_deferred
= 0;
15921 uint64_t num_alloc
= 0;
15922 uint64_t num_stat
= 0;
15923 uint64_t num_others
= 0;
15924 uint64_t num_shared_shards
= 0;
// Running size statistics across all visited keys/values.
15925 size_t max_key_size
=0, max_value_size
= 0;
15926 uint64_t total_key_size
= 0, total_value_size
= 0;
15927 size_t key_size
= 0, value_size
= 0;
// Wall-clock (coarse) timing of the full scan, reported at the end.
15930 auto start
= coarse_mono_clock::now();
15932 KeyValueDB::WholeSpaceIterator iter
= db
->get_wholespace_iterator();
15933 iter
->seek_to_first();
15934 while (iter
->valid()) {
15935 dout(30) << __func__
<< " Key: " << iter
->key() << dendl
;
15936 key_size
= iter
->key_size();
15937 value_size
= iter
->value_size();
// Bucket the value size into its histogram slab.
15938 hist
.value_hist
[hist
.get_value_slab(value_size
)]++;
15939 max_key_size
= std::max(max_key_size
, key_size
);
15940 max_value_size
= std::max(max_value_size
, value_size
);
15941 total_key_size
+= key_size
;
15942 total_value_size
+= value_size
;
// raw_key() yields (prefix, key); dispatch on the prefix below.
15944 pair
<string
,string
> key(iter
->raw_key());
15946 if (key
.first
== PREFIX_SUPER
) {
15947 hist
.update_hist_entry(hist
.key_hist
, PREFIX_SUPER
, key_size
, value_size
);
15949 } else if (key
.first
== PREFIX_STAT
) {
15950 hist
.update_hist_entry(hist
.key_hist
, PREFIX_STAT
, key_size
, value_size
);
15952 } else if (key
.first
== PREFIX_COLL
) {
15953 hist
.update_hist_entry(hist
.key_hist
, PREFIX_COLL
, key_size
, value_size
);
15955 } else if (key
.first
== PREFIX_OBJ
) {
// Object keys ending in ONODE_KEY_SUFFIX are onodes; otherwise they are
// extent-map shards (the else branch's header line is elided here).
15956 if (key
.second
.back() == ONODE_KEY_SUFFIX
) {
15957 hist
.update_hist_entry(hist
.key_hist
, prefix_onode
, key_size
, value_size
);
15960 hist
.update_hist_entry(hist
.key_hist
, prefix_onode_shard
, key_size
, value_size
);
15963 } else if (key
.first
== PREFIX_OMAP
) {
15964 hist
.update_hist_entry(hist
.key_hist
, PREFIX_OMAP
, key_size
, value_size
);
15966 } else if (key
.first
== PREFIX_PERPOOL_OMAP
) {
15967 hist
.update_hist_entry(hist
.key_hist
, PREFIX_PERPOOL_OMAP
, key_size
, value_size
);
15969 } else if (key
.first
== PREFIX_PERPG_OMAP
) {
15970 hist
.update_hist_entry(hist
.key_hist
, PREFIX_PERPG_OMAP
, key_size
, value_size
);
15972 } else if (key
.first
== PREFIX_PGMETA_OMAP
) {
15973 hist
.update_hist_entry(hist
.key_hist
, PREFIX_PGMETA_OMAP
, key_size
, value_size
);
15975 } else if (key
.first
== PREFIX_DEFERRED
) {
15976 hist
.update_hist_entry(hist
.key_hist
, PREFIX_DEFERRED
, key_size
, value_size
);
// Both allocator prefixes are accounted under PREFIX_ALLOC.
15978 } else if (key
.first
== PREFIX_ALLOC
|| key
.first
== PREFIX_ALLOC_BITMAP
) {
15979 hist
.update_hist_entry(hist
.key_hist
, PREFIX_ALLOC
, key_size
, value_size
);
15981 } else if (key
.first
== PREFIX_SHARED_BLOB
) {
15982 hist
.update_hist_entry(hist
.key_hist
, PREFIX_SHARED_BLOB
, key_size
, value_size
);
15983 num_shared_shards
++;
// Fallback bucket for any unrecognized prefix (the `} else {` line is elided).
15985 hist
.update_hist_entry(hist
.key_hist
, prefix_other
, key_size
, value_size
);
15991 ceph::timespan duration
= coarse_mono_clock::now() - start
;
// Emit the collected counters as a formatter object section.
15992 f
->open_object_section("rocksdb_key_value_stats");
15993 f
->dump_unsigned("num_onodes", num_onodes
);
15994 f
->dump_unsigned("num_shards", num_shards
);
15995 f
->dump_unsigned("num_super", num_super
);
15996 f
->dump_unsigned("num_coll", num_coll
);
15997 f
->dump_unsigned("num_omap", num_omap
);
15998 f
->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap
);
15999 f
->dump_unsigned("num_deferred", num_deferred
);
16000 f
->dump_unsigned("num_alloc", num_alloc
);
16001 f
->dump_unsigned("num_stat", num_stat
);
16002 f
->dump_unsigned("num_shared_shards", num_shared_shards
);
16003 f
->dump_unsigned("num_others", num_others
);
16004 f
->dump_unsigned("max_key_size", max_key_size
);
16005 f
->dump_unsigned("max_value_size", max_value_size
);
16006 f
->dump_unsigned("total_key_size", total_key_size
);
16007 f
->dump_unsigned("total_value_size", total_value_size
);
16008 f
->close_section();
// NOTE(review): the message says "seconds" but `duration` is a ceph::timespan
// streamed directly — verify the unit claim against timespan's operator<<.
16012 dout(20) << __func__
<< " finished in " << duration
<< " seconds" << dendl
;
// Tears down the in-memory caches at shutdown: asserts the buffer cache
// shards are already empty, clears each collection's onode map, reports (and
// asserts on) any stray shared blobs, then asserts the onode cache shards are
// empty. NOTE(review): region is whitespace-mangled; the lines that presumably
// flush/clear the shards before the asserts are elided (numbering skips).
16016 void BlueStore::_shutdown_cache()
16018 dout(10) << __func__
<< dendl
;
16019 for (auto i
: buffer_cache_shards
) {
16021 ceph_assert(i
->empty());
16023 for (auto& p
: coll_map
) {
16024 p
.second
->onode_map
.clear();
// A non-empty shared_blob_set at shutdown indicates leaked references;
// dump it for diagnosis before the assert below fires.
16025 if (!p
.second
->shared_blob_set
.empty()) {
16026 derr
<< __func__
<< " stray shared blobs on " << p
.first
<< dendl
;
16027 p
.second
->shared_blob_set
.dump
<0>(cct
);
16029 ceph_assert(p
.second
->onode_map
.empty());
16030 ceph_assert(p
.second
->shared_blob_set
.empty());
16033 for (auto i
: onode_cache_shards
) {
16034 ceph_assert(i
->empty());
16038 // For external caller.
16039 // We use a best-effort policy instead, e.g.,
16040 // we don't care if there are still some pinned onodes/data in the cache
16041 // after this command is completed.
// Best-effort cache flush for an external admin command; `os` is the output
// stream for any status text. NOTE(review): the per-shard flush calls inside
// both loops and the return statement are elided in this mangled source
// (numbering jumps 16045->16048 and past 16049) — confirm against upstream
// before assuming what each loop body does.
16042 int BlueStore::flush_cache(ostream
*os
)
16044 dout(10) << __func__
<< dendl
;
16045 for (auto i
: onode_cache_shards
) {
16048 for (auto i
: buffer_cache_shards
) {
// Pads a write buffer to block alignment: prepends `head_pad` zero bytes and
// appends `tail_pad` zero bytes to `padded`, then accounts the padding in the
// perf counter. NOTE(review): the `tail_pad` parameter line and the guards
// around prepend/append (likely `if (head_pad)` / `if (tail_pad)`) are elided
// in this mangled source.
16055 void BlueStore::_apply_padding(uint64_t head_pad
,
16057 bufferlist
& padded
)
16060 padded
.prepend_zero(head_pad
);
16063 padded
.append_zero(tail_pad
);
16065 if (head_pad
|| tail_pad
) {
16066 dout(20) << __func__
<< " can pad head 0x" << std::hex
<< head_pad
16067 << " tail 0x" << tail_pad
<< std::dec
<< dendl
;
// Track total pad bytes added to writes.
16068 logger
->inc(l_bluestore_write_pad_bytes
, head_pad
+ tail_pad
);
// Serializes onode `o` (metadata + spanning blobs + optional inline extent
// map) into a bufferlist and records it in `txn` under PREFIX_OBJ.
// Resharding is attempted once if the extent map asks for it.
// NOTE(review): mangled source; the `bound`/`bl` declarations and the
// closing of the dout statement are on elided lines.
16072 void BlueStore::_record_onode(OnodeRef
&o
, KeyValueDB::Transaction
&txn
)
16074 // finalize extent_map shards
16075 o
->extent_map
.update(txn
, false);
16076 if (o
->extent_map
.needs_reshard()) {
16077 o
->extent_map
.reshard(db
, txn
);
// Second pass with force=true after resharding.
16078 o
->extent_map
.update(txn
, true);
// If it still wants a reshard we give up for this commit and just log.
16079 if (o
->extent_map
.needs_reshard()) {
16080 dout(20) << __func__
<< " warning: still wants reshard, check options?"
16082 o
->extent_map
.clear_needs_reshard();
16084 logger
->inc(l_bluestore_onode_reshard
);
// First compute an upper bound on the encoded size (`bound`, declared on an
// elided line), so one contiguous appender allocation suffices below.
16089 denc(o
->onode
, bound
);
16090 o
->extent_map
.bound_encode_spanning_blobs(bound
);
// No shards => the whole extent map is stored inline with the onode.
16091 if (o
->onode
.extent_map_shards
.empty()) {
16092 denc(o
->extent_map
.inline_bl
, bound
);
// Byte counts of each encoded component, for the debug breakdown below.
16097 unsigned onode_part
, blob_part
, extent_part
;
16099 auto p
= bl
.get_contiguous_appender(bound
, true);
16101 onode_part
= p
.get_logical_offset();
16102 o
->extent_map
.encode_spanning_blobs(p
);
16103 blob_part
= p
.get_logical_offset() - onode_part
;
16104 if (o
->onode
.extent_map_shards
.empty()) {
16105 denc(o
->extent_map
.inline_bl
, p
);
16107 extent_part
= p
.get_logical_offset() - onode_part
- blob_part
;
16110 dout(20) << __func__
<< " onode " << o
->oid
<< " is " << bl
.length()
16111 << " (" << onode_part
<< " bytes onode + "
16112 << blob_part
<< " bytes spanning blobs + "
16113 << extent_part
<< " bytes inline extents)"
// Persist the encoded onode under its precomputed key.
16117 txn
->set(PREFIX_OBJ
, o
->key
.c_str(), o
->key
.size(), bl
);
// Copies the store's currently-raised health alerts into `alerts` under
// qlock. Each non-empty alert string (some gated by config options) becomes
// one named entry. NOTE(review): the `alerts.emplace(...)` call lines are
// elided throughout this mangled source — only the alert-name string literals
// and the condition lines are visible.
16120 void BlueStore::_log_alerts(osd_alert_list_t
& alerts
)
16122 std::lock_guard
l(qlock
);
16124 if (!spurious_read_errors_alert
.empty() &&
16125 cct
->_conf
->bluestore_warn_on_spurious_read_errors
) {
16127 "BLUESTORE_SPURIOUS_READ_ERRORS",
16128 spurious_read_errors_alert
);
16130 if (!disk_size_mismatch_alert
.empty()) {
16132 "BLUESTORE_DISK_SIZE_MISMATCH",
16133 disk_size_mismatch_alert
);
16135 if (!legacy_statfs_alert
.empty()) {
16137 "BLUESTORE_LEGACY_STATFS",
16138 legacy_statfs_alert
);
16140 if (!spillover_alert
.empty() &&
16141 cct
->_conf
->bluestore_warn_on_bluefs_spillover
) {
16143 "BLUEFS_SPILLOVER",
16146 if (!no_per_pg_omap_alert
.empty()) {
16148 "BLUESTORE_NO_PER_PG_OMAP",
16149 no_per_pg_omap_alert
);
16151 if (!no_per_pool_omap_alert
.empty()) {
16153 "BLUESTORE_NO_PER_POOL_OMAP",
16154 no_per_pool_omap_alert
);
// Compression alert: start from the failed-mode text and append the list of
// compressors that could not be loaded (loop body lines elided).
16156 string
s0(failed_cmode
);
16158 if (!failed_compressors
.empty()) {
16162 s0
+= "unable to load:";
16164 for (auto& s
: failed_compressors
) {
16173 "BLUESTORE_NO_COMPRESSION",
// Accumulates per-allocation statistics: bumps the allocation count, adds the
// number of fragments (`extents`, a parameter on an elided line) and the
// requested size. The counters are atomics (see .exchange(0) in
// _record_allocation_stats), so this is safe to call concurrently.
// NOTE(review): `alloc_size` is accepted but not used in the visible lines.
16178 void BlueStore::_collect_allocation_stats(uint64_t need
, uint32_t alloc_size
,
16181 alloc_stats_count
++;
16182 alloc_stats_fragments
+= extents
;
16183 alloc_stats_size
+= need
;
// Snapshots the atomic allocation counters (resetting them to zero), logs the
// current probe, logs the retained history, then shifts the history so that
// slot i keeps a probe from roughly 2^i probes ago.
// NOTE(review): mangled source; `probe_count`'s increment, `base`'s
// declaration, and several stream-closing lines are elided.
16186 void BlueStore::_record_allocation_stats()
16188 // don't care about data consistency,
16189 // fields can be partially modified while making the tuple
// exchange(0) both reads and resets each counter atomically (individually,
// not as a set — hence the caveat above).
16190 auto t0
= std::make_tuple(
16191 alloc_stats_count
.exchange(0),
16192 alloc_stats_fragments
.exchange(0),
16193 alloc_stats_size
.exchange(0));
16195 dout(0) << " allocation stats probe "
16196 << probe_count
<< ":"
16197 << " cnt: " << std::get
<0>(t0
)
16198 << " frags: " << std::get
<1>(t0
)
16199 << " size: " << std::get
<2>(t0
)
16204 // Keep the history for probes from the power-of-two sequence:
16205 // -1, -2, -4, -8, -16
16208 for (auto& t
: alloc_stats_history
) {
16209 dout(0) << " probe -"
16210 << base
+ (probe_count
% base
) << ": "
16212 << ", " << std::get
<1>(t
)
16213 << ", " << std::get
<2>(t
)
16217 dout(0) << "------------" << dendl
;
// Shift history right at power-of-two boundaries: slot i inherits slot i-1
// when probe_count is a multiple of 2^i.
16221 for (ssize_t i
= alloc_stats_history
.size() - 1 ; i
> 0 ; --i
) {
16222 if ((probe_count
% (1 << i
)) == 0) {
16223 alloc_stats_history
[i
] = alloc_stats_history
[i
- 1];
// Slot 0 always takes the freshest probe.
16226 alloc_stats_history
[0].swap(t0
);
16229 // ===========================================
16230 // BlueStoreRepairer
// Compacts the per-granule bloom filter vectors down to only the granules
// touched by `extents`: granules overlapping a non-empty extent and holding
// at least one element are kept (moved into the reduced vectors), everything
// else is dropped. One-shot: asserts it hasn't run before, sets
// was_filtered_out, and returns the reduced size.
// NOTE(review): mangled source; pos++ inside the while loop and the
// `continue` for zero-length extents are on elided lines.
16232 size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
16233 const interval_set
<uint64_t>& extents
)
16235 ceph_assert(granularity
); // initialized
16236 // can't call for the second time
16237 ceph_assert(!was_filtered_out
);
16238 ceph_assert(collections_bfs
.size() == objects_bfs
.size());
16240 uint64_t prev_pos
= 0;
16241 uint64_t npos
= collections_bfs
.size();
16243 bloom_vector collections_reduced
;
16244 bloom_vector objects_reduced
;
// interval_set iteration yields (offset, length) pairs.
16246 for (auto e
: extents
) {
16247 if (e
.second
== 0) {
// Granule range [pos, end_pos) covered by this extent; prev_pos prevents
// revisiting granules already consumed by an earlier (sorted) extent.
16250 uint64_t pos
= max(e
.first
/ granularity
, prev_pos
);
16251 uint64_t end_pos
= 1 + (e
.first
+ e
.second
- 1) / granularity
;
16252 while (pos
!= npos
&& pos
< end_pos
) {
16253 ceph_assert( collections_bfs
[pos
].element_count() ==
16254 objects_bfs
[pos
].element_count());
// Only keep granules that actually track something.
16255 if (collections_bfs
[pos
].element_count()) {
16256 collections_reduced
.push_back(std::move(collections_bfs
[pos
]));
16257 objects_reduced
.push_back(std::move(objects_bfs
[pos
]));
16261 prev_pos
= end_pos
;
// Swap the reduced vectors into place; the old contents are discarded.
16263 collections_reduced
.swap(collections_bfs
);
16264 objects_reduced
.swap(objects_bfs
);
16265 was_filtered_out
= true;
16266 return collections_bfs
.size();
// Queues removal of (prefix, key) in the lazily-created remove_key_txn;
// actual submission happens later in apply(). Thread-safe via `lock`.
// NOTE(review): the `key` parameter line, a likely ++to_repair_cnt, and the
// return statement are elided in this mangled source.
16269 bool BlueStoreRepairer::remove_key(KeyValueDB
*db
,
16270 const string
& prefix
,
16273 std::lock_guard
l(lock
);
16274 if (!remove_key_txn
) {
16275 remove_key_txn
= db
->get_transaction();
16278 remove_key_txn
->rmkey(prefix
, key
);
// Records the per_pool_omap marker value `val` in the superblock prefix via a
// dedicated one-shot transaction (asserted to not exist yet), submitted later
// by apply(). NOTE(review): the `bl` declaration is on an elided line.
16283 void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB
*db
, int val
)
16285 std::lock_guard
l(lock
); // possibly redundant
16286 ceph_assert(fix_per_pool_omap_txn
== nullptr);
16287 fix_per_pool_omap_txn
= db
->get_transaction();
// The marker is stored as the stringified integer value.
16290 bl
.append(stringify(val
));
16291 fix_per_pool_omap_txn
->set(PREFIX_SUPER
, "per_pool_omap", bl
);
// Repairs one shared-blob record: if `bl` is non-null the record is
// (re)written, otherwise it is removed. Reuses fix_misreferences_txn when one
// is pending so both repairs land atomically; otherwise uses (and lazily
// creates) fix_shared_blob_txn. NOTE(review): the `sbid` parameter, `key`
// declaration, the `if (bl)`/`else` structure around set/rmkey, and the
// return are on elided lines in this mangled source.
16294 bool BlueStoreRepairer::fix_shared_blob(
16297 const bufferlist
* bl
)
16299 std::lock_guard
l(lock
); // possibly redundant
16300 KeyValueDB::Transaction txn
;
16301 if (fix_misreferences_txn
) { // reuse this txn
16302 txn
= fix_misreferences_txn
;
16304 if (!fix_shared_blob_txn
) {
16305 fix_shared_blob_txn
= db
->get_transaction();
16307 txn
= fix_shared_blob_txn
;
// Build the DB key for this shared blob id.
16310 get_shared_blob_key(sbid
, &key
);
16314 txn
->set(PREFIX_SHARED_BLOB
, key
, *bl
);
16316 txn
->rmkey(PREFIX_SHARED_BLOB
, key
);
// Queues a corrected statfs record: converts `new_statfs` to the volatile
// in-memory representation, encodes it, and writes it under PREFIX_STAT with
// the caller-supplied key (parameter on an elided line). Submitted later in
// apply(). NOTE(review): the `key` parameter, `bl` declaration and return are
// elided in this mangled source.
16321 bool BlueStoreRepairer::fix_statfs(KeyValueDB
*db
,
16323 const store_statfs_t
& new_statfs
)
16325 std::lock_guard
l(lock
);
16326 if (!fix_statfs_txn
) {
16327 fix_statfs_txn
= db
->get_transaction();
16329 BlueStore::volatile_statfs vstatfs
;
16330 vstatfs
= new_statfs
;
16332 vstatfs
.encode(bl
);
16334 fix_statfs_txn
->set(PREFIX_STAT
, key
, bl
);
// Repairs leaked space: returns [offset, offset+len) to the freelist manager
// inside the lazily-created fix_fm_leaked_txn, submitted later in apply().
// NOTE(review): the return statement (and likely ++to_repair_cnt) are elided
// in this mangled source.
16338 bool BlueStoreRepairer::fix_leaked(KeyValueDB
*db
,
16339 FreelistManager
* fm
,
16340 uint64_t offset
, uint64_t len
)
16342 std::lock_guard
l(lock
);
16343 if (!fix_fm_leaked_txn
) {
16344 fix_fm_leaked_txn
= db
->get_transaction();
16347 fm
->release(offset
, len
, fix_fm_leaked_txn
);
// Repairs falsely-free space (space in use but marked free): re-allocates
// [offset, offset+len) in the freelist manager inside the lazily-created
// fix_fm_false_free_txn, submitted later in apply(). Mirror image of
// fix_leaked. NOTE(review): the return statement is elided in this mangled
// source.
16350 bool BlueStoreRepairer::fix_false_free(KeyValueDB
*db
,
16351 FreelistManager
* fm
,
16352 uint64_t offset
, uint64_t len
)
16354 std::lock_guard
l(lock
);
16355 if (!fix_fm_false_free_txn
) {
16356 fix_fm_false_free_txn
= db
->get_transaction();
16359 fm
->allocate(offset
, len
, fix_fm_false_free_txn
);
// Applies a caller-provided mutation `f` against the lazily-created
// fix_onode_txn (the call to f and the return are on elided lines). Used to
// fix onode spanning-blob records; the transaction is submitted later in
// apply(). NOTE(review): the `db` parameter line is also elided here.
16363 bool BlueStoreRepairer::fix_spanning_blobs(
16365 std::function
<void(KeyValueDB::Transaction
)> f
)
16367 std::lock_guard
l(lock
);
16368 if (!fix_onode_txn
) {
16369 fix_onode_txn
= db
->get_transaction();
// Prepares misreference repair: compacts the space tracker down to the
// granules covered by misreferenced_extents (must be non-empty after
// filtering) and lazily creates fix_misreferences_txn for subsequent repairs.
// Explicitly single-threaded. NOTE(review): the return statement is elided in
// this mangled source.
16376 bool BlueStoreRepairer::preprocess_misreference(KeyValueDB
*db
)
16378 //NB: not for use in multithreading mode!!!
16379 if (misreferenced_extents
.size()) {
16380 size_t n
= space_usage_tracker
.filter_out(misreferenced_extents
);
16381 ceph_assert(n
> 0);
16382 if (!fix_misreferences_txn
) {
16383 fix_misreferences_txn
= db
->get_transaction();
// Submits every pending repair transaction synchronously, in a fixed order
// (per-pool-omap marker first, statfs last), nulling each txn afterwards;
// optionally triggers a DB compaction, and returns the number of repairs
// performed. Explicitly single-threaded. NOTE(review): the compaction call
// under need_compact and the final return of `repaired` are on elided lines.
16390 unsigned BlueStoreRepairer::apply(KeyValueDB
* db
)
16392 //NB: not for use in multithreading mode!!!
16393 if (fix_per_pool_omap_txn
) {
16394 db
->submit_transaction_sync(fix_per_pool_omap_txn
);
16395 fix_per_pool_omap_txn
= nullptr;
16397 if (fix_fm_leaked_txn
) {
16398 db
->submit_transaction_sync(fix_fm_leaked_txn
);
16399 fix_fm_leaked_txn
= nullptr;
16401 if (fix_fm_false_free_txn
) {
16402 db
->submit_transaction_sync(fix_fm_false_free_txn
);
16403 fix_fm_false_free_txn
= nullptr;
16405 if (remove_key_txn
) {
16406 db
->submit_transaction_sync(remove_key_txn
);
16407 remove_key_txn
= nullptr;
16409 if (fix_misreferences_txn
) {
16410 db
->submit_transaction_sync(fix_misreferences_txn
);
16411 fix_misreferences_txn
= nullptr;
16413 if (fix_onode_txn
) {
16414 db
->submit_transaction_sync(fix_onode_txn
);
16415 fix_onode_txn
= nullptr;
16417 if (fix_shared_blob_txn
) {
16418 db
->submit_transaction_sync(fix_shared_blob_txn
);
16419 fix_shared_blob_txn
= nullptr;
16422 if (fix_statfs_txn
) {
16423 db
->submit_transaction_sync(fix_statfs_txn
);
16424 fix_statfs_txn
= nullptr;
16426 if (need_compact
) {
16428 need_compact
= false;
// Snapshot of the repair counter to report to the caller.
16430 unsigned repaired
= to_repair_cnt
;
16435 // =======================================================
16436 // RocksDBBlueFSVolumeSelector
// Chooses the preferred BlueFS device for a file whose level hint `h` was
// produced by get_hint_by_dir/get_hint_by_type. For SLOW-level data it may
// promote to BDEV_DB when the estimated headroom on the DB device (total
// minus the maximum observed DB/WAL/LOG usage) exceeds what SLOW data already
// occupies there. NOTE(review): mangled source; the switch/case structure on
// `hint`, the `res` declaration, the second min() argument (db_avail4slow?
// TODO confirm) and the return are on elided lines.
16438 uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h
) {
16439 ceph_assert(h
!= nullptr);
// The hint is a small level constant smuggled through a void*.
16440 uint64_t hint
= reinterpret_cast<uint64_t>(h
);
// Default for SLOW-level data; possibly upgraded to BDEV_DB below.
16444 res
= BlueFS::BDEV_SLOW
;
16445 if (db_avail4slow
> 0) {
16446 // considering statically available db space vs.
16447 // - observed maximums on DB dev for DB/WAL/UNSORTED data
16448 // - observed maximum spillovers
16449 uint64_t max_db_use
= 0; // max db usage we potentially observed
16450 max_db_use
+= per_level_per_dev_max
.at(BlueFS::BDEV_DB
, LEVEL_LOG
- LEVEL_FIRST
);
16451 max_db_use
+= per_level_per_dev_max
.at(BlueFS::BDEV_DB
, LEVEL_WAL
- LEVEL_FIRST
);
16452 max_db_use
+= per_level_per_dev_max
.at(BlueFS::BDEV_DB
, LEVEL_DB
- LEVEL_FIRST
);
16453 // this could go to db hence using it in the estimation
16454 max_db_use
+= per_level_per_dev_max
.at(BlueFS::BDEV_SLOW
, LEVEL_DB
- LEVEL_FIRST
);
16456 auto db_total
= l_totals
[LEVEL_DB
- LEVEL_FIRST
];
// Headroom = db_total - max_db_use, clamped at 0 (unsigned underflow guard).
16457 uint64_t avail
= min(
16459 max_db_use
< db_total
? db_total
- max_db_use
: 0);
16461 // considering current DB dev usage for SLOW data
16462 if (avail
> per_level_per_dev_usage
.at(BlueFS::BDEV_DB
, LEVEL_SLOW
- LEVEL_FIRST
)) {
16463 res
= BlueFS::BDEV_DB
;
// Remaining hint levels (case labels elided): WAL data -> BDEV_WAL,
// everything else -> BDEV_DB.
16469 res
= BlueFS::BDEV_WAL
;
16473 res
= BlueFS::BDEV_DB
;
// Reports the RocksDB data paths and their capacities: `base` backed by the
// DB-level total and `base + ".slow"` backed by the SLOW-level total.
// NOTE(review): a possible `base + ".wal"` entry, if any, would be on an
// elided line — confirm against upstream.
16479 void RocksDBBlueFSVolumeSelector::get_paths(const std::string
& base
, paths
& res
) const
16481 res
.emplace_back(base
, l_totals
[LEVEL_DB
- LEVEL_FIRST
]);
16482 res
.emplace_back(base
+ ".slow", l_totals
[LEVEL_SLOW
- LEVEL_FIRST
]);
// Maps a RocksDB directory name to a level hint (returned as a void* for the
// BlueFS hint API): defaults to LEVEL_DB, with ".slow"/".wal" suffixed
// directories overriding it (the `res = LEVEL_SLOW` / `res = LEVEL_WAL`
// assignment lines are elided in this mangled source).
16485 void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname
) const {
16486 uint8_t res
= LEVEL_DB
;
// length() > 5 guards the suffix checks (shortest interesting name is
// "x.wal"-sized; ".slow" itself is 5 chars).
16487 if (dirname
.length() > 5) {
16488 // the "db.slow" and "db.wal" directory names are hard-coded to
16489 // match up with bluestore. the slow device is always the second
16490 // one (when a dedicated block.db device is present and used at
16491 // bdev 0). the wal device is always last.
16492 if (boost::algorithm::ends_with(dirname
, ".slow")) {
16495 else if (boost::algorithm::ends_with(dirname
, ".wal")) {
16499 return reinterpret_cast<void*>(res
);
// Dumps a human-readable usage report to `sout`: per-device totals, a
// "Usage matrix" of per-level/per-device current usage plus file counts, and
// a "MAXIMUMS" matrix of observed peaks. Rows are levels (LOG/WAL/DB/SLOW/
// TOTALS), columns devices. NOTE(review): mangled source; the `names` array
// contents, the setw/width applications after each setf, the case labels of
// both switches, and several closing braces are on elided lines.
16502 void RocksDBBlueFSVolumeSelector::dump(ostream
& sout
) {
// Matrix dimensions come from the usage tracker (x = devices, y = levels).
16503 auto max_x
= per_level_per_dev_usage
.get_max_x();
16504 auto max_y
= per_level_per_dev_usage
.get_max_y();
16505 sout
<< "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals
[LEVEL_WAL
- LEVEL_FIRST
]
16506 << ", db_total:" << l_totals
[LEVEL_DB
- LEVEL_FIRST
]
16507 << ", slow_total:" << l_totals
[LEVEL_SLOW
- LEVEL_FIRST
]
16508 << ", db_avail:" << db_avail4slow
<< std::endl
16509 << "Usage matrix:" << std::endl
;
// Column headers (initializer contents elided here).
16510 constexpr std::array
<const char*, 8> names
{ {
16520 const size_t width
= 12;
16521 for (size_t i
= 0; i
< names
.size(); ++i
) {
16522 sout
.setf(std::ios::left
, std::ios::adjustfield
);
// One row per level: label, then per-device usage, then file count.
16527 for (size_t l
= 0; l
< max_y
; l
++) {
16528 sout
.setf(std::ios::left
, std::ios::adjustfield
);
16530 switch (l
+ LEVEL_FIRST
) {
16532 sout
<< "LOG"; break;
16534 sout
<< "WAL"; break;
16536 sout
<< "DB"; break;
16538 sout
<< "SLOW"; break;
16540 sout
<< "TOTALS"; break;
16542 for (size_t d
= 0; d
< max_x
; d
++) {
16543 sout
.setf(std::ios::left
, std::ios::adjustfield
);
16545 sout
<< stringify(byte_u_t(per_level_per_dev_usage
.at(d
, l
)));
16547 sout
.setf(std::ios::left
, std::ios::adjustfield
);
16549 sout
<< stringify(per_level_files
[l
]) << std::endl
;
// The maximums tracker must have the same shape as the usage tracker.
16551 ceph_assert(max_x
== per_level_per_dev_max
.get_max_x());
16552 ceph_assert(max_y
== per_level_per_dev_max
.get_max_y());
16553 sout
<< "MAXIMUMS:" << std::endl
;
16554 for (size_t l
= 0; l
< max_y
; l
++) {
16555 sout
.setf(std::ios::left
, std::ios::adjustfield
);
16557 switch (l
+ LEVEL_FIRST
) {
16559 sout
<< "LOG"; break;
16561 sout
<< "WAL"; break;
16563 sout
<< "DB"; break;
16565 sout
<< "SLOW"; break;
16567 sout
<< "TOTALS"; break;
// Last device column is printed separately (after the loop) so the trailing
// formatting can differ.
16569 for (size_t d
= 0; d
< max_x
- 1; d
++) {
16570 sout
.setf(std::ios::left
, std::ios::adjustfield
);
16572 sout
<< stringify(byte_u_t(per_level_per_dev_max
.at(d
, l
)));
16574 sout
.setf(std::ios::left
, std::ios::adjustfield
);
16576 sout
<< stringify(byte_u_t(per_level_per_dev_max
.at(max_x
- 1, l
)));
16577 if (l
< max_y
- 1) {
16583 // =======================================================