// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include <sys/types.h>

#include <boost/container/flat_set.hpp>
#include "boost/algorithm/string.hpp"

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "bluestore_common.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#include "common/pretty_binary.h"

#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
using bid_t = decltype(BlueStore::Blob::id);

// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_Buffer);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_Extent);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_Blob);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_SharedBlob);

MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);

using std::make_pair;
using std::max;
using std::min;
using std::numeric_limits;
using std::ostringstream;
using std::string;
using std::stringstream;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::coarse_mono_clock;
using ceph::Formatter;
using ceph::JSONFormatter;
using ceph::make_timespan;
using ceph::mono_clock;
using ceph::mono_time;
using ceph::timespan_str;
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t
const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits

/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS        4
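
// A minimal illustrative sketch (compiled out, not the shipped encoder): the
// low BLOBID_SHIFT_BITS bits of an encoded blobid carry the flags above, and
// the blob id sits above them.  See ExtentMap::{encode,decode}_some() for the
// authoritative logic; the helper names below are hypothetical.
#if 0
static inline uint64_t example_encode_blobid(uint64_t id, uint64_t flags) {
  // flags in the low bits, id shifted above them
  return (id << BLOBID_SHIFT_BITS) | (flags & ((1ull << BLOBID_SHIFT_BITS) - 1));
}
static inline void example_decode_blobid(uint64_t v, uint64_t *id, uint64_t *flags) {
  *flags = v & ((1ull << BLOBID_SHIFT_BITS) - 1);
  *id = v >> BLOBID_SHIFT_BITS;
}
#endif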
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 */
#define ONODE_KEY_SUFFIX 'o'

/*
 * extent shard key: the object key, plus a u32 offset, plus 'x'
 * (see the comment above get_extent_shard_key() below).
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering.  Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments.  Instead we do additional sorting
 * where it is needed.
 */
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}

inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}

static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr - buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr - buff);
  }
  return p - orig_p;
}
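
// A small usage sketch of the escaping scheme (illustrative, compiled out):
// append_escaped() escapes bytes <= '#' and >= '~' as '#'/'~' plus two hex
// digits and terminates the string with '!', so decode_escaped() can recover
// the original bytes and report how much input it consumed.
#if 0
static void example_escape_roundtrip() {
  string key;
  append_escaped(string("rbd_data.#12"), &key); // '#' (0x23) becomes "#23"
  string decoded;
  int consumed = decode_escaped(key.c_str(), &decoded); // stops at the '!'
  ceph_assert(consumed >= 0 && decoded == "rbd_data.#12");
}
#endif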
template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}
static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end)
{
  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    end->hobj.set_bitwise_key_u32(end_hash);
    temp_end->hobj.set_bitwise_key_u32(end_hash);
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(0xffffffff);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}
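
// Worked example of the hash-range arithmetic above (illustrative numbers):
// for a PG with ps() == 1 and bits == 4, reverse_hash =
// _reverse_bits(1) = 0x80000000 and end_hash = 0x80000000 + (1 << 28) =
// 0x90000000, i.e. the collection covers 1/16th of the bit-reversed hash
// space; the clamp to 0xffffffff only matters for the last range.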
static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  uint32_t hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}

#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;

  p = _key_decode_prefix(p, oid);

  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}
template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check: the key must decode back to the same oid
  {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}
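
// A hedged usage sketch (compiled out, not part of the original file):
// encoding an oid and decoding it back must round-trip, which is exactly
// what the sanity check at the end of get_object_key() asserts.
#if 0
static void example_object_key_roundtrip(CephContext *cct, const ghobject_t& oid) {
  string key;
  get_object_key(cct, oid, &key); // prefix + escaped names + snap/gen + 'o'
  ghobject_t decoded;
  int r = get_key_object(key, &decoded);
  ceph_assert(r == 0 && decoded == oid);
}
#endif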
// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}
static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}
template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << "  shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << "  " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << "      csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << "       0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}
template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << "  attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}

template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}
ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}
namespace {

/*
 * Due to a bug in key string encoding (see a comment for append_escaped)
 * the KeyValueDB iterator does not lexicographically sort the same
 * way that ghobject_t does: objects with the same hash may have wrong order.
 *
 * This is the iterator wrapper that fixes the keys order.
 */
class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

  virtual int cmp(const ghobject_t &oid) const = 0;

  bool is_ge(const ghobject_t &oid) const {
    return cmp(oid) >= 0;
  }

  bool is_lt(const ghobject_t &oid) const {
    return cmp(oid) < 0;
  }

protected:
  KeyValueDB::Iterator m_it;
};
class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    string key;
    get_object_key(m_cct, oid, &key);

    return m_it->key().compare(key);
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    m_oid = ghobject_t();
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }
    if (!m_it->valid()) {
      return;
    }

    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};
class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    if (this->oid() < oid) {
      return -1;
    }
    if (this->oid() > oid) {
      return 1;
    }
    return 0;
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};

} // anonymous namespace
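
// A minimal sketch (hypothetical, compiled out) of the ordering anomaly the
// wrappers above work around: append_escaped() compares bytes as (signed)
// char, so on platforms where char is signed a byte > 0x7f tests <= '#' and
// gets escaped with a '#' prefix, while a smaller printable byte is copied
// verbatim; the escaped forms then sort differently than the raw strings.
#if 0
static void example_ordering_anomaly() {
  string a, b;
  append_escaped(string("\xc3"), &a); // 0xc3 is negative as char -> "#c3!"
  append_escaped(string("z"), &b);    // 'z' copied as-is        -> "z!"
  // raw order: "z" < "\xc3" (bytes compare unsigned), but "#c3!" < "z!"
  ceph_assert((a < b) != (string("\xc3") < string("z")));
}
#endif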
void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {
      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}
int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {
      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               start_offset,
                               end_offset,
                               gc_start_offset,
                               gc_end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
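
// Worked example of the benefit computation above (illustrative numbers): a
// fully unreferenced compressed blob occupying 64K on disk with
// min_alloc_size = 16K gives blob_expected_for_release = 64K/16K = 4
// allocation units; if rewriting its protruding extents is expected to cost
// 1 new allocation, benefit = 4 - 1 = 3, and the blob is collected once
// benefit reaches bluestore_gc_enable_blob_threshold.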
// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
  }
  void _unpin_and_rm(BlueStore::Onode* o) override
  {
    o->pop_cache();
    ceph_assert(num_pinned);
    --num_pinned;
    ceph_assert(num);
    --num;
  }
  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << "  rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
    CephContext* cct,
    string type,
    PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}
1146 struct LruBufferCacheShard
: public BlueStore::BufferCacheShard
{
1147 typedef boost::intrusive::list
<
1149 boost::intrusive::member_hook
<
1151 boost::intrusive::list_member_hook
<>,
1152 &BlueStore::Buffer::lru_item
> > list_t
;
1155 explicit LruBufferCacheShard(CephContext
*cct
) : BlueStore::BufferCacheShard(cct
) {}
1157 void _add(BlueStore::Buffer
*b
, int level
, BlueStore::Buffer
*near
) override
{
1159 auto q
= lru
.iterator_to(*near
);
1161 } else if (level
> 0) {
1166 buffer_bytes
+= b
->length
;
1169 void _rm(BlueStore::Buffer
*b
) override
{
1170 ceph_assert(buffer_bytes
>= b
->length
);
1171 buffer_bytes
-= b
->length
;
1172 auto q
= lru
.iterator_to(*b
);
1176 void _move(BlueStore::BufferCacheShard
*src
, BlueStore::Buffer
*b
) override
{
1178 _add(b
, 0, nullptr);
1180 void _adjust_size(BlueStore::Buffer
*b
, int64_t delta
) override
{
1181 ceph_assert((int64_t)buffer_bytes
+ delta
>= 0);
1182 buffer_bytes
+= delta
;
1184 void _touch(BlueStore::Buffer
*b
) override
{
1185 auto p
= lru
.iterator_to(*b
);
1189 _audit("_touch_buffer end");
1192 void _trim_to(uint64_t max
) override
1194 while (buffer_bytes
> max
) {
1195 auto i
= lru
.rbegin();
1196 if (i
== lru
.rend()) {
1197 // stop if lru is now empty
1201 BlueStore::Buffer
*b
= &*i
;
1202 ceph_assert(b
->is_clean());
1203 dout(20) << __func__
<< " rm " << *b
<< dendl
;
1204 b
->space
->_rm_buffer(this, b
);
1209 void add_stats(uint64_t *extents
,
1212 uint64_t *bytes
) override
{
1213 *extents
+= num_extents
;
1214 *blobs
+= num_blobs
;
1216 *bytes
+= buffer_bytes
;
1219 void _audit(const char *s
) override
1221 dout(10) << __func__
<< " " << when
<< " start" << dendl
;
1223 for (auto i
= lru
.begin(); i
!= lru
.end(); ++i
) {
1226 if (s
!= buffer_bytes
) {
1227 derr
<< __func__
<< " buffer_size " << buffer_bytes
<< " actual " << s
1229 for (auto i
= lru
.begin(); i
!= lru
.end(); ++i
) {
1230 derr
<< __func__
<< " " << *i
<< dendl
;
1232 ceph_assert(s
== buffer_bytes
);
1234 dout(20) << __func__
<< " " << when
<< " buffer_bytes " << buffer_bytes
// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in.  move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot.  fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }
  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }
  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }
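
  // Worked example of the budget split above (illustrative numbers): with
  // max = 100MB and bluestore_2q_cache_kin_ratio = 0.5, kin = 50MB and
  // khot = 50MB.  If the average buffer is 64K, calculated_num ~= 1600 and,
  // with bluestore_2q_cache_kout_ratio = 0.5, kout ~= 800 ghost entries may
  // stay in warm_out.  Slack then flows to whichever of hot/warm_in is under
  // its budget.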
  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
#endif
};
BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
    CephContext* cct,
    string type,
    PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}
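
// A hedged usage sketch (compiled out): the type string normally comes from
// configuration (bluestore_cache_type in upstream Ceph) and simply selects
// one of the shard implementations above.
#if 0
static BlueStore::BufferCacheShard*
example_make_shard(CephContext* cct, PerfCounters* logger) {
  // "2q" -> TwoQBufferCacheShard, "lru" -> LruBufferCacheShard
  return BlueStore::BufferCacheShard::create(cct, "2q", logger);
}
#endif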
// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}
* cache
, uint32_t offset
, uint32_t length
)
1589 // note: we already hold cache->lock
1590 ldout(cache
->cct
, 20) << __func__
<< std::hex
<< " 0x" << offset
<< "~" << length
1591 << std::dec
<< dendl
;
1592 int cache_private
= 0;
1593 cache
->_audit("discard start");
1594 auto i
= _data_lower_bound(offset
);
1595 uint32_t end
= offset
+ length
;
1596 while (i
!= buffer_map
.end()) {
1597 Buffer
*b
= i
->second
.get();
1598 if (b
->offset
>= end
) {
1601 if (b
->cache_private
> cache_private
) {
1602 cache_private
= b
->cache_private
;
1604 if (b
->offset
< offset
) {
1605 int64_t front
= offset
- b
->offset
;
1606 if (b
->end() > end
) {
1607 // drop middle (split)
1608 uint32_t tail
= b
->end() - end
;
1609 if (b
->data
.length()) {
1611 bl
.substr_of(b
->data
, b
->length
- tail
, tail
);
1612 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
, b
->flags
);
1613 nb
->maybe_rebuild();
1614 _add_buffer(cache
, nb
, 0, b
);
1616 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, tail
,
1620 if (!b
->is_writing()) {
1621 cache
->_adjust_size(b
, front
- (int64_t)b
->length
);
1625 cache
->_audit("discard end 1");
1629 if (!b
->is_writing()) {
1630 cache
->_adjust_size(b
, front
- (int64_t)b
->length
);
1638 if (b
->end() <= end
) {
1639 // drop entire buffer
1640 _rm_buffer(cache
, i
++);
1644 uint32_t keep
= b
->end() - end
;
1645 if (b
->data
.length()) {
1647 bl
.substr_of(b
->data
, b
->length
- keep
, keep
);
1648 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
, b
->flags
);
1649 nb
->maybe_rebuild();
1650 _add_buffer(cache
, nb
, 0, b
);
1652 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, keep
,
1656 _rm_buffer(cache
, i
);
1657 cache
->_audit("discard end 2");
1660 return cache_private
;
void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}
void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_audit("finish_write end");
}
void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, bl, p->second->flags),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                        0, right, p->second->flags),
                      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data, p->second->flags),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length, p->second->flags),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
}
// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
                                               OnodeRef& o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add(o.get(), 1);
  return o;
}

void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
{
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
  onode_map.erase(oid);
}

BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << " " << p->second->nref
                            << " " << p->second->cached
                            << " " << p->second->pinned
                            << dendl;
      // This will pin onode and implicitly touch the cache when Onode
      // eventually will become unpinned
      o = p->second;
      ceph_assert(!o->cached || o->pinned);

      hit = true;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}

void BlueStore::OnodeSpace::clear()
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
  for (auto &p : onode_map) {
    cache->_rm(p.second.get());
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard l(cache->lock);
  return onode_map.empty();
}
void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_meta::string& new_okey)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  ceph_assert(po != pn);

  ceph_assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << "  removing target " << pn->second
                          << dendl;
    cache->_rm(pn->second.get());
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add(oldo.get(), 1);
  // add at new position and fix oid, key.
  // This will pin 'o' and implicitly touch cache
  // when it will eventually become unpinned
  onode_map.insert(make_pair(new_oid, o));
  ceph_assert(o->pinned);

  o->oid = new_oid;
  o->key = new_okey;
}
bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second.get())) {
      return true;
    }
  }
  return false;
}

template <int LogLevelV = 30>
void BlueStore::OnodeSpace::dump(CephContext *cct)
{
  for (auto& i : onode_map) {
    ldout(cct, LogLevelV) << i.first << " : " << i.second
                          << " " << i.second->nref
                          << " " << i.second->cached
                          << " " << i.second->pinned
                          << dendl;
  }
}
// SharedBlob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
#undef dout_context
#define dout_context coll->store->cct

void BlueStore::SharedBlob::dump(Formatter* f) const
{
  f->dump_bool("loaded", loaded);
  if (loaded) {
    persistent->dump(f);
  } else {
    f->dump_unsigned("sbid_unloaded", sbid_unloaded);
  }
}

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  ceph_assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}

BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}

void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    dout(20) << __func__ << " " << this
             << " removing self from set " << get_parent()
             << dendl;
  again:
    auto coll_snap = coll;
    if (coll_snap) {
      std::lock_guard l(coll_snap->cache->lock);
      if (coll_snap != coll) {
        goto again;
      }
      if (!coll_snap->shared_blob_set.remove(this, true)) {
        // race with lookup
        return;
      }
      bc._clear(coll_snap->cache);
      coll_snap->cache->rm_blob();
    }
    delete this;
  }
}

void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  ceph_assert(persistent);
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
                                    PExtentVector *r,
                                    bool *unshare)
{
  ceph_assert(persistent);
  persistent->ref_map.put(offset, length, r,
                          unshare && !*unshare ? unshare : nullptr);
}

void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    BufferCacheShard *cache = coll->cache;
    std::lock_guard l(cache->lock);
    if (coll->cache != cache) {
      dout(20) << __func__
               << " raced with sb cache update, was " << cache
               << ", now " << coll->cache << ", retrying"
               << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}
// SharedBlobSet

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

template <int LogLevelV = 30>
void BlueStore::SharedBlobSet::dump(CephContext *cct)
{
  std::lock_guard l(lock);
  for (auto& i : sb_map) {
    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
  }
}
// Blob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

void BlueStore::Blob::dump(Formatter* f) const
{
  if (is_spanning()) {
    f->dump_unsigned("spanning_id ", id);
  }
  blob.dump(f);
  if (shared_blob) {
    f->dump_object("shared", *shared_blob);
  }
}

ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
  if (b.shared_blob) {
    out << " " << *b.shared_blob;
  } else {
    out << " (shared_blob=NULL)";
  }
  out << ")";
  return out;
}
void BlueStore::Blob::discard_unallocated(Collection *coll)
{
  if (get_blob().is_shared()) {
    return;
  }
  if (get_blob().is_compressed()) {
    bool discard = false;
    bool all_invalid = true;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
        discard = true;
      } else {
        all_invalid = false;
      }
    }
    ceph_assert(discard == all_invalid); // in case of compressed blob all
                                         // or none pextents are invalid.
    if (discard) {
      shared_blob->bc.discard(shared_blob->get_cache(), 0,
                              get_blob().get_logical_length());
    }
  } else {
    size_t pos = 0;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
        dout(20) << __func__ << " 0x" << std::hex << pos
                 << "~" << e.length
                 << std::dec << dendl;
        shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
      }
      pos += e.length;
    }
    if (get_blob().can_prune_tail()) {
      dirty_blob().prune_tail();
      used_in_blob.prune_tail(get_blob().get_ondisk_length());
      dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
    }
  }
}
void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // Caller has to initialize Blob's logical length prior to incrementing
  // references.  Otherwise one is unable to determine the required number
  // of counters for per-au tracking, or to obtain min_release_size for
  // single counter mode.
  ceph_assert(get_blob().get_logical_length() != 0);
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}

bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);
  r->clear();
  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}
bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  ceph_assert(min_alloc_size);
  ceph_assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently for the sake of simplicity we omit blob reuse if data is
  // unaligned with csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than current blob len
  target_blob_size = std::max(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data totally stands out of the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = std::max(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }
    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}
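
// Worked example of the tail arithmetic above (illustrative, assuming the
// overlap is unallocated): blen = 0x8000, b_offset = 0x6000, length = 0x4000
// -> end = 0xa000 and new_blen = 0xa000.  With target_blob_size = 0x9000,
// overflow = 0x1000 < length, so the write is trimmed to length = 0x3000,
// new_blen becomes 0x9000, and the blob grows via add_tail(0x9000).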
void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  ceph_assert(blob.can_split());
  ceph_assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}

#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::const_iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif
// Extent

void BlueStore::Extent::dump(Formatter* f) const
{
  f->dump_unsigned("logical_offset", logical_offset);
  f->dump_unsigned("length", length);
  f->dump_unsigned("blob_offset", blob_offset);
  f->dump_object("blob", *blob);
}

ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
             << ": 0x" << e.blob_offset << "~" << e.length << std::dec
             << " " << *e.blob;
}

// OldExtent
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = !b->is_referenced();
  return oe;
}
// ExtentMap

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
#undef dout_context
#define dout_context onode->c->store->cct

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}

void BlueStore::ExtentMap::dump(Formatter* f) const
{
  f->open_array_section("extents");

  for (auto& e : extent_map) {
    f->dump_object("extent", e);
  }
  f->close_section();
}
* b
, TransContext
* txc
,
2379 CollectionRef
& c
, OnodeRef
& oldo
, OnodeRef
& newo
, uint64_t& srcoff
,
2380 uint64_t& length
, uint64_t& dstoff
) {
2382 auto cct
= onode
->c
->store
->cct
;
2384 cct
->_conf
->bluestore_debug_inject_bug21040
;
2385 vector
<BlobRef
> id_to_blob(oldo
->extent_map
.extent_map
.size());
2386 for (auto& e
: oldo
->extent_map
.extent_map
) {
2387 e
.blob
->last_encoded_id
= -1;
2391 uint64_t end
= srcoff
+ length
;
2392 uint32_t dirty_range_begin
= 0;
2393 uint32_t dirty_range_end
= 0;
2394 bool src_dirty
= false;
2395 for (auto ep
= oldo
->extent_map
.seek_lextent(srcoff
);
2396 ep
!= oldo
->extent_map
.extent_map
.end();
2399 if (e
.logical_offset
>= end
) {
2402 dout(20) << __func__
<< " src " << e
<< dendl
;
2404 bool blob_duped
= true;
2405 if (e
.blob
->last_encoded_id
>= 0) {
2406 cb
= id_to_blob
[e
.blob
->last_encoded_id
];
2410 const bluestore_blob_t
& blob
= e
.blob
->get_blob();
2411 // make sure it is shared
2412 if (!blob
.is_shared()) {
2413 c
->make_blob_shared(b
->_assign_blobid(txc
), e
.blob
);
2414 if (!inject_21040
&& !src_dirty
) {
2416 dirty_range_begin
= e
.logical_offset
;
2417 } else if (inject_21040
&&
2418 dirty_range_begin
== 0 && dirty_range_end
== 0) {
2419 dirty_range_begin
= e
.logical_offset
;
2421 ceph_assert(e
.logical_end() > 0);
2422 // -1 to exclude next potential shard
2423 dirty_range_end
= e
.logical_end() - 1;
2425 c
->load_shared_blob(e
.blob
->shared_blob
);
2428 e
.blob
->last_encoded_id
= n
;
2431 // bump the extent refs on the copied blob's extents
2432 for (auto p
: blob
.get_extents()) {
2434 e
.blob
->shared_blob
->get_ref(p
.offset
, p
.length
);
2437 txc
->write_shared_blob(e
.blob
->shared_blob
);
2438 dout(20) << __func__
<< " new " << *cb
<< dendl
;
2441 int skip_front
, skip_back
;
2442 if (e
.logical_offset
< srcoff
) {
2443 skip_front
= srcoff
- e
.logical_offset
;
2447 if (e
.logical_end() > end
) {
2448 skip_back
= e
.logical_end() - end
;
2453 Extent
* ne
= new Extent(e
.logical_offset
+ skip_front
+ dstoff
- srcoff
,
2454 e
.blob_offset
+ skip_front
, e
.length
- skip_front
- skip_back
, cb
);
2455 newo
->extent_map
.extent_map
.insert(*ne
);
2456 ne
->blob
->get_ref(c
.get(), ne
->blob_offset
, ne
->length
);
2457 // fixme: we may leave parts of new blob unreferenced that could
2458 // be freed (relative to the shared_blob).
2459 txc
->statfs_delta
.stored() += ne
->length
;
2460 if (e
.blob
->get_blob().is_compressed()) {
2461 txc
->statfs_delta
.compressed_original() += ne
->length
;
2463 txc
->statfs_delta
.compressed() +=
2464 cb
->get_blob().get_compressed_payload_length();
2467 dout(20) << __func__
<< " dst " << *ne
<< dendl
;
2470 if ((!inject_21040
&& src_dirty
) ||
2471 (inject_21040
&& dirty_range_end
> dirty_range_begin
)) {
2472 oldo
->extent_map
.dirty_range(dirty_range_begin
,
2473 dirty_range_end
- dirty_range_begin
);
2474 txc
->write_onode(oldo
);
2476 txc
->write_onode(newo
);
2478 if (dstoff
+ length
> newo
->onode
.size
) {
2479 newo
->onode
.size
= dstoff
+ length
;
2481 newo
->extent_map
.dirty_range(dstoff
, length
);
void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
      ceph_assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << "  inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << "  encode_some needs reshard" << dendl;
            ceph_assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << "  shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf()->bluestore_extent_map_shard_min_size) {
            ceph_assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}
bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (bid >= 0)
    return bid;
  // Find next unused bid;
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}
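// Note on the allocation strategy above: the common case is O(1) -- one past
// the largest key in spanning_blob_map. Only if that increment overflows
// bid_t does it fall back to a randomized probe over the id space, and it
// aborts only after wrapping all the way back to the starting candidate,
// i.e. when every id is genuinely in use.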
void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << "   spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << "   shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // we may need to fault in a larger interval later must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
      );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / std::max(1u, extents);
  dout(20) << __func__ << "  extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << "  new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << "  new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << "  new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << "  new " << new_shard_info << dendl;
  dout(20) << __func__ << "  old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    ceph_assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << "  fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);

    bool was_too_many_blobs_check = false;
    auto too_many_blobs_threshold =
      g_conf()->bluestore_debug_too_many_blobs_threshold;
    auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;

    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        ceph_assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << " shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }

      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning.  We prefer to cut the blob if we can.  Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << "    splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << "    adding spanning " << *b << dendl;
            if (!was_too_many_blobs_check &&
                too_many_blobs_threshold &&
                spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {

              was_too_many_blobs_check = true;
              for (size_t i = 0; i < dumped_onodes.size(); ++i) {
                if (dumped_onodes[i].first == onode->oid) {
                  oid_slot = &dumped_onodes[i];
                  break;
                }
                if (!oldest_slot || (oldest_slot &&
                    dumped_onodes[i].second < oldest_slot->second)) {
                  oldest_slot = &dumped_onodes[i];
                }
              }
            }
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
        }
      }
    }
    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
      (oid_slot &&
        (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
    if (do_dump) {
      dout(0) << __func__
              << " spanning blob count exceeds threshold, "
              << spanning_blob_map.size() << " spanning blobs"
              << dendl;
      _dump_onode<0>(cct, *onode);
      if (oid_slot) {
        oid_slot->second = mono_clock::now();
      } else {
        ceph_assert(oldest_slot);
        oldest_slot->first = onode->oid;
        oldest_slot->second = mono_clock::now();
      }
    }
  }

  clear_needs_reshard();
}
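// Illustrative sizing example for the heuristic above (assumed config
// values, not asserted defaults): with
// bluestore_extent_map_shard_target_size=500 and a slop factor of 0.2,
// slop=100; a shard boundary is emitted once the running 'estimate'
// (extent_avg per extent) would exceed 500 bytes, or 600 bytes when the
// candidate boundary would fall inside a blob (would_span) -- trading a
// slightly oversized shard for not having to split or span that blob.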
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    ceph_assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1;  // so it is always non-zero
        include_blob = true;
        blobid = 0;  // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}
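// Encoding notes, derived from the loop above: each extent is prefixed by a
// varint 'blobid' whose low bits carry flags -- CONTIGUOUS (the extent
// starts exactly at the previous extent's logical_end, so no gap delta is
// stored), ZEROOFFSET (blob_offset == 0, elided), SAMELENGTH (length equals
// the previous extent's, elided) and SPANNING (the blob lives in
// spanning_blob_map and is referenced by id rather than encoded inline);
// the blob id itself sits in the bits above BLOBID_SHIFT_BITS. A contiguous
// run of equal-sized extents on one inline blob therefore costs roughly one
// varint per extent.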
unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  ceph_assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  ceph_assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << "  getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        ceph_assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  ceph_assert(n == num);
  return num;
}
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;
  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;
  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::const_iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  ceph_assert(struct_v == 1 || struct_v == 2);
  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}
void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  ceph_assert(last >= start);
  string key;
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            ceph_assert(r >= 0);
          }
        }
      );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset
               << " for range 0x" << offset << "~" << length << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      ceph_assert(p->dirty == false);
      ceph_assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}
void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  if (length == 0) {
    length = 1;
  }
  auto last = seek_shard(offset + length - 1);
  if (start < 0)
    return;

  ceph_assert(last >= start);
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      derr << __func__ << "on write 0x" << std::hex << offset
           << "~" << length << " shard 0x" << p->shard_info->offset
           << std::dec << " is not loaded, can't mark dirty" << dendl;
      ceph_abort_msg("can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}
int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  ceph_assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      ceph_assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  return removed;
}
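// Merge example: extents {0x0~0x1000 -> blob A @0x0} and
// {0x1000~0x1000 -> blob A @0x1000} satisfy all four conditions above
// (logically adjacent, same blob, contiguous blob offsets, same shard) and
// collapse into {0x0~0x2000 -> blob A @0x0}; merging never crosses
// shard_end, so shard boundaries stay aligned with extent boundaries.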
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        ceph_assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}
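// The loop above distinguishes four overlap shapes between an existing
// lextent and the punched range [offset, end):
//   1. extent straddles both edges -> keep the head, deref the middle,
//      re-add the tail at 'end';
//   2. extent straddles the left edge only -> truncate to the head, deref
//      the tail;
//   3. extent fully inside the range -> deref the whole lextent;
//   4. extent straddles the right edge only -> deref the head, re-add the
//      surviving tail at 'end'.
// Every dereferenced piece lands in old_extents for later cleanup.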
BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need to have completely initialized Blob to increment its ref counters.
  ceph_assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent from putting reused blob into
  // old_extents list if we overwrite the blob totally
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << "  split " << *ep << dendl;
      dout(30) << __func__ << "     to " << *ne << dendl;
    } else {
      // switch blob
      ceph_assert(ep->blob_offset >= blob_offset);

      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << "  adjusted " << *ep << dendl;
    }
  }
  return rb;
}
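// Example: splitting a blob serving extents over 0x0~0x8000 at
// blob_offset 0x4000 leaves lb covering the first half; every extent at or
// beyond the cut switches to rb with its blob_offset rebased by -0x4000,
// and an extent straddling the cut is itself split in two so that no single
// extent references both halves.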
// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

//
// A tricky thing about Onode's ref counter is that we do an additional
// increment when newly pinned instance is detected. And -1 on unpin.
// This prevents from a conflict with a delete call (when nref == 0).
// The latter might happen while the thread is in unpin() function
// (and e.g. waiting for lock acquisition) since nref is already
// decremented. And another 'putting' thread on the instance will release it.
//
void BlueStore::Onode::get() {
  if (++nref >= 2 && !pinned) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool was_pinned = pinned;
    pinned = nref >= 2;
    // additional increment for newly pinned instance
    bool r = !was_pinned && pinned;
    if (r) {
      ++nref;
    }
    if (cached && r) {
      ocs->_pin(this);
    }
    ocs->lock.unlock();
  }
}

void BlueStore::Onode::put() {
  int n = --nref;
  if (n == 2) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool need_unpin = pinned;
    pinned = pinned && nref > 2; // intentionally use > not >= as we have
                                 // +1 due to pinned state
    need_unpin = need_unpin && !pinned;
    if (cached && need_unpin) {
      if (exists) {
        ocs->_unpin(this);
      } else {
        ocs->_unpin_and_rm(this);
        // remove will also decrement nref and delete Onode
        c->onode_map._remove(oid);
      }
    }
    // additional decrement for newly unpinned instance
    // should be the last action since Onode can be released
    // at any point after this decrement
    if (need_unpin) {
      n = --nref;
    }
    ocs->lock.unlock();
  }
  if (n == 0) {
    delete this;
  }
}
BlueStore::Onode* BlueStore::Onode::decode(
  CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& v)
{
  Onode* on = new Onode(c.get(), oid, key);
  on->exists = true;
  auto p = v.front().begin_deep();
  on->onode.decode(p);
  for (auto& i : on->onode.attrs) {
    i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }

  // initialize extent_map
  on->extent_map.decode_spanning_blobs(p);
  if (on->onode.extent_map_shards.empty()) {
    denc(on->extent_map.inline_bl, p);
    on->extent_map.decode_some(on->extent_map.inline_bl);
    on->extent_map.inline_bl.reassign_to_mempool(
      mempool::mempool_bluestore_cache_data);
  } else {
    on->extent_map.init_shards(false, false);
  }
  return on;
}
void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    waiting_count++;
    std::unique_lock l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
    waiting_count--;
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}

void BlueStore::Onode::dump(Formatter* f) const
{
  onode.dump(f);
  extent_map.dump(f);
}
const string& BlueStore::Onode::get_omap_prefix()
{
  if (onode.is_pgmeta_omap()) {
    return PREFIX_PGMETA_OMAP;
  }
  if (onode.is_perpg_omap()) {
    return PREFIX_PERPG_OMAP;
  }
  if (onode.is_perpool_omap()) {
    return PREFIX_PERPOOL_OMAP;
  }
  return PREFIX_OMAP;
}
void BlueStore::Onode::get_omap_header(string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('-');
}

void BlueStore::Onode::get_omap_key(const string& key, string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('.');
  out->append(key);
}

void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

void BlueStore::Onode::get_omap_tail(string *out)
{
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      _key_encode_u64(c->pool(), out);
      _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
    } else if (onode.is_perpool_omap()) {
      _key_encode_u64(c->pool(), out);
    }
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('~');
}

void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
{
  size_t pos = sizeof(uint64_t) + 1;
  if (!onode.is_pgmeta_omap()) {
    if (onode.is_perpg_omap()) {
      pos += sizeof(uint64_t) + sizeof(uint32_t);
    } else if (onode.is_perpool_omap()) {
      pos += sizeof(uint64_t);
    }
  }
  *user_key = key.substr(pos);
}
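// Resulting omap key layouts (from the helpers above); the separator bytes
// are ordered '-' (header) < '.' (user keys) < '~' (tail), which keeps each
// object's omap contiguous and sorted in the KV store:
//   legacy/pgmeta: u64 nid                         + sep + user key
//   per-pool:      u64 pool + u64 nid              + sep + user key
//   per-pg:        u64 pool + u32 hash + u64 nid   + sep + user key
// decode_omap_key() simply skips the same fixed-width prefix to recover the
// user key.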
// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  ceph_assert((loffs % min_alloc_size) == 0);
  ceph_assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
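// Example of the alignment logic above with min_alloc_size=0x1000: a queued
// write at logical_offset=0x1800, length0=0x400 is widened to the
// allocation-unit range [0x1000, 0x2000); a new write covering [0x0, 0x2000)
// on the same blob then conflicts because the half-open intervals intersect,
// while one covering [0x2000, 0x3000) does not.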
// =======================================================
// DeferredBatch

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
#undef dout_context
#define dout_context cct

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  ceph_assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}
void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << "  keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      ceph_assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << "  keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      ceph_assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    ceph_assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << "  truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << "  drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    ceph_assert(i->second >= 0);
    p = iomap.erase(p);
  }
}
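// Trimming example: with a queued io at 0x1000~0x3000 and a _discard of
// 0x2000~0x1000, the first branch keeps head 0x1000~0x1000 in place and
// re-inserts tail 0x3000~0x1000 under the same seq; seq_bytes for that seq
// drops by exactly the 0x1000 bytes discarded.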
void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0;  // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    ceph_assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  ceph_assert(sb == seq_bytes);
}
// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}
void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}
void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {
    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}
void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}

uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  auto *ocache = get_onode_cache();
  auto *ocache_dest = dest->get_onode_cache();

  // lock cache shards
  std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
  std::lock_guard l(ocache->lock, std::adopt_lock);
  std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
  std::lock_guard l3(cache->lock, std::adopt_lock);
  std::lock_guard l4(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensuring that nref is always >= 2 and hence onode is pinned and
      // physically out of cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      if (o->cached) {
        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      }
      o->c = dest;

      // move over shared blobs and buffers.  cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << "   moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}
// =======================================================
// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  binned_kv_cache = store->db->get_priority_cache();
  binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true, "bluestore-pricache");
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
    if (binned_kv_onode_cache != nullptr) {
      pcm->insert("kv_onode", binned_kv_onode_cache, true);
    }
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;

    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval/3;
    }

    // Now Resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do final dump
  store->_record_allocation_stats();
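  stop = false;
  pcm = nullptr;
  return NULL;
}

// Budget arithmetic used above, with illustrative (assumed, not asserted)
// config values: osd_memory_target=4 GiB, osd_memory_base=768 MiB,
// osd_memory_expected_fragmentation=0.15 gives
// ltarget = 0.85 * 4 GiB ~= 3.4 GiB; since that exceeds base + min,
// max = ltarget - base ~= 2.65 GiB becomes the ceiling handed to the
// PriorityCache manager. Otherwise max stays at osd_memory_cache_min.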
void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  if (binned_kv_onode_cache != nullptr) {
    binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}

void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t kv_onode_alloc =
    static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
    if (binned_kv_onode_cache != nullptr) {
      kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
    }
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " kv_onode_alloc: " << kv_onode_alloc
            << " kv_onode_used: " << kv_onode_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " kv_onode_alloc: " << kv_onode_alloc
             << " kv_onode_used: " << kv_onode_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}
void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}
// =======================================================
// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}
int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    });
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    });
  return 0;
}
bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}
// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
#undef dout_context
#define dout_context cct

static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(shared_alloc.a);
  shared_alloc.a->release(to_release);
}
4483 BlueStore::BlueStore(CephContext
*cct
, const string
& path
)
4484 : BlueStore(cct
, path
, 0) {}
4486 BlueStore::BlueStore(CephContext
*cct
,
4488 uint64_t _min_alloc_size
)
4489 : ObjectStore(cct
, path
),
4491 finisher(cct
, "commit_finisher", "cfin"),
4492 kv_sync_thread(this),
4493 kv_finalize_thread(this),
4494 zoned_cleaner_thread(this),
4495 min_alloc_size(_min_alloc_size
),
4496 min_alloc_size_order(ctz(_min_alloc_size
)),
4497 mempool_thread(this)
4500 cct
->_conf
.add_observer(this);
4501 set_cache_shards(1);
BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);
  _shutdown_logger();
  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}
const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}
void BlueStore::handle_conf_change(const ConfigProxy& conf,
				   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
      changed.count("bluestore_warn_on_no_per_pg_omap")) {
    _check_no_per_pg_or_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}
void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str()
	   << " compressor" << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
	   << " alg " << (compressor ? compressor->get_type_name() : "(none)")
	   << " min_blob " << comp_min_blob_size
	   << " max_blob " << comp_max_blob_size
	   << dendl;
}
void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
	   << Checksummer::get_csum_type_string(csum_type)
	   << dendl;
}
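// Example: with the default bluestore_csum_type = "crc32c",
// get_csum_string_type() yields Checksummer::CSUM_CRC32C and csum_type is
// updated; an unrecognized string leaves csum_type at CSUM_NONE.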
void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
	   << dendl;
}
void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}
void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");

  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}
int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
	 << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
	 << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
  if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
	 << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
	 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
	 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
	 << dendl;
    return -EINVAL;
  }

  cache_data_ratio = (double)1.0 -
                     (double)cache_meta_ratio -
                     (double)cache_kv_ratio -
                     (double)cache_kv_onode_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
	  << " kv " << cache_kv_ratio
	  << " data " << cache_data_ratio
	  << dendl;
  return 0;
}
int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}
int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}
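// Note: both write_meta() and read_meta() above prefer the key/value map
// embedded in the "block" device label, and fall back to
// ObjectStore::{write,read}_meta() (plain files in the data dir) when the
// label cannot be read.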
void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
		 "Average kv_thread flush latency",
		 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
		 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
		 "Average kv_sync thread latency",
		 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
		 "Average kv_finalize thread latency",
		 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
		 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
		 "Average aio_wait state latency",
		 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
		 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
		 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
		 "Average kv_commiting state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
		 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
		 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
		 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
		 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
		 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
		 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
		 "Average submit throttle latency",
		 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
		 "Average submit latency",
		 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
		 "Average commit latency",
		 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
		 "Average read latency",
		 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
		 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
		 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
		 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
		 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
		 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
		    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
		    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
		    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
		    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
		    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
		    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
	    "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
	    "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
	    "Sum for stored compressed bytes",
	    "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
	    "Sum for bytes allocated for compressed data",
	    "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
	    "Sum for original bytes that were compressed",
	    "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
	    "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
	    "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
		    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
		    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
		    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
		    "bluestore_onode_shard_misses",
		    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
	    "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
	    "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
	    "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
	    "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
		    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
		    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
		    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
		    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
		    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_big_deferred,
		    "bluestore_write_big_deferred",
		    "Big overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
		    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
		    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
		    "bluestore_write_small_unused",
		    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_deferred,
		    "bluestore_write_deferred",
		    "Overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
		    "bluestore_write_small_pre_read",
		    "Small writes that required we read some data (possibly "
		    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_new, "bluestore_write_new",
		    "Write into new blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
		    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
		    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
		    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
		    "Sum for extents that have been merged due to garbage "
		    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
		    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
		    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
	    "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
		 "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
		 "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
		 "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
		 "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
		 "Average omap get_keys call latency");
  b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
		 "Average omap get_values call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
		 "Average collection listing latency");
  b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
		 "Average removal latency");

  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
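// Usage sketch: once registered, these counters are visible through the
// admin socket, e.g. `ceph daemon osd.<id> perf dump`, or can be read
// programmatically via logger->get(l_bluestore_onodes) and friends.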
int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}
void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}
int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
				     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}
int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
	 << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}
int BlueStore::_write_bdev_label(CephContext *cct,
				 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
	 << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
	 << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
	 << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}
int BlueStore::_read_bdev_label(CephContext* cct, string path,
				bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
	 << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
	 << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (ceph::buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
	    << ": " << e.what() << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
	 << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
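// On-disk layout assumed by the two label helpers above: the first
// BDEV_LABEL_BLOCK_SIZE bytes of the device hold the encoded
// bluestore_bdev_label_t, followed by a crc32c over those encoded bytes;
// the remainder of the block is zero padding.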
int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	       << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	   << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}
void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    if (_use_rotational_settings()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    if (_use_rotational_settings()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
	   << std::dec << " order " << (int)min_alloc_size_order
	   << " max_alloc_size 0x" << std::hex << max_alloc_size
	   << " prefer_deferred_size 0x" << prefer_deferred_size
	   << std::dec
	   << " deferred_batch_ops " << deferred_batch_ops
	   << dendl;
}
int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this),
			     discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
  _set_max_defer_interval();
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }

  if (bdev->is_smr()) {
    freelist_type = "zoned";
  }
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}
void BlueStore::_validate_bdev()
{
  ceph_assert(bdev);
  uint64_t dev_size = bdev->get_size();
  ceph_assert(dev_size > _get_ondisk_reserved());
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}
int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
  int r;

  ceph_assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (t) {
    // create mode. initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);

    uint64_t alloc_size = min_alloc_size;
    if (bdev->is_smr()) {
      alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
    }

    fm->create(bdev->get_size(), alloc_size, t);

    // allocate superblock reserved space. note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs doing that itself.
    auto reserved = _get_ondisk_reserved();
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
	      << cct->_conf->bluestore_debug_prefill << " with max free extent "
	      << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
	uint64_t l = (rand() % max_b + 1) * min_alloc_size;
	if (start + l > end) {
	  l = end - start;
	  l = p2align(l, min_alloc_size);
	}
	ceph_assert(start + l <= end);

	uint64_t u = 1 + (uint64_t)(r * (double)l);
	u = p2roundup(u, min_alloc_size);
	if (start + l + u > end) {
	  u = end - (start + l);
	  // trim to align so we don't overflow again
	  u = p2align(u, min_alloc_size);
	  stop = true;
	}
	ceph_assert(start + l + u <= end);

	dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
		 << " use 0x" << u << std::dec << dendl;

	if (u == 0) {
	  // break if u has been trimmed to nothing
	  break;
	}

	fm->allocate(start + l, u, t);
	start += l + u;
      }
    }
    r = _write_out_fm_meta(0);
    ceph_assert(r == 0);
  }
  r = fm->init(db, read_only,
    [&](const std::string& key, std::string* result) {
      return read_meta(key, result);
    });
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  // if space size tracked by free list manager is that higher than actual
  // dev size one can hit out-of-space allocation which will result
  // in data loss and/or assertions
  // Probably user altered the device size somehow.
  // The only fix for now is to redeploy OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << " fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}
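// Why r /= (1.0 - r) in the prefill path above: if the target used fraction
// is f, each free run of length l must be followed by u = l * f / (1 - f)
// used bytes so that u / (l + u) == f; e.g. f = 0.2 gives u = 0.25 * l.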
void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}
int BlueStore::_write_out_fm_meta(uint64_t target_size)
{
  int r = 0;
  string p = path + "/block";

  std::vector<std::pair<string, string>> fm_meta;
  fm->get_meta(target_size, &fm_meta);

  for (auto& m : fm_meta) {
    r = write_meta(m.first, m.second);
    ceph_assert(r == 0);
  }
  return r;
}
int BlueStore::_create_alloc()
{
  ceph_assert(shared_alloc.a == NULL);
  ceph_assert(bdev->get_size());

  uint64_t alloc_size = min_alloc_size;
  if (bdev->is_smr()) {
    int r = _zoned_check_config_settings();
    if (r < 0)
      return r;
    alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
  }

  shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator,
				     bdev->get_size(),
				     alloc_size, "block"));

  if (!shared_alloc.a) {
    lderr(cct) << __func__ << "Failed to create allocator:: "
	       << cct->_conf->bluestore_allocator
	       << dendl;
    return -EINVAL;
  }
  return 0;
}
int BlueStore::_init_alloc()
{
  int r = _create_alloc();
  if (r < 0) {
    return r;
  }
  ceph_assert(shared_alloc.a != NULL);

  if (bdev->is_smr()) {
    shared_alloc.a->zoned_set_zone_states(fm->get_zone_states(db));
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(db, &offset, &length)) {
    shared_alloc.a->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();

  dout(1) << __func__
	  << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
	  << std::hex
	  << ", allocator type " << shared_alloc.a->get_type()
	  << ", capacity 0x" << shared_alloc.a->get_capacity()
	  << ", block size 0x" << shared_alloc.a->get_block_size()
	  << ", free 0x" << shared_alloc.a->get_free()
	  << ", fragmentation " << shared_alloc.a->get_fragmentation()
	  << std::dec << dendl;

  return 0;
}
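// Note: startup cost here is proportional to the number of free extents the
// FreelistManager enumerates, which is what the dout(1) summary above
// (extent count, free bytes, fragmentation) reports.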
void BlueStore::_close_alloc()
{
  ceph_assert(bdev);
  bdev->discard_drain();

  ceph_assert(shared_alloc.a);
  shared_alloc.a->shutdown();
  delete shared_alloc.a;
  shared_alloc.reset();
}
int BlueStore::_open_fsid(bool create)
{
  ceph_assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}
int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}
int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}
void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}
int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
	 << " (is another ceph-osd still running?)"
	 << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
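// The F_SETLK above takes a non-blocking advisory write lock: if another
// ceph-osd already holds it, fcntl() fails immediately (EAGAIN/EACCES)
// rather than waiting, which is what test_mount_in_use() below relies on.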
bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}
bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
            << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}
bool BlueStore::_use_rotational_settings()
{
  if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
    return true;
  }
  if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
    return false;
  }
  return bdev->is_rotational();
}
bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist). only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}
int BlueStore::_minimal_open_bluefs(bool create)
{
  int r;
  bluefs = new BlueFS(cct);

  string bfn;
  struct stat st;

  bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(
      BlueFS::BDEV_DB, bfn,
      create && cct->_conf->bdev_enable_discard,
      SUPER_RESERVED);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
      r = _check_or_set_bdev_label(
	bfn,
	bluefs->get_block_device_size(BlueFS::BDEV_DB),
        "bluefs db", create);
      if (r < 0) {
	derr << __func__
	     << " check block device(" << bfn << ") label returned: "
	     << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  } else {
    r = -errno;
    if (::lstat(bfn.c_str(), &st) == -1) {
      r = 0;
      bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    } else {
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }

  // shared device
  bfn = path + "/block";
  // never trim here
  r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
			       0, // no need to provide valid 'reserved' for shared dev
			       &shared_alloc);
  if (r < 0) {
    derr << __func__ << " add block device(" << bfn << ") returned: "
	 << cpp_strerror(r) << dendl;
    goto free_bluefs;
  }

  bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
				 create && cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
      r = _check_or_set_bdev_label(
	bfn,
	bluefs->get_block_device_size(BlueFS::BDEV_WAL),
        "bluefs wal", create);
      if (r < 0) {
        derr << __func__ << " check block device(" << bfn
             << ") label returned: " << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }

    bluefs_layout.dedicated_wal = true;
  } else {
    r = 0;
    if (::lstat(bfn.c_str(), &st) != -1) {
      r = -errno;
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }
  return 0;

free_bluefs:
  ceph_assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}
int BlueStore::_open_bluefs(bool create, bool read_only)
{
  int r = _minimal_open_bluefs(create);
  if (r < 0) {
    return r;
  }
  BlueFSVolumeSelector* vselector = nullptr;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {

    string options = cct->_conf->bluestore_rocksdb_options;
    string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
	  *options.rbegin() != ',') {
	options += ',';
      }
      options += options_annex;
    }

    rocksdb::Options rocks_opts;
    r = RocksDBStore::ParseOptionsFromStringStatic(
      cct,
      options,
      rocks_opts,
      nullptr);
    if (r < 0) {
      return r;
    }
    if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
      vselector = new FitToFastVolumeSelector(
        bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
        bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
        bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
    } else {
      double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
      vselector =
        new RocksDBBlueFSVolumeSelector(
          bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
          bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
          bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
          1024 * 1024 * 1024, //FIXME: set expected l0 size here
          rocks_opts.max_bytes_for_level_base,
          rocks_opts.max_bytes_for_level_multiplier,
          reserved_factor,
          cct->_conf->bluestore_volume_selection_reserved,
          cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
    }
  }
  if (create) {
    bluefs->mkfs(fsid, bluefs_layout);
  }
  bluefs->set_volume_selector(vselector);
  r = bluefs->mount();
  if (r < 0) {
    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
  }
  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
  return r;
}
void BlueStore::_close_bluefs(bool cold_close)
{
  bluefs->umount(cold_close);
  _minimal_close_bluefs();
}

void BlueStore::_minimal_close_bluefs()
{
  delete bluefs;
  bluefs = NULL;
}
int BlueStore::_is_bluefs(bool create, bool* ret)
{
  if (create) {
    *ret = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    int r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      *ret = true;
    } else if (s == "0") {
      *ret = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
	   << dendl;
      return -EIO;
    }
  }
  return 0;
}
/*
 * opens both DB and dependent super_meta, FreelistManager and allocator
 * in the proper order
 */
int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
{
  dout(0) << __func__ << " read-only:" << read_only
	  << " repair:" << to_repair << dendl;
  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  // open in read-only first to read FM list and init allocator
  // as they might be needed for some BlueFS procedures
  r = _open_db(false, false, true);
  if (r < 0)
    goto out_bdev;

  r = _open_super_meta();
  if (r < 0) {
    goto out_db;
  }

  r = _open_fm(nullptr, true);
  if (r < 0)
    goto out_db;

  r = _init_alloc();
  if (r < 0)
    goto out_fm;

  // Re-open in the proper mode(s).

  // Can't simply bypass second open for read-only mode as we need to
  // load allocated extents from bluefs into allocator.
  // And now it's time to do that
  //
  _close_db(true);
  r = _open_db(false, to_repair, read_only);
  if (r < 0) {
    goto out_alloc;
  }
  return 0;

out_alloc:
  _close_alloc();
out_fm:
  _close_fm();
 out_db:
  _close_db(read_only);
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
void BlueStore::_close_db_and_around(bool read_only)
{
  _close_db(read_only);
  _close_fm();
  _close_alloc();
  _close_bdev();
  _close_fsid();
  _close_path();
}
int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
{
  _kv_only = true;
  int r = _open_db_and_around(false, to_repair);
  if (r == 0) {
    *pdb = db;
  } else {
    *pdb = nullptr;
  }
  return r;
}

int BlueStore::close_db_environment()
{
  _close_db_and_around(false);
  return 0;
}
int BlueStore::_prepare_db_environment(bool create, bool read_only,
				       std::string* _fn, std::string* _kv_backend)
{
  int r;
  ceph_assert(!db);
  std::string& fn = *_fn;
  std::string& kv_backend = *_kv_backend;
  fn = path + "/db";
  std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  r = _is_bluefs(create, &do_bluefs);
  if (r < 0) {
    return r;
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;

  map<string,string> kv_options;
  // force separate wal dir for all new deployments.
  kv_options["separate_wal_dir"] = 1;
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }

    r = _open_bluefs(create, read_only);
    if (r < 0) {
      return r;
    }

    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env *a = new BlueRocksEnv(bluefs);
      rocksdb::Env *b = rocksdb::Env::Default();
      if (create) {
	string cmd = "rm -rf " + path + "/db " +
	  path + "/db.slow " +
	  path + "/db.wal";
	int r = system(cmd.c_str());
	(void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }
    BlueFSVolumeSelector::paths paths;
    bluefs->get_vselector_paths(fn, paths);

    if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
      // we have both block.db and block; tell rocksdb!
      // note: the second (last) size value doesn't really matter
      ostringstream db_paths;
      bool first = true;
      for (auto& p : paths) {
	if (!first) {
	  db_paths << " ";
	}
	first = false;
	db_paths << p.first << "," << p.second;
      }
      kv_options["db_paths"] = db_paths.str();
      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
    }

    if (create) {
      for (auto& p : paths) {
	env->CreateDir(p.first);
      }
      // Selectors don't provide wal path so far hence create explicitly
      env->CreateDir(fn + ".wal");
    } else {
      std::vector<std::string> res;
      // check for dir presence
      auto r = env->GetChildren(fn + ".wal", &res);
      if (r.IsNotFound()) {
	kv_options.erase("separate_wal_dir");
      }
    }
  } else {
    string walfn = path + "/db.wal";

    if (create) {
      int r = ::mkdir(fn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }

      // wal_dir, too!
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << walfn
	     << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }
    } else {
      struct stat st;
      int r = ::stat(walfn.c_str(), &st);
      if (r < 0 && errno == ENOENT) {
	kv_options.erase("separate_wal_dir");
      }
    }
  }

  db = KeyValueDB::create(cct,
			  kv_backend,
			  fn,
			  kv_options,
			  static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      _close_bluefs(read_only);
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db, freelist_type);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);
  return 0;
}
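// Example of the db_paths value built above (illustrative sizes): with a
// dedicated block.db plus a shared slow device, rocksdb is told something
// like "db,<95% of db size> db.slow,<95% of slow size>", which lets higher
// LSM levels spill onto the slow device.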
int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
{
  int r;
  ceph_assert(!(create && read_only));
  string options;
  string options_annex;
  stringstream err;
  string kv_dir_fn;
  string kv_backend;
  std::string sharding_def;
  r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
  if (r < 0) {
    derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
    return -EIO;
  }
  if (kv_backend == "rocksdb") {
    options = cct->_conf->bluestore_rocksdb_options;
    options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
	  *options.rbegin() != ',') {
	options += ',';
      }
      options += options_annex;
    }

    if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
      sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
    }
  }

  db->init(options);
  if (to_repair_db)
    return 0;
  if (create) {
    r = db->create_and_open(err, sharding_def);
  } else {
    // we pass in cf list here, but it is only used if the db already has
    // column families created.
    r = read_only ?
      db->open_read_only(err, sharding_def) :
      db->open(err, sharding_def);
  }
  if (r) {
    derr << __func__ << " erroring opening db: " << err.str() << dendl;
    _close_db(read_only);
    return -EIO;
  }
  dout(1) << __func__ << " opened " << kv_backend
	  << " path " << kv_dir_fn << " options " << options << dendl;
  return 0;
}
void BlueStore::_close_db(bool cold_close)
{
  ceph_assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    _close_bluefs(cold_close);
  }
}
void BlueStore::_dump_alloc_on_failure()
{
  auto dump_interval =
    cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
  if (dump_interval > 0 &&
      next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
    shared_alloc.a->dump();
    next_dump_on_bluefs_alloc_failure = ceph_clock_now();
    next_dump_on_bluefs_alloc_failure += dump_interval;
  }
}
int BlueStore::_open_collections()
{
  dout(10) << __func__ << dendl;
  collections_had_errors = false;
  ceph_assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      auto c = ceph::make_ref<Collection>(
	  this,
	  onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
	  buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
	  cid);
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      try {
        decode(c->cnode, p);
      } catch (ceph::buffer::error& e) {
        derr << __func__ << " failed to decode cnode, key:"
             << pretty_binary_string(it->key()) << dendl;
        return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c
	       << " " << c->cnode << dendl;
      _osr_attach(c.get());
      coll_map[cid] = c;
    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      collections_had_errors = true;
    }
  }
  return 0;
}
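// Collection keys under PREFIX_COLL are stringified coll_t values (e.g.
// "2.1f_head"); a key that fails cid.parse() is only flagged here, and
// _fsck_collections() below re-walks the prefix to count such keys as fsck
// errors.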
void BlueStore::_fsck_collections(int64_t* errors)
{
  if (collections_had_errors) {
    dout(10) << __func__ << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
    for (it->upper_bound(string());
         it->valid();
         it->next()) {
      coll_t cid;
      if (!cid.parse(it->key())) {
        derr << __func__ << " unrecognized collection " << it->key() << dendl;
        if (errors) {
          (*errors)++;
        }
      }
    }
  }
}
void BlueStore::_set_per_pool_omap()
{
  per_pool_omap = OMAP_BULK;
  bufferlist bl;
  db->get(PREFIX_SUPER, "per_pool_omap", &bl);
  if (bl.length()) {
    auto s = bl.to_str();
    if (s == stringify(OMAP_PER_POOL)) {
      per_pool_omap = OMAP_PER_POOL;
    } else {
      ceph_assert(s == stringify(OMAP_PER_PG));
      per_pool_omap = OMAP_PER_PG;
    }
    dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
  } else {
    dout(10) << __func__ << " per_pool_omap not present" << dendl;
  }
  _check_no_per_pg_or_pool_omap_alert();
}
void BlueStore::_open_statfs()
{
  osd_pools.clear();
  vstatfs.reset();

  bufferlist bl;
  int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
  if (r >= 0) {
    per_pool_stat_collection = false;
    if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
      auto it = bl.cbegin();
      vstatfs.decode(it);
      dout(10) << __func__ << " store_statfs is found" << dendl;
    } else {
      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
    }
    _check_legacy_statfs_alert();
  } else {
    per_pool_stat_collection = true;
    dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
    for (it->upper_bound(string());
	 it->valid();
	 it->next()) {

      uint64_t pool_id;
      int r = get_key_pool_stat(it->key(), &pool_id);
      ceph_assert(r == 0);

      bufferlist bl;
      bl = it->value();
      auto p = bl.cbegin();
      auto& st = osd_pools[pool_id];
      try {
        st.decode(p);
        vstatfs += st;

        dout(30) << __func__ << " pool " << pool_id
		 << " statfs " << st << dendl;
      } catch (ceph::buffer::error& e) {
        derr << __func__ << " failed to decode pool stats, key:"
             << pretty_binary_string(it->key()) << dendl;
      }
    }
  }
  dout(30) << __func__ << " statfs " << vstatfs << dendl;
}
int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
	   << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
           << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
	r = -errno;
	derr << __func__ << " failed to open " << epath << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
      // write the Transport ID of the NVMe device
      // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
      // where "0000:02:00.0" is the selector of a PCI device, see
      // the first column of "lspci -mm -n -D"
      string trid{"trtype:PCIe "};
      trid += "traddr:";
      trid += epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, trid.c_str(), trid.size());
      ceph_assert(r == static_cast<int>(trid.size()));
      dout(1) << __func__ << " created " << name << " symlink to "
              << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (size) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
	  S_ISREG(st.st_mode) && // if it is a regular file
	  st.st_size == 0) {     // and is 0 bytes
	r = ::ftruncate(fd, size);
	if (r < 0) {
	  r = -errno;
	  derr << __func__ << " failed to resize " << name << " file to "
	       << size << ": " << cpp_strerror(r) << dendl;
	  VOID_TEMP_FAILURE_RETRY(::close(fd));
	  return r;
	}

	if (cct->_conf->bluestore_block_preallocate_file) {
	  r = ::ceph_posix_fallocate(fd, 0, size);
	  if (r > 0) {
	    derr << __func__ << " failed to prefallocate " << name << " file to "
		 << size << ": " << cpp_strerror(r) << dendl;
	    VOID_TEMP_FAILURE_RETRY(::close(fd));
	    return -r;
	  }
	}
	dout(1) << __func__ << " resized " << name << " file to "
		<< byte_u_t(size) << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
	derr << __func__ << " failed to open " << name << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
    }
  }
  return 0;
}
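// Example (hypothetical device): with bluestore_block_path =
// "spdk:0000:02:00.0" the branch above writes
// "trtype:PCIe traddr:0000:02:00.0" into the symlinked file, which the SPDK
// NVMe driver parses as a transport id.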
int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;
  uint64_t reserved;
  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
	 << cct->_conf->osd_max_object_size << " > bluestore max "
	 << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
        r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
        if (r < 0) {
          derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
               << dendl;
          return r;
        }
        if (r > 0) {
          derr << __func__ << " fsck found " << r << " errors" << dendl;
          r = -EIO;
        }
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
	derr << __func__ << " expected bluestore, but type is " << type << dendl;
	return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
        return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
	   << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
				   cct->_conf->bluestore_block_size,
				   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
	cct->_conf->bluestore_block_wal_size,
	cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
	cct->_conf->bluestore_block_db_size,
	cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  // choose min_alloc_size
  if (cct->_conf->bluestore_min_alloc_size) {
    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
    } else {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
    }
  }
  _validate_bdev();

  // make sure min_alloc_size is power of 2 aligned.
  if (!isp2(min_alloc_size)) {
    derr << __func__ << " min_alloc_size 0x"
	 << std::hex << min_alloc_size << std::dec
	 << " is not power of 2 aligned!"
	 << dendl;
    r = -EINVAL;
    goto out_close_bdev;
  }

  r = _create_alloc();
  if (r < 0) {
    goto out_close_bdev;
  }

  reserved = _get_ondisk_reserved();
  shared_alloc.a->init_add_free(reserved,
    p2align(bdev->get_size(), min_alloc_size) - reserved);

  r = _open_db(true);
  if (r < 0)
    goto out_close_alloc;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    r = _open_fm(t, true);
    if (r < 0)
      goto out_close_db;
    {
      bufferlist bl;
      encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }
    {
      bufferlist bl;
      encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }
    {
      bufferlist bl;
      bl.append(stringify(OMAP_PER_PG));
      t->set(PREFIX_SUPER, "per_pool_omap", bl);
    }
    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_fm;

  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
  if (r < 0)
    goto out_close_fm;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_fm;
    }
  }

 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db(false);
 out_close_alloc:
  _close_alloc();
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }

  if (r == 0) {
    // indicate success by writing the 'mkfs_done' file
    r = write_meta("mkfs_done", "yes");
  }

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }
  return r;
}
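// Typical invocation (illustrative): mkfs() runs once during OSD creation
// (e.g. via `ceph-osd --mkfs` or `ceph-volume lvm prepare`); a second run
// short-circuits on the "mkfs_done" meta key checked above, so the call is
// idempotent.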
int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _open_db_and_around(true);
  if (r < 0)
    return r;

  if (id == BlueFS::BDEV_NEWWAL) {
    string p = path + "/block.wal";
    r = _setup_block_symlink_or_file("block.wal", dev_path,
				     cct->_conf->bluestore_block_wal_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
				 cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }

    bluefs_layout.dedicated_wal = true;
  } else if (id == BlueFS::BDEV_NEWDB) {
    string p = path + "/block.db";
    r = _setup_block_symlink_or_file("block.db", dev_path,
				     cct->_conf->bluestore_block_db_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
				 cct->_conf->bdev_enable_discard,
				 SUPER_RESERVED);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  }

  r = bluefs->prepare_new_device(id, bluefs_layout);
  ceph_assert(r == 0);

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }

  _close_db_and_around(true);
  return r;
}
int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
  int id)
{
  dout(10) << __func__ << " id:" << id << dendl;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  int r = _open_db_and_around(true);
  if (r < 0)
    return r;

  uint64_t used_space = 0;
  for (auto src_id : devs_source) {
    used_space += bluefs->get_used(src_id);
  }
  uint64_t target_free = bluefs->get_free(id);
  if (target_free < used_space) {
    derr << __func__
	 << " can't migrate, free space at target: " << target_free
	 << " is less than required space: " << used_space
	 << dendl;
    r = -ENOSPC;
    goto shutdown;
  }
  if (devs_source.count(BlueFS::BDEV_DB)) {
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    bluefs_layout.dedicated_wal = false;
  }
  r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (devs_source.count(BlueFS::BDEV_DB)) {
    r = unlink(string(path + "/block.db").c_str());
    ceph_assert(r == 0);
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    r = unlink(string(path + "/block.wal").c_str());
    ceph_assert(r == 0);
  }

shutdown:
  _close_db_and_around(true);
  return r;
}
int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
  int id,
  const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  int r = _open_db_and_around(true);
  if (r < 0)
    return r;

  string link_db;
  string link_wal;
  if (devs_source.count(BlueFS::BDEV_DB) &&
      bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
    link_db = path + "/block.db";
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    link_wal = path + "/block.wal";
    bluefs_layout.dedicated_wal = false;
  }

  size_t target_size = 0;
  string target_name;
  if (id == BlueFS::BDEV_NEWWAL) {
    target_name = "block.wal";
    target_size = cct->_conf->bluestore_block_wal_size;
    bluefs_layout.dedicated_wal = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
				 cct->_conf->bdev_enable_discard,
				 BDEV_LABEL_BLOCK_SIZE);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }
  } else if (id == BlueFS::BDEV_NEWDB) {
    target_name = "block.db";
    target_size = cct->_conf->bluestore_block_db_size;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
				 cct->_conf->bdev_enable_discard,
				 SUPER_RESERVED);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
  }

  bluefs->umount();
  bluefs->mount();

  r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);

  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (!link_db.empty()) {
    r = unlink(link_db.c_str());
    ceph_assert(r == 0);
  }
  if (!link_wal.empty()) {
    r = unlink(link_wal.c_str());
    ceph_assert(r == 0);
  }
  r = _setup_block_symlink_or_file(
    target_name,
    dev_path,
    target_size,
    true);
  ceph_assert(r == 0);
  dout(0) << __func__ << " success" << dendl;

shutdown:
  _close_db_and_around(true);
  return r;
}
string BlueStore::get_device_path(unsigned id)
{
  string res;
  if (id < BlueFS::MAX_BDEV) {
    switch (id) {
    case BlueFS::BDEV_WAL:
      res = path + "/block.wal";
      break;
    case BlueFS::BDEV_DB:
      if (id == bluefs_layout.shared_bdev) {
	res = path + "/block";
      } else {
	res = path + "/block.db";
      }
      break;
    case BlueFS::BDEV_SLOW:
      res = path + "/block";
      break;
    }
  }
  return res;
}
int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0) {
    derr << "unable to read label for " << path << ": "
	 << cpp_strerror(r) << dendl;
  } else {
    label.size = size;
    r = _write_bdev_label(cct, path, label);
    if (r < 0) {
      derr << "unable to write label for " << path << ": "
	   << cpp_strerror(r) << dendl;
    }
  }
  return r;
}
int BlueStore::expand_devices(ostream& out)
{
  int r = _open_db_and_around(true);
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  out << "Expanding DB/WAL..." << std::endl;
  for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
    if (devid == bluefs_layout.shared_bdev) {
      continue;
    }
    uint64_t size = bluefs->get_block_device_size(devid);
    if (size == 0) {
      // no bdev
      continue;
    }

    out << devid
	<< " : expanding " << " to 0x" << size
	<< std::dec << std::endl;
    string p = get_device_path(devid);
    const char* path = p.c_str();
    if (path == nullptr) {
      derr << devid
	   << ": can't find device path " << dendl;
      continue;
    }
    if (bluefs->bdev_support_label(devid)) {
      if (_set_bdev_label_size(p, size) >= 0) {
	out << devid
	    << " : size label updated to " << size
	    << std::endl;
      }
    }
  }
  uint64_t size0 = fm->get_size();
  uint64_t size = bdev->get_size();
  if (size0 < size) {
    out << bluefs_layout.shared_bdev
	<< " : expanding " << " from 0x" << std::hex
	<< size0 << " to 0x" << size << std::dec << std::endl;
    _write_out_fm_meta(size);
    if (bdev->supported_bdev_label()) {
      if (_set_bdev_label_size(path, size) >= 0) {
	out << bluefs_layout.shared_bdev
	    << " : size label updated to " << size
	    << std::endl;
      }
    }
    _close_db_and_around(true);

    // mount in read/write to sync expansion changes
    r = _mount();
    ceph_assert(r == 0);
    umount();
  } else {
    _close_db_and_around(true);
  }
  return r;
}
int BlueStore::dump_bluefs_sizes(ostream& out)
{
  int r = _open_db_and_around(true);
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  _close_db_and_around(true);
  return r;
}
void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t oold = onode_cache_shards.size();
  size_t bold = buffer_cache_shards.size();
  ceph_assert(num >= oold && num >= bold);
  onode_cache_shards.resize(num);
  buffer_cache_shards.resize(num);
  for (unsigned i = oold; i < num; ++i) {
    onode_cache_shards[i] =
      OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			      logger);
  }
  for (unsigned i = bold; i < num; ++i) {
    buffer_cache_shards[i] =
      BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			       logger);
  }
}
int BlueStore::_mount()
{
  dout(1) << __func__ << " path " << path << dendl;

  _kv_only = false;
  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
	 << cct->_conf->osd_max_object_size << " > bluestore max "
	 << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  int r = _open_db_and_around(false);
  if (r < 0) {
    return r;
  }

  r = _upgrade_super();
  if (r < 0) {
    goto out_db;
  }

  r = _open_collections();
  if (r < 0)
    goto out_db;

  r = _reload_logger();
  if (r < 0)
    goto out_coll;

  _kv_start();

  if (bdev->is_smr()) {
    _zoned_cleaner_start();
  }

  r = _deferred_replay();
  if (r < 0)
    goto out_stop;

  mempool_thread.init();

  if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
      cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {

    auto was_per_pool_omap = per_pool_omap;

    dout(1) << __func__ << " quick-fix on mount" << dendl;
    _fsck_on_open(FSCK_SHALLOW, true);

    //reread statfs
    //FIXME minor: replace with actual open/close?
    _check_legacy_statfs_alert();

    //set again as hopefully it has been fixed
    if (was_per_pool_omap != OMAP_PER_PG) {
      _set_per_pool_omap();
    }
  }

  mounted = true;
  return 0;

 out_stop:
  if (bdev->is_smr()) {
    _zoned_cleaner_stop();
  }
  _kv_stop();
 out_coll:
  _shutdown_cache();
 out_db:
  _close_db_and_around(false);
  return r;
}
int BlueStore::umount()
{
  ceph_assert(_kv_only || mounted);
  dout(1) << __func__ << dendl;

  _osr_drain_all();

  mounted = false;

  if (!_kv_only) {
    mempool_thread.shutdown();
    if (bdev->is_smr()) {
      dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
      _zoned_cleaner_stop();
    }
    dout(20) << __func__ << " stopping kv thread" << dendl;
    _kv_stop();
    _shutdown_cache();
    dout(20) << __func__ << " closing" << dendl;
  }
  _close_db_and_around(false);

  if (cct->_conf->bluestore_fsck_on_umount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }
  return 0;
}
int BlueStore::cold_open()
{
  return _open_db_and_around(true);
}

int BlueStore::cold_close()
{
  _close_db_and_around(true);
  return 0;
}
// derr wrapper to limit enormous output and avoid log flooding.
// Of limited use where such output is expected for now
#define fsck_derr(err_cnt, threshold) \
  if (err_cnt <= threshold) { \
    bool need_skip_print = err_cnt == threshold; \
    derr

#define fsck_dendl \
    dendl; \
    if (need_skip_print) \
      derr << "more error lines skipped..." << dendl; \
  }
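// NOTE (illustration only, not part of the build): the two macros above
// expand into a single if-block, so they must always be used as a pair
// bracketing one streamed message. A minimal usage sketch, assuming a local
// error counter:
#if 0
  int64_t errors = 0;
  // ... on each detected problem:
  ++errors;
  fsck_derr(errors, MAX_FSCK_ERROR_LINES)
    << "fsck error: <description>" << fsck_dendl;
  // once 'errors' passes the threshold, further messages are suppressed and
  // a single "more error lines skipped..." line is emitted instead.
#endif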
int _fsck_sum_extents(
  const PExtentVector& extents,
  bool compressed,
  store_statfs_t& expected_statfs)
{
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
  }
  return 0;
}
int BlueStore::_fsck_check_extents(
  const coll_t& cid,
  const ghobject_t& oid,
  const PExtentVector& extents,
  bool compressed,
  mempool_dynamic_bitset &used_blocks,
  uint64_t granularity,
  BlueStoreRepairer* repairer,
  store_statfs_t& expected_statfs,
  FSCKDepth depth)
{
  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
  int errors = 0;
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
    if (depth != FSCK_SHALLOW) {
      bool already = false;
      apply_for_bitset_range(
	e.offset, e.length, granularity, used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  if (bs.test(pos)) {
	    if (repairer) {
	      repairer->note_misreference(
		pos * min_alloc_size, min_alloc_size, !already);
	    }
	    if (!already) {
	      derr << "fsck error: " << oid << " extent " << e
		   << " or a subset is already allocated (misreferenced)"
		   << dendl;
	      ++errors;
	      already = true;
	    }
	  } else {
	    bs.set(pos);
	  }
	});
      if (repairer) {
	repairer->set_space_used(e.offset, e.length, cid, oid);
      }

      if (e.end() > bdev->get_size()) {
	derr << "fsck error: " << oid << " extent " << e
	     << " past end of block device" << dendl;
	++errors;
      }
    }
  }
  return errors;
}
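// The misreference check above hinges on a shared bitset of allocation
// units: the first onode to reference a unit sets its bit, and any later
// reference to an already-set bit is a misreference. A self-contained
// sketch of the same idea (plain std::vector<bool> instead of
// mempool_dynamic_bitset; names and sizes are illustrative only):
#if 0
#include <cstdint>
#include <vector>

// Mark the allocation units covered by [off, off+len) and report whether
// any of them had already been claimed by an earlier extent.
bool mark_or_detect(std::vector<bool>& used, uint64_t off, uint64_t len,
		    uint64_t au /* allocation unit size */)
{
  bool misreferenced = false;
  for (uint64_t pos = off / au; pos <= (off + len - 1) / au; ++pos) {
    if (used[pos]) {
      misreferenced = true;  // someone already claimed this unit
    } else {
      used[pos] = true;      // first (legitimate) owner
    }
  }
  return misreferenced;
}
#endif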
void BlueStore::_fsck_check_pool_statfs(
  BlueStore::per_pool_statfs& expected_pool_statfs,
  int64_t& errors,
  int64_t& warnings,
  BlueStoreRepairer* repairer)
{
  auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
	if (repairer) {
	  ++errors;
	  repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
	  derr << "fsck error: " << "legacy statfs record found, removing"
	       << dendl;
	}
	continue;
      }
      int64_t pool_id;
      if (get_key_pool_stat(key, &pool_id) < 0) {
	derr << "fsck error: bad key " << key
	     << "in statfs namespece" << dendl;
	if (repairer) {
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	++errors;
	continue;
      }

      volatile_statfs vstatfs;
      bufferlist bl = it->value();
      auto blp = bl.cbegin();
      try {
	vstatfs.decode(blp);
      } catch (ceph::buffer::error& e) {
	derr << "fsck error: failed to decode Pool StatFS record"
	     << pretty_binary_string(key) << dendl;
	if (repairer) {
	  dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
		   << pretty_binary_string(key)
		   << "', removing" << dendl;
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	++errors;
	vstatfs.reset();
      }
      auto stat_it = expected_pool_statfs.find(pool_id);
      if (stat_it == expected_pool_statfs.end()) {
	if (vstatfs.is_empty()) {
	  // we don't consider that as an error since empty pool statfs
	  // are left in DB for now
	  dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
		   << std::hex << pool_id << std::dec << dendl;
	  if (repairer) {
	    // but we need to increment error count in case of repair
	    // to have proper counters at the end
	    // (as repairer increments recovery counter anyway).
	    ++errors;
	  }
	} else {
	  derr << "fsck error: found stray Pool StatFS record for pool id 0x"
	       << std::hex << pool_id << std::dec << dendl;
	  ++errors;
	}
	if (repairer) {
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	continue;
      }
      store_statfs_t statfs;
      vstatfs.publish(&statfs);
      if (!(stat_it->second == statfs)) {
	derr << "fsck error: actual " << statfs
	     << " != expected " << stat_it->second
	     << " for pool 0x"
	     << std::hex << pool_id << std::dec << dendl;
	if (repairer) {
	  repairer->fix_statfs(db, key, stat_it->second);
	}
	++errors;
      }
      expected_pool_statfs.erase(stat_it);
    }
  } // if (it)
  for (auto& s : expected_pool_statfs) {
    if (s.second.is_zero()) {
      // we might lack empty statfs recs in DB
      continue;
    }
    derr << "fsck error: missing Pool StatFS record for pool "
	 << std::hex << s.first << std::dec << dendl;
    if (repairer) {
      string key;
      get_pool_stat_key(s.first, &key);
      repairer->fix_statfs(db, key, s.second);
    }
    ++errors;
  }
  if (!per_pool_stat_collection &&
      repairer) {
    // by virtue of running this method, we correct the top-level
    // error of having global stats
    repairer->inc_repaired();
  }
}
BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
  BlueStore::FSCKDepth depth,
  int64_t pool_id,
  BlueStore::CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& value,
  mempool::bluestore_fsck::list<string>* expecting_shards,
  map<BlobRef, bluestore_blob_t::unused_t>* referenced,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& num_objects = ctx.num_objects;
  auto& num_extents = ctx.num_extents;
  auto& num_blobs = ctx.num_blobs;
  auto& num_sharded_objects = ctx.num_sharded_objects;
  auto& num_spanning_blobs = ctx.num_spanning_blobs;
  auto used_blocks = ctx.used_blocks;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
    &ctx.expected_pool_statfs[pool_id] :
    &ctx.expected_store_statfs;

  dout(10) << __func__ << "  " << oid << dendl;
  OnodeRef o;
  o.reset(Onode::decode(c, oid, key, value));
  ++num_objects;

  num_spanning_blobs += o->extent_map.spanning_blob_map.size();

  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  _dump_onode<30>(cct, *o);
  // shards
  if (!o->extent_map.shards.empty()) {
    ++num_sharded_objects;
    if (depth != FSCK_SHALLOW) {
      ceph_assert(expecting_shards);
      for (auto& s : o->extent_map.shards) {
	dout(20) << __func__ << "    shard " << *s.shard_info << dendl;
	expecting_shards->push_back(string());
	get_extent_shard_key(o->key, s.shard_info->offset,
			     &expecting_shards->back());
	if (s.shard_info->offset >= o->onode.size) {
	  derr << "fsck error: " << oid << " shard 0x" << std::hex
	       << s.shard_info->offset << " past EOF at 0x" << o->onode.size
	       << std::dec << dendl;
	  ++errors;
	}
      }
    }
  }

  // lextents
  uint64_t pos = 0;
  mempool::bluestore_fsck::map<BlobRef,
			       bluestore_blob_use_tracker_t> ref_map;
  for (auto& l : o->extent_map.extent_map) {
    dout(20) << __func__ << "    " << l << dendl;
    if (l.logical_offset < pos) {
      derr << "fsck error: " << oid << " lextent at 0x"
	   << std::hex << l.logical_offset
	   << " overlaps with the previous, which ends at 0x" << pos
	   << std::dec << dendl;
      ++errors;
    }
    if (depth != FSCK_SHALLOW &&
	o->extent_map.spans_shard(l.logical_offset, l.length)) {
      derr << "fsck error: " << oid << " lextent at 0x"
	   << std::hex << l.logical_offset << "~" << l.length
	   << " spans a shard boundary"
	   << std::dec << dendl;
      ++errors;
    }
    pos = l.logical_offset + l.length;
    res_statfs->data_stored += l.length;
    ceph_assert(l.blob);
    const bluestore_blob_t& blob = l.blob->get_blob();

    auto& ref = ref_map[l.blob];
    if (ref.is_empty()) {
      uint32_t min_release_size = blob.get_release_size(min_alloc_size);
      uint32_t l = blob.get_logical_length();
      ref.init(l, min_release_size);
    }
    ref.get(
      l.blob_offset,
      l.length);
    ++num_extents;
    if (depth != FSCK_SHALLOW &&
	blob.has_unused()) {
      ceph_assert(referenced);
      auto p = referenced->find(l.blob);
      bluestore_blob_t::unused_t* pu;
      if (p == referenced->end()) {
	pu = &(*referenced)[l.blob];
      } else {
	pu = &p->second;
      }
      uint64_t blob_len = blob.get_logical_length();
      ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
      ceph_assert(l.blob_offset + l.length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
      uint64_t start = l.blob_offset / chunk_size;
      uint64_t end =
	round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
	(*pu) |= (1u << i);
      }
    }
  } //for (auto& l : o->extent_map.extent_map)

  for (auto& i : ref_map) {
    ++num_blobs;
    const bluestore_blob_t& blob = i.first->get_blob();
    bool equal =
      depth == FSCK_SHALLOW ? true :
      i.first->get_blob_use_tracker().equal(i.second);
    if (!equal) {
      derr << "fsck error: " << oid << " blob " << *i.first
	   << " doesn't match expected ref_map " << i.second << dendl;
      ++errors;
    }
    if (blob.is_compressed()) {
      res_statfs->data_compressed += blob.get_compressed_payload_length();
      res_statfs->data_compressed_original +=
	i.first->get_referenced_bytes();
    }
    if (blob.is_shared()) {
      if (i.first->shared_blob->get_sbid() > blobid_max) {
	derr << "fsck error: " << oid << " blob " << blob
	     << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
	     << blobid_max << dendl;
	++errors;
      } else if (i.first->shared_blob->get_sbid() == 0) {
	derr << "fsck error: " << oid << " blob " << blob
	     << " marked as shared but has uninitialized sbid"
	     << dendl;
	++errors;
      }
      // the below lock is optional and provided in multithreading mode only
      if (sb_info_lock) {
	sb_info_lock->lock();
      }
      sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
      ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
      ceph_assert(sbi.pool_id == INT64_MIN ||
		  sbi.pool_id == oid.hobj.get_logical_pool());
      sbi.cid = c->cid;
      sbi.pool_id = oid.hobj.get_logical_pool();
      sbi.sb = i.first->shared_blob;
      sbi.oids.push_back(oid);
      sbi.compressed = blob.is_compressed();
      for (auto e : blob.get_extents()) {
	if (e.is_valid()) {
	  sbi.ref_map.get(e.offset, e.length);
	}
      }
      if (sb_info_lock) {
	sb_info_lock->unlock();
      }
    } else if (depth != FSCK_SHALLOW) {
      ceph_assert(used_blocks);
      errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
				    blob.is_compressed(),
				    *used_blocks,
				    fm->get_alloc_size(),
				    repairer,
				    *res_statfs,
				    depth);
    } else {
      errors += _fsck_sum_extents(
	blob.get_extents(),
	blob.is_compressed(),
	*res_statfs);
    }
  } // for (auto& i : ref_map)

  {
    auto& sbm = o->extent_map.spanning_blob_map;
    size_t broken = 0;
    BlobRef first_broken;
    for (auto it = sbm.begin(); it != sbm.end();) {
      auto it1 = it++;
      if (ref_map.count(it1->second) == 0) {
	if (!broken) {
	  first_broken = it1->second;
	  ++errors;
	}
	broken++;
	if (repairer) {
	  sbm.erase(it1);
	}
      }
    }
    if (broken) {
      derr << "fsck error: " << oid << " - " << broken
	   << " zombie spanning blob(s) found, the first one: "
	   << *first_broken << dendl;
      if (repairer) {
	repairer->fix_spanning_blobs(
	  db,
	  [&](KeyValueDB::Transaction txn) {
	    _record_onode(o, txn);
	  });
      }
    }
  }

  if (o->onode.has_omap()) {
    _fsck_check_object_omap(depth, o, ctx);
  }

  return o;
}
#include "common/WorkQueue.h"

class ShallowFSCKThreadPool : public ThreadPool
{
public:
  ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
    ThreadPool(cct_, nm, tn, n) {
  }
  void worker(ThreadPool::WorkThread* wt) override {
    int next_wq = 0;
    while (!_stop) {
      next_wq %= work_queues.size();
      WorkQueue_ *wq = work_queues[next_wq++];

      void* item = wq->_void_dequeue();
      if (item) {
	processing++;
	TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
	wq->_void_process(item, tp_handle);
	processing--;
      }
    }
  }
  template <size_t BatchLen>
  struct FSCKWorkQueue : public ThreadPool::WorkQueue_
  {
    struct Entry {
      int64_t pool_id;
      BlueStore::CollectionRef c;
      ghobject_t oid;
      string key;
      bufferlist value;
    };
    struct Batch {
      std::atomic<size_t> running = { 0 };
      size_t entry_count = 0;
      std::array<Entry, BatchLen> entries;

      int64_t errors = 0;
      int64_t warnings = 0;
      uint64_t num_objects = 0;
      uint64_t num_extents = 0;
      uint64_t num_blobs = 0;
      uint64_t num_sharded_objects = 0;
      uint64_t num_spanning_blobs = 0;
      store_statfs_t expected_store_statfs;
      BlueStore::per_pool_statfs expected_pool_statfs;
    };

    size_t batchCount = 0;
    BlueStore* store = nullptr;

    ceph::mutex* sb_info_lock = nullptr;
    BlueStore::sb_info_map_t* sb_info = nullptr;
    BlueStoreRepairer* repairer = nullptr;

    Batch* batches = nullptr;
    size_t last_batch_pos = 0;
    bool batch_acquired = false;

    FSCKWorkQueue(std::string n,
		  size_t _batchCount,
		  BlueStore* _store,
		  ceph::mutex* _sb_info_lock,
		  BlueStore::sb_info_map_t& _sb_info,
		  BlueStoreRepairer* _repairer) :
      WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
      batchCount(_batchCount),
      store(_store),
      sb_info_lock(_sb_info_lock),
      sb_info(&_sb_info),
      repairer(_repairer)
    {
      batches = new Batch[batchCount];
    }
    ~FSCKWorkQueue() {
      delete[] batches;
    }
    /// Remove all work items from the queue.
    void _clear() override {
      //do nothing
    }

    /// Check whether there is anything to do.
    bool _empty() override {
      ceph_assert(false);
    }

    /// Get the next work item to process.
    void* _void_dequeue() override {
      size_t pos = rand() % batchCount;
      size_t pos0 = pos;
      do {
	auto& batch = batches[pos];
	if (batch.running.fetch_add(1) == 0) {
	  if (batch.entry_count) {
	    return &batch;
	  }
	}
	batch.running--;
	pos++;
	pos %= batchCount;
      } while (pos != pos0);
      return nullptr;
    }
    /** @brief Process the work item.
     * This function will be called several times in parallel
     * and must therefore be thread-safe. */
    void _void_process(void* item, TPHandle& handle) override {
      Batch* batch = (Batch*)item;

      BlueStore::FSCK_ObjectCtx ctx(
	batch->errors,
	batch->warnings,
	batch->num_objects,
	batch->num_extents,
	batch->num_blobs,
	batch->num_sharded_objects,
	batch->num_spanning_blobs,
	nullptr, // used_blocks
	nullptr, //used_omap_head
	sb_info_lock,
	*sb_info,
	batch->expected_store_statfs,
	batch->expected_pool_statfs,
	repairer);

      for (size_t i = 0; i < batch->entry_count; i++) {
	auto& entry = batch->entries[i];

	store->fsck_check_objects_shallow(
	  BlueStore::FSCK_SHALLOW,
	  entry.pool_id,
	  entry.c,
	  entry.oid,
	  entry.key,
	  entry.value,
	  nullptr, // expecting_shards - this will need a protection if passed
	  nullptr, // referenced
	  ctx);
      }
      //std::cout << "processed " << batch << std::endl;
      batch->entry_count = 0;
      batch->running--;
    }
    /** @brief Synchronously finish processing a work item.
     * This function is called after _void_process with the global thread pool lock held,
     * so at most one copy will execute simultaneously for a given thread pool.
     * It can be used for non-thread-safe finalization. */
    void _void_process_finish(void*) override {
      ceph_assert(false);
    }

    bool queue(
      int64_t pool_id,
      BlueStore::CollectionRef c,
      const ghobject_t& oid,
      const string& key,
      const bufferlist& value) {
      bool res = false;
      size_t pos0 = last_batch_pos;
      if (!batch_acquired) {
	do {
	  auto& batch = batches[last_batch_pos];
	  if (batch.running.fetch_add(1) == 0) {
	    if (batch.entry_count < BatchLen) {
	      batch_acquired = true;
	      break;
	    }
	  }
	  batch.running.fetch_sub(1);
	  last_batch_pos++;
	  last_batch_pos %= batchCount;
	} while (last_batch_pos != pos0);
      }
      if (batch_acquired) {
	auto& batch = batches[last_batch_pos];
	ceph_assert(batch.running);
	ceph_assert(batch.entry_count < BatchLen);

	auto& entry = batch.entries[batch.entry_count];
	entry.pool_id = pool_id;
	entry.c = c;
	entry.oid = oid;
	entry.key = key;
	entry.value = value;

	++batch.entry_count;
	if (batch.entry_count == BatchLen) {
	  batch_acquired = false;
	  batch.running.fetch_sub(1);
	  last_batch_pos++;
	  last_batch_pos %= batchCount;
	}
	res = true;
      }
      return res;
    }
    void finalize(ThreadPool& tp,
		  BlueStore::FSCK_ObjectCtx& ctx) {
      if (batch_acquired) {
	auto& batch = batches[last_batch_pos];
	ceph_assert(batch.running);
	batch.running.fetch_sub(1);
      }
      tp.stop();

      for (size_t i = 0; i < batchCount; i++) {
	auto& batch = batches[i];

	//process leftovers if any
	if (batch.entry_count) {
	  TPHandle tp_handle(store->cct,
			     nullptr,
			     timeout_interval,
			     suicide_interval);
	  ceph_assert(batch.running == 0);

	  batch.running++; // just to be on-par with the regular call
	  _void_process(&batch, tp_handle);
	}
	ceph_assert(batch.entry_count == 0);

	ctx.errors += batch.errors;
	ctx.warnings += batch.warnings;
	ctx.num_objects += batch.num_objects;
	ctx.num_extents += batch.num_extents;
	ctx.num_blobs += batch.num_blobs;
	ctx.num_sharded_objects += batch.num_sharded_objects;
	ctx.num_spanning_blobs += batch.num_spanning_blobs;

	ctx.expected_store_statfs.add(batch.expected_store_statfs);

	for (auto it = batch.expected_pool_statfs.begin();
	     it != batch.expected_pool_statfs.end();
	     it++) {
	  ctx.expected_pool_statfs[it->first].add(it->second);
	}
      }
    }
  };
};
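// The batch hand-off in FSCKWorkQueue is lock-free: a batch is claimed by
// atomically bumping its 'running' counter, and only the caller that moves
// it 0 -> 1 owns the slot. A minimal sketch of that claim protocol in
// isolation (illustration only, not part of the build):
#if 0
#include <atomic>

struct Slot { std::atomic<size_t> running{0}; };

// Returns true iff we obtained exclusive ownership of the slot; the owner
// must later drop 'running' back via fetch_sub(1).
bool try_claim(Slot& s)
{
  if (s.running.fetch_add(1) == 0) {
    return true;            // we moved 0 -> 1: exclusive owner
  }
  s.running.fetch_sub(1);   // lost the race; undo and try another slot
  return false;
}
#endif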
void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
  OnodeRef& o,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& warnings = ctx.warnings;
  auto repairer = ctx.repairer;

  ceph_assert(o->onode.has_omap());
  if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
    if (per_pool_omap == OMAP_PER_POOL) {
      fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	<< "fsck error: " << o->oid
	<< " has omap that is not per-pool or pgmeta"
	<< fsck_dendl;
      ++errors;
    } else {
      const char* w;
      int64_t num;
      if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
	++errors;
	num = errors;
	w = "error";
      } else {
	++warnings;
	num = warnings;
	w = "warning";
      }
      fsck_derr(num, MAX_FSCK_ERROR_LINES)
	<< "fsck " << w << ": " << o->oid
	<< " has omap that is not per-pool or pgmeta"
	<< fsck_dendl;
    }
  } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
    if (per_pool_omap == OMAP_PER_PG) {
      fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	<< "fsck error: " << o->oid
	<< " has omap that is not per-pg or pgmeta"
	<< fsck_dendl;
      ++errors;
    } else {
      const char* w;
      int64_t num;
      if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
	++errors;
	num = errors;
	w = "error";
      } else {
	++warnings;
	num = warnings;
	w = "warning";
      }
      fsck_derr(num, MAX_FSCK_ERROR_LINES)
	<< "fsck " << w << ": " << o->oid
	<< " has omap that is not per-pg or pgmeta"
	<< fsck_dendl;
    }
  }
  if (repairer &&
      !o->onode.is_perpg_omap() &&
      !o->onode.is_pgmeta_omap()) {
    dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
    bufferlist h;
    map<string, bufferlist> kv;
    int r = _onode_omap_get(o, &h, &kv);
    if (r < 0) {
      derr << " got " << r << " " << cpp_strerror(r) << dendl;
    } else {
      KeyValueDB::Transaction txn = db->get_transaction();
      // remove old keys
      const string& old_omap_prefix = o->get_omap_prefix();
      string old_head, old_tail;
      o->get_omap_header(&old_head);
      o->get_omap_tail(&old_tail);
      txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
      txn->rmkey(old_omap_prefix, old_tail);
      // set flag
      o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
      _record_onode(o, txn);
      const string& new_omap_prefix = o->get_omap_prefix();
      // head
      if (h.length()) {
	string new_head;
	o->get_omap_header(&new_head);
	txn->set(new_omap_prefix, new_head, h);
      }
      // tail
      {
	string new_tail;
	o->get_omap_tail(&new_tail);
	bufferlist empty;
	txn->set(new_omap_prefix, new_tail, empty);
      }
      // values
      string final_key;
      o->get_omap_key(string(), &final_key);
      size_t base_key_len = final_key.size();
      for (auto& i : kv) {
	final_key.resize(base_key_len);
	final_key += i.first;
	txn->set(new_omap_prefix, final_key, i.second);
      }
      db->submit_transaction_sync(txn);
      repairer->inc_repaired();
    }
  }
}
void BlueStore::_fsck_check_objects(FSCKDepth depth,
  BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  uint64_t_btree_t used_nids;

  size_t processed_myself = 0;

  auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
  mempool::bluestore_fsck::list<string> expecting_shards;
  if (it) {
    const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
    typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
    std::unique_ptr<WQ> wq(
      new WQ(
	"FSCKWorkQueue",
	(thread_count ? : 1) * 32,
	this,
	sb_info_lock,
	sb_info,
	repairer));

    ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);

    thread_pool.add_work_queue(wq.get());
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      //not the best place but let's check anyway
      ceph_assert(sb_info_lock);
      thread_pool.start();
    }

    //fill global if not overriden below
    CollectionRef c;
    int64_t pool_id = -1;
    spg_t pgid;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      dout(30) << __func__ << " key "
	       << pretty_binary_string(it->key()) << dendl;
      if (is_extent_shard_key(it->key())) {
	if (depth == FSCK_SHALLOW) {
	  continue;
	}
	while (!expecting_shards.empty() &&
	       expecting_shards.front() < it->key()) {
	  derr << "fsck error: missing shard key "
	       << pretty_binary_string(expecting_shards.front())
	       << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	}
	if (!expecting_shards.empty() &&
	    expecting_shards.front() == it->key()) {
	  // all good
	  expecting_shards.pop_front();
	  continue;
	}

	uint32_t offset;
	string okey;
	get_key_extent_shard(it->key(), &okey, &offset);
	derr << "fsck error: stray shard 0x" << std::hex << offset
	     << std::dec << dendl;
	if (expecting_shards.empty()) {
	  derr << "fsck error: " << pretty_binary_string(it->key())
	       << " is unexpected" << dendl;
	  ++errors;
	  continue;
	}
	while (expecting_shards.front() > it->key()) {
	  derr << "fsck error:   saw " << pretty_binary_string(it->key())
	       << dendl;
	  derr << "fsck error:   exp "
	       << pretty_binary_string(expecting_shards.front()) << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	  if (expecting_shards.empty()) {
	    break;
	  }
	}
	continue;
      }

      ghobject_t oid;
      int r = get_key_object(it->key(), &oid);
      if (r < 0) {
	derr << "fsck error: bad object key "
	     << pretty_binary_string(it->key()) << dendl;
	++errors;
	continue;
      }
      if (!c ||
	  oid.shard_id != pgid.shard ||
	  oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
	  !c->contains(oid)) {
	c = nullptr;
	for (auto& p : coll_map) {
	  if (p.second->contains(oid)) {
	    c = p.second;
	    break;
	  }
	}
	if (!c) {
	  derr << "fsck error: stray object " << oid
	       << " not owned by any collection" << dendl;
	  ++errors;
	  continue;
	}
	pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
	dout(20) << __func__ << "  collection " << c->cid << " " << c->cnode
		 << dendl;
      }

      if (depth != FSCK_SHALLOW &&
	  !expecting_shards.empty()) {
	for (auto& k : expecting_shards) {
	  derr << "fsck error: missing shard key "
	       << pretty_binary_string(k) << dendl;
	}
	++errors;
	expecting_shards.clear();
      }

      bool queued = false;
      if (depth == FSCK_SHALLOW && thread_count > 0) {
	queued = wq->queue(
	  pool_id,
	  c,
	  oid,
	  it->key(),
	  it->value());
      }
      OnodeRef o;
      map<BlobRef, bluestore_blob_t::unused_t> referenced;

      if (!queued) {
	++processed_myself;

	o = fsck_check_objects_shallow(
	  depth,
	  pool_id,
	  c,
	  oid,
	  it->key(),
	  it->value(),
	  &expecting_shards,
	  &referenced,
	  ctx);
      }

      if (depth != FSCK_SHALLOW) {
	ceph_assert(o != nullptr);
	if (o->onode.nid) {
	  if (o->onode.nid > nid_max) {
	    derr << "fsck error: " << oid << " nid " << o->onode.nid
		 << " > nid_max " << nid_max << dendl;
	    ++errors;
	  }
	  if (used_nids.count(o->onode.nid)) {
	    derr << "fsck error: " << oid << " nid " << o->onode.nid
		 << " already in use" << dendl;
	    ++errors;
	    // not much we can do but move on
	    continue; // go for next object
	  }
	  used_nids.insert(o->onode.nid);
	}
	for (auto& i : referenced) {
	  dout(20) << __func__ << " referenced 0x" << std::hex << i.second
		   << std::dec << " for " << *i.first << dendl;
	  const bluestore_blob_t& blob = i.first->get_blob();
	  if (i.second & blob.unused) {
	    derr << "fsck error: " << oid << " blob claims unused 0x"
		 << std::hex << blob.unused
		 << " but extents reference 0x" << i.second << std::dec
		 << " on blob " << *i.first << dendl;
	    ++errors;
	  }
	  if (blob.has_csum()) {
	    uint64_t blob_len = blob.get_logical_length();
	    uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
	    unsigned csum_count = blob.get_csum_count();
	    unsigned csum_chunk_size = blob.get_csum_chunk_size();
	    for (unsigned p = 0; p < csum_count; ++p) {
	      unsigned pos = p * csum_chunk_size;
	      unsigned firstbit = pos / unused_chunk_size;    // [firstbit,lastbit]
	      unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
	      unsigned mask = 1u << firstbit;
	      for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
		mask |= 1u << b;
	      }
	      if ((blob.unused & mask) == mask) {
		// this csum chunk region is marked unused
		if (blob.get_csum_item(p) != 0) {
		  derr << "fsck error: " << oid
		       << " blob claims csum chunk 0x" << std::hex << pos
		       << "~" << csum_chunk_size
		       << " is unused (mask 0x" << mask << " of unused 0x"
		       << blob.unused << ") but csum is non-zero 0x"
		       << blob.get_csum_item(p) << std::dec << " on blob "
		       << *i.first << dendl;
		  ++errors;
		}
	      }
	    }
	  }
	}
	// omap
	if (o->onode.has_omap()) {
	  ceph_assert(ctx.used_omap_head);
	  if (ctx.used_omap_head->count(o->onode.nid)) {
	    derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
		 << " already in use" << dendl;
	    ++errors;
	  } else {
	    ctx.used_omap_head->insert(o->onode.nid);
	  }
	} // if (o->onode.has_omap())
	if (depth == FSCK_DEEP) {
	  bufferlist bl;
	  uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
	  uint64_t offset = 0;
	  do {
	    uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
	    int r = _do_read(c.get(), o, offset, l, bl,
			     CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
	    if (r < 0) {
	      ++errors;
	      derr << "fsck error: " << oid << std::hex
		   << " error during read: "
		   << " " << offset << "~" << l
		   << " " << cpp_strerror(r) << std::dec
		   << dendl;
	      break;
	    }
	    offset += l;
	  } while (offset < o->onode.size);
	} // deep
      } //if (depth != FSCK_SHALLOW)
    } // for (it->lower_bound(string()); it->valid(); it->next())
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      wq->finalize(thread_pool, ctx);
      if (processed_myself) {
	// may be needs more threads?
	dout(0) << __func__ << " partial offload"
		<< ", done myself " << processed_myself
		<< " of " << ctx.num_objects
		<< "objects, threads " << thread_count
		<< dendl;
      }
    }
  } // if (it)
}
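// A worked example of the unused-bitmap arithmetic in the deep check above
// (illustrative figures): with a 256 KiB blob and a 64-bit 'unused' field,
// each bit covers 256 KiB / 64 = 4 KiB. A csum chunk at pos = 24 KiB with
// csum_chunk_size = 8 KiB spans bits [24K/4K, (24K+8K-1)/4K] = [6, 7], so
// mask = (1u << 6) | (1u << 7) = 0xc0. The chunk counts as unused only if
// (unused & 0xc0) == 0xc0, and a non-zero csum for it is then flagged as a
// fsck error.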
/**
An overview for currently implemented repair logics
performed in fsck in two stages: detection(+preparation) and commit.
Detection stage (in processing order):
  (Issue -> Repair action to schedule)
  - Detect undecodable keys for Shared Blobs -> Remove
  - Detect undecodable records for Shared Blobs -> Remove
    (might trigger missed Shared Blob detection below)
  - Detect stray records for Shared Blobs -> Remove
  - Detect misreferenced pextents -> Fix
    Prepare Bloom-like filter to track cid/oid -> pextent
    Prepare list of extents that are improperly referenced
    Enumerate Onode records that might use 'misreferenced' pextents
    (Bloom-like filter applied to reduce computation)
      Per each questionable Onode enumerate all blobs and identify broken ones
      (i.e. blobs having 'misreferences')
      Rewrite each broken blob data by allocating another extents and
      copying data there
      If blob is shared - unshare it and mark corresponding Shared Blob
      record as invalid.
      Release previously allocated space
  - Detect missed Shared Blobs -> Recreate
  - Detect undecodable deferred transaction -> Remove
  - Detect Freelist Manager's 'false free' entries -> Mark as used
  - Detect Freelist Manager's leaked entries -> Mark as free
  - Detect statfs inconsistency -> Update
Commit stage (separate DB commit per each step):
  - Apply leaked FM entries fix
  - Apply 'false free' FM entries fix
  - Apply 'Remove' actions
  - Apply fix for misreference pextents
  - Apply Shared Blob recreate
    (can be merged with the step above if misreferences were detected)
  - Apply StatFS update
*/
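// Both stages run under a single BlueStore::_fsck(depth, repair) call; from
// the command line the usual entry points (outside this file, assuming a
// stopped OSD) are:
//
//   ceph-bluestore-tool fsck   --path /var/lib/ceph/osd/ceph-0 [--deep]
//   ceph-bluestore-tool repair --path /var/lib/ceph/osd/ceph-0
//
// where 'fsck' performs detection only and 'repair' additionally applies
// the commit stage described above.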
int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
{
  dout(5) << __func__
	  << (repair ? " repair" : " check")
	  << (depth == FSCK_DEEP ? " (deep)" :
	      depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
	  << dendl;

  // in deep mode we need R/W write access to be able to replay deferred ops
  const bool read_only = !(repair || depth == FSCK_DEEP);

  int r = _open_db_and_around(read_only);
  if (r < 0)
    return r;

  if (!read_only) {
    r = _upgrade_super();
    if (r < 0) {
      goto out_db;
    }
  }

  r = _open_collections();
  if (r < 0)
    goto out_db;

  mempool_thread.init();

  // we need finisher and kv_{sync,finalize}_thread *just* for replay
  // enable in repair or deep modes only
  if (!read_only) {
    _kv_start();
    r = _deferred_replay();
    _kv_stop();
  }
  if (r < 0)
    goto out_scan;

  r = _fsck_on_open(depth, repair);

 out_scan:
  mempool_thread.shutdown();
  _shutdown_cache();
 out_db:
  _close_db_and_around(false);

  return r;
}
int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
{
  dout(1) << __func__
	  << " <<<START>>>"
	  << (repair ? " repair" : " check")
	  << (depth == FSCK_DEEP ? " (deep)" :
	      depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
	  << " start" << dendl;
  int64_t errors = 0;
  int64_t warnings = 0;
  unsigned repaired = 0;

  uint64_t_btree_t used_omap_head;
  uint64_t_btree_t used_sbids;

  mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
  KeyValueDB::Iterator it;
  store_statfs_t expected_store_statfs, actual_statfs;
  per_pool_statfs expected_pool_statfs;

  sb_info_map_t sb_info;

  uint64_t num_objects = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_spanning_blobs = 0;
  uint64_t num_shared_blobs = 0;
  uint64_t num_sharded_objects = 0;
  BlueStoreRepairer repairer;

  auto alloc_size = fm->get_alloc_size();

  utime_t start = ceph_clock_now();

  _fsck_collections(&errors);
  used_blocks.resize(fm->get_alloc_units());

  if (bluefs) {
    interval_set<uint64_t> bluefs_extents;

    int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
    ceph_assert(r == 0);
    for (auto [start, len] : bluefs_extents) {
      apply_for_bitset_range(start, len, alloc_size, used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset& bs) {
	  ceph_assert(pos < bs.size());
	  bs.set(pos);
	}
      );
    }
  }

  bluefs_used_blocks = used_blocks;

  apply_for_bitset_range(
    0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
      bs.set(pos);
    }
  );

  if (repair) {
    repairer.init_space_usage_tracker(
      bdev->get_size(),
      min_alloc_size);
  }

  if (bluefs) {
    int r = bluefs->fsck();
    if (r < 0) {
      return r;
    }
    if (r > 0)
      errors += r;
  }

  if (!per_pool_stat_collection) {
    const char *w;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
      w = "error";
      ++errors;
    } else {
      w = "warning";
      ++warnings;
    }
    derr << "fsck " << w << ": store not yet converted to per-pool stats"
	 << dendl;
  }
  if (per_pool_omap != OMAP_PER_PG) {
    const char *w;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
      w = "error";
      ++errors;
    } else {
      w = "warning";
      ++warnings;
    }
    derr << "fsck " << w << ": store not yet converted to per-pg omap"
	 << dendl;
  }

  // get expected statfs; reset unaffected fields to be able to compare
  // against actual statfs
  statfs(&actual_statfs);
  actual_statfs.total = 0;
  actual_statfs.internally_reserved = 0;
  actual_statfs.available = 0;
  actual_statfs.internal_metadata = 0;
  actual_statfs.omap_allocated = 0;

  if (g_conf()->bluestore_debug_fsck_abort) {
    dout(1) << __func__ << " debug abort" << dendl;
    goto out_scan;
  }

  // walk PREFIX_OBJ
  {
    dout(1) << __func__ << " walking object keyspace" << dendl;
    ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
    BlueStore::FSCK_ObjectCtx ctx(
      errors,
      warnings,
      num_objects,
      num_extents,
      num_blobs,
      num_sharded_objects,
      num_spanning_blobs,
      &used_blocks,
      &used_omap_head,
      //no need for the below lock when in non-shallow mode as
      // there is no multithreading in this case
      depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
      sb_info,
      expected_store_statfs,
      expected_pool_statfs,
      repair ? &repairer : nullptr);

    _fsck_check_objects(depth, ctx);
  }
  dout(1) << __func__ << " checking shared_blobs" << dendl;
  it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
  if (it) {
    // FIXME minor: perhaps simplify for shallow mode?
    // fill global if not overriden below
    auto expected_statfs = &expected_store_statfs;

    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      uint64_t sbid;
      if (get_key_shared_blob(key, &sbid)) {
	derr << "fsck error: bad key '" << key
	     << "' in shared blob namespace" << dendl;
	if (repair) {
	  repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
	}
	++errors;
	continue;
      }
      auto p = sb_info.find(sbid);
      if (p == sb_info.end()) {
	derr << "fsck error: found stray shared blob data for sbid 0x"
	     << std::hex << sbid << std::dec << dendl;
	if (repair) {
	  repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
	}
	++errors;
      } else {
	++num_shared_blobs;
	sb_info_t& sbi = p->second;
	bluestore_shared_blob_t shared_blob(sbid);
	bufferlist bl = it->value();
	auto blp = bl.cbegin();
	try {
	  decode(shared_blob, blp);
	} catch (ceph::buffer::error& e) {
	  ++errors;
	  // Force update and don't report as missing
	  sbi.updated = sbi.passed = true;

	  derr << "fsck error: failed to decode Shared Blob"
	       << pretty_binary_string(it->key()) << dendl;
	  if (repair) {
	    dout(20) << __func__ << " undecodable Shared Blob, key:'"
		     << pretty_binary_string(it->key())
		     << "', removing" << dendl;
	    repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
	  }
	  continue;
	}
	dout(20) << __func__ << "  " << *sbi.sb << " " << shared_blob << dendl;
	if (shared_blob.ref_map != sbi.ref_map) {
	  derr << "fsck error: shared blob 0x" << std::hex << sbid
	       << std::dec << " ref_map " << shared_blob.ref_map
	       << " != expected " << sbi.ref_map << dendl;
	  sbi.updated = true; // will update later in repair mode only!
	  ++errors;
	}
	PExtentVector extents;
	for (auto &r : shared_blob.ref_map.ref_map) {
	  extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
	}
	if (per_pool_stat_collection || repair) {
	  expected_statfs = &expected_pool_statfs[sbi.pool_id];
	}
	errors += _fsck_check_extents(sbi.cid,
				      p->second.oids.front(),
				      extents,
				      p->second.compressed,
				      used_blocks,
				      fm->get_alloc_size(),
				      repair ? &repairer : nullptr,
				      *expected_statfs,
				      depth);
	sbi.passed = true;
      }
    }
  } // if (it)
  if (repair && repairer.preprocess_misreference(db)) {
    dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
    auto& misref_extents = repairer.get_misreferences();
    interval_set<uint64_t> to_release;
    it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
    if (it) {
      // fill global if not overriden below
      auto expected_statfs = &expected_store_statfs;

      CollectionRef c;
      spg_t pgid;
      KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
      bool bypass_rest = false;
      for (it->lower_bound(string()); it->valid() && !bypass_rest;
	   it->next()) {
	dout(30) << __func__ << " key "
		 << pretty_binary_string(it->key()) << dendl;
	if (is_extent_shard_key(it->key())) {
	  continue;
	}

	ghobject_t oid;
	int r = get_key_object(it->key(), &oid);
	if (r < 0 || !repairer.is_used(oid)) {
	  continue;
	}

	if (!c ||
	    oid.shard_id != pgid.shard ||
	    oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
	    !c->contains(oid)) {
	  c = nullptr;
	  for (auto& p : coll_map) {
	    if (p.second->contains(oid)) {
	      c = p.second;
	      break;
	    }
	  }
	  if (!c) {
	    continue;
	  }
	  if (per_pool_stat_collection || repair) {
	    auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
	    expected_statfs = &expected_pool_statfs[pool_id];
	  }
	}
	if (!repairer.is_used(c->cid)) {
	  continue;
	}

	dout(20) << __func__ << " check misreference for col:" << c->cid
		 << " obj:" << oid << dendl;

	OnodeRef o;
	o.reset(Onode::decode(c, oid, it->key(), it->value()));
	o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
	mempool::bluestore_fsck::set<BlobRef> blobs;

	for (auto& e : o->extent_map.extent_map) {
	  blobs.insert(e.blob);
	}
	bool need_onode_update = false;
	bool first_dump = true;
	for(auto b : blobs) {
	  bool broken_blob = false;
	  auto& pextents = b->dirty_blob().dirty_extents();
	  for (auto& e : pextents) {
	    if (!e.is_valid()) {
	      continue;
	    }
	    // for the sake of simplicity and proper shared blob handling
	    // always rewrite the whole blob even when it's partially
	    // misreferenced.
	    if (misref_extents.intersects(e.offset, e.length)) {
	      if (first_dump) {
		first_dump = false;
		_dump_onode<10>(cct, *o);
	      }
	      broken_blob = true;
	      break;
	    }
	  }
	  if (!broken_blob)
	    continue;
	  bool compressed = b->get_blob().is_compressed();
	  need_onode_update = true;
	  dout(10) << __func__
		   << " fix misreferences in oid:" << oid
		   << " " << *b << dendl;
	  uint64_t b_off = 0;
	  PExtentVector pext_to_release;
	  pext_to_release.reserve(pextents.size());
	  // rewriting all valid pextents
	  for (auto e = pextents.begin(); e != pextents.end();
	       b_off += e->length, e++) {
	    if (!e->is_valid()) {
	      continue;
	    }
	    PExtentVector exts;
	    int64_t alloc_len =
	      shared_alloc.a->allocate(e->length, min_alloc_size,
				       0, 0, &exts);
	    if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
	      derr << __func__
		   << " failed to allocate 0x" << std::hex << e->length
		   << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
		   << " min_alloc_size 0x" << min_alloc_size
		   << " available 0x " << shared_alloc.a->get_free()
		   << std::dec << dendl;
	      if (alloc_len > 0) {
		shared_alloc.a->release(exts);
	      }
	      bypass_rest = true;
	      break;
	    }
	    expected_statfs->allocated += e->length;
	    if (compressed) {
	      expected_statfs->data_compressed_allocated += e->length;
	    }

	    bufferlist bl;
	    IOContext ioc(cct, NULL, true); // allow EIO
	    r = bdev->read(e->offset, e->length, &bl, &ioc, false);
	    if (r < 0) {
	      derr << __func__ << " failed to read from 0x" << std::hex << e->offset
		   << "~" << e->length << std::dec << dendl;
	      ceph_abort_msg("read failed, wtf");
	    }
	    pext_to_release.push_back(*e);
	    e = pextents.erase(e);
	    e = pextents.insert(e, exts.begin(), exts.end());
	    b->get_blob().map_bl(
	      b_off, bl,
	      [&](uint64_t offset, bufferlist& t) {
		int r = bdev->write(offset, t, false);
		ceph_assert(r == 0);
	      });
	    e += exts.size() - 1;
	    for (auto& p : exts) {
	      fm->allocate(p.offset, p.length, txn);
	    }
	  } // for (auto e = pextents.begin(); e != pextents.end(); e++) {

	  if (b->get_blob().is_shared()) {
	    b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);

	    auto sb_it = sb_info.find(b->shared_blob->get_sbid());
	    ceph_assert(sb_it != sb_info.end());
	    sb_info_t& sbi = sb_it->second;

	    for (auto& r : sbi.ref_map.ref_map) {
	      expected_statfs->allocated -= r.second.length;
	      if (sbi.compressed) {
		// NB: it's crucial to use compressed flag from sb_info_t
		// as we originally used that value while accumulating
		// expected_statfs
		expected_statfs->data_compressed_allocated -= r.second.length;
	      }
	    }
	    sbi.updated = sbi.passed = true;
	    sbi.ref_map.clear();

	    // relying on blob's pextents to decide what to release.
	    for (auto& p : pext_to_release) {
	      to_release.union_insert(p.offset, p.length);
	    }
	  } else {
	    for (auto& p : pext_to_release) {
	      expected_statfs->allocated -= p.length;
	      if (compressed) {
		expected_statfs->data_compressed_allocated -= p.length;
	      }
	      to_release.union_insert(p.offset, p.length);
	    }
	  }
	  if (bypass_rest) {
	    break;
	  }
	} // for(auto b : blobs)
	if (need_onode_update) {
	  o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
	  _record_onode(o, txn);
	}
      } // for (it->lower_bound(string()); it->valid(); it->next())

      for (auto it = to_release.begin(); it != to_release.end(); ++it) {
	dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
		 << "~" << it.get_len() << std::dec << dendl;
	fm->release(it.get_start(), it.get_len(), txn);
      }
      shared_alloc.a->release(to_release);
      to_release.clear();
    } // if (it)
  } //if (repair && repairer.preprocess_misreference()) {
  if (depth != FSCK_SHALLOW) {
    for (auto &p : sb_info) {
      sb_info_t& sbi = p.second;
      if (!sbi.passed) {
	derr << "fsck error: missing " << *sbi.sb << dendl;
	++errors;
      }
      if (repair && (!sbi.passed || sbi.updated)) {
	auto sbid = p.first;
	if (sbi.ref_map.empty()) {
	  ceph_assert(sbi.passed);
	  dout(20) << __func__ << " " << *sbi.sb
		   << " is empty, removing" << dendl;
	  repairer.fix_shared_blob(db, sbid, nullptr);
	} else {
	  bufferlist bl;
	  bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
	  encode(persistent, bl);
	  dout(20) << __func__ << " " << *sbi.sb
		   << " is " << bl.length() << " bytes, updating" << dendl;

	  repairer.fix_shared_blob(db, sbid, &bl);
	}
      }
    }
  }
  sb_info.clear();

  // check global stats only if fscking (not repairing) w/o per-pool stats
  if (!per_pool_stat_collection &&
      !repair &&
      !(actual_statfs == expected_store_statfs)) {
    derr << "fsck error: actual " << actual_statfs
	 << " != expected " << expected_store_statfs << dendl;
    if (repair) {
      repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
			  expected_store_statfs);
    }
    ++errors;
  }

  dout(1) << __func__ << " checking pool_statfs" << dendl;
  _fsck_check_pool_statfs(expected_pool_statfs,
			  errors, warnings, repair ? &repairer : nullptr);
  if (depth != FSCK_SHALLOW) {
    dout(1) << __func__ << " checking for stray omap data " << dendl;
    it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
	uint64_t omap_head;

	_key_decode_u64(it->key().c_str(), &omap_head);

	if (used_omap_head.count(omap_head) == 0 &&
	    omap_head != last_omap_head) {
	  fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	    << "fsck error: found stray omap data on omap_head "
	    << omap_head << " " << last_omap_head << " "
	    << used_omap_head.count(omap_head) << fsck_dendl;
	  ++errors;
	  last_omap_head = omap_head;
	}
      }
    }
    it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
	uint64_t omap_head;
	_key_decode_u64(it->key().c_str(), &omap_head);
	if (used_omap_head.count(omap_head) == 0 &&
	    omap_head != last_omap_head) {
	  fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	    << "fsck error: found stray (pgmeta) omap data on omap_head "
	    << omap_head << " " << last_omap_head << " "
	    << used_omap_head.count(omap_head) << fsck_dendl;
	  last_omap_head = omap_head;
	  ++errors;
	}
      }
    }
    it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
	uint64_t pool;
	uint64_t omap_head;
	string k = it->key();
	const char *c = k.c_str();
	c = _key_decode_u64(c, &pool);
	c = _key_decode_u64(c, &omap_head);
	if (used_omap_head.count(omap_head) == 0 &&
	    omap_head != last_omap_head) {
	  fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	    << "fsck error: found stray (per-pool) omap data on omap_head "
	    << omap_head << " " << last_omap_head << " "
	    << used_omap_head.count(omap_head) << fsck_dendl;
	  ++errors;
	  last_omap_head = omap_head;
	}
      }
    }
    it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
	uint64_t pool;
	uint32_t hash;
	uint64_t omap_head;
	string k = it->key();
	const char* c = k.c_str();
	c = _key_decode_u64(c, &pool);
	c = _key_decode_u32(c, &hash);
	c = _key_decode_u64(c, &omap_head);
	if (used_omap_head.count(omap_head) == 0 &&
	    omap_head != last_omap_head) {
	  fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	    << "fsck error: found stray (per-pg) omap data on omap_head "
	    << omap_head << " " << last_omap_head << " "
	    << used_omap_head.count(omap_head) << fsck_dendl;
	  ++errors;
	  last_omap_head = omap_head;
	}
      }
    }
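    // Note on the scans above: rows belonging to one omap_head are adjacent
    // in the keyspace, so tracking last_omap_head suppresses repeats and a
    // stray head is reported once even though it may span many rows.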
    dout(1) << __func__ << " checking deferred events" << dendl;
    it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
    if (it) {
      for (it->lower_bound(string()); it->valid(); it->next()) {
	bufferlist bl = it->value();
	auto p = bl.cbegin();
	bluestore_deferred_transaction_t wt;
	try {
	  decode(wt, p);
	} catch (ceph::buffer::error& e) {
	  derr << "fsck error: failed to decode deferred txn "
	       << pretty_binary_string(it->key()) << dendl;
	  if (repair) {
	    dout(20) << __func__ << " undecodable deferred TXN record, key: '"
		     << pretty_binary_string(it->key())
		     << "', removing" << dendl;
	    repairer.remove_key(db, PREFIX_DEFERRED, it->key());
	  }
	  ++errors;
	  continue;
	}
	dout(20) << __func__ << "  deferred " << wt.seq
		 << " ops " << wt.ops.size()
		 << " released 0x" << std::hex << wt.released << std::dec
		 << dendl;
	for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
	  apply_for_bitset_range(
	    e.get_start(), e.get_len(), alloc_size, used_blocks,
	    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	      bs.set(pos);
	    }
	  );
	}
      }
    }
    dout(1) << __func__ << " checking freelist vs allocated" << dendl;
    {
      fm->enumerate_reset();
      uint64_t offset, length;
      while (fm->enumerate_next(db, &offset, &length)) {
	bool intersects = false;
	apply_for_bitset_range(
	  offset, length, alloc_size, used_blocks,
	  [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	    ceph_assert(pos < bs.size());
	    if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
	      if (offset == SUPER_RESERVED &&
		  length == min_alloc_size - SUPER_RESERVED) {
		// this is due to the change just after luminous to min_alloc_size
		// granularity allocations, and our baked in assumption at the top
		// of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
		// (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
		// since we will never allocate this region below min_alloc_size.
		dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
			 << " and min_alloc_size, 0x" << std::hex << offset << "~"
			 << length << std::dec << dendl;
	      } else {
		intersects = true;
		if (repair) {
		  repairer.fix_false_free(db, fm,
					  pos * min_alloc_size,
					  min_alloc_size);
		}
	      }
	    } else {
	      bs.set(pos);
	    }
	  }
	);
	if (intersects) {
	  derr << "fsck error: free extent 0x" << std::hex << offset
	       << "~" << length << std::dec
	       << " intersects allocated blocks" << dendl;
	  ++errors;
	}
      }
      fm->enumerate_reset();
      size_t count = used_blocks.count();
      if (used_blocks.size() != count) {
	ceph_assert(used_blocks.size() > count);
	used_blocks.flip();
	size_t start = used_blocks.find_first();
	while (start != decltype(used_blocks)::npos) {
	  size_t cur = start;
	  while (true) {
	    size_t next = used_blocks.find_next(cur);
	    if (next != cur + 1) {
	      ++errors;
	      derr << "fsck error: leaked extent 0x" << std::hex
		   << ((uint64_t)start * fm->get_alloc_size()) << "~"
		   << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
		   << dendl;
	      if (repair) {
		repairer.fix_leaked(db,
				    fm,
				    start * min_alloc_size,
				    (cur + 1 - start) * min_alloc_size);
	      }
	      start = next;
	      break;
	    }
	    cur = next;
	  }
	}
	used_blocks.flip();
      }
    }
  }
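  // The leak scan above inverts the 'used' bitset and walks runs of set
  // bits. For example (hypothetical contents), with used = 1 1 0 0 1 the
  // flipped set is {2, 3}: find_first() = 2, find_next(2) = 3 (== cur + 1,
  // run continues), find_next(3) != 4 (run ends), so one leaked run of
  // (3 + 1 - 2) = 2 allocation units starting at unit 2 is reported.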
  if (repair) {
    if (per_pool_omap != OMAP_PER_PG) {
      dout(5) << __func__ << " fixing per_pg_omap" << dendl;
      repairer.fix_per_pool_omap(db, OMAP_PER_PG);
    }

    dout(5) << __func__ << " applying repair results" << dendl;
    repaired = repairer.apply(db);
    dout(5) << __func__ << " repair applied" << dendl;
  }

 out_scan:
  dout(2) << __func__ << " " << num_objects << " objects, "
	  << num_sharded_objects << " of them sharded."
	  << dendl;
  dout(2) << __func__ << " " << num_extents << " extents to "
	  << num_blobs << " blobs, "
	  << num_spanning_blobs << " spanning, "
	  << num_shared_blobs << " shared."
	  << dendl;

  utime_t duration = ceph_clock_now() - start;
  dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
	  << warnings << " warnings, "
	  << repaired << " repaired, "
	  << (errors + warnings - (int)repaired) << " remaining in "
	  << duration << " seconds" << dendl;

  // In non-repair mode we should return error count only as
  // it indicates if store status is OK.
  // In repair mode both errors and warnings are taken into account
  // since repaired counter relates to them both.
  return repair ? errors + warnings - (int)repaired : errors;
}
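// Hence a zero return always means "store is clean (or fully repaired)".
// A caller-side sketch of how the result is interpreted:
//
//   int rc = fsck(deep);   // or repair via _fsck(depth, true)
//   if (rc > 0) { /* rc unrepaired errors (plus warnings in repair mode) */ }
//   if (rc < 0) { /* hard failure, e.g. the DB could not be opened */ }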
/// methods to inject various errors fsck can repair
void BlueStore::inject_broken_shared_blob_key(const string& key,
  const bufferlist& bl)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->set(PREFIX_SHARED_BLOB, key, bl);
  db->submit_transaction_sync(txn);
};
void BlueStore::inject_leaked(uint64_t len)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  PExtentVector exts;
  int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size,
					       min_alloc_size * 256, 0, &exts);
  ceph_assert(alloc_len >= (int64_t)len);
  for (auto& p : exts) {
    fm->allocate(p.offset, p.length, txn);
  }
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
{
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{c->lock}; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  }

  bool injected = false;
  txn = db->get_transaction();
  auto& em = o->extent_map.extent_map;
  std::vector<const PExtentVector*> v;
  if (em.size()) {
    v.push_back(&em.begin()->blob->get_blob().get_extents());
  }
  if (em.size() > 1) {
    auto it = em.end();
    --it;
    v.push_back(&(it->blob->get_blob().get_extents()));
  }
  for (auto pext : v) {
    if (pext->size()) {
      auto p = pext->begin();
      while (p != pext->end()) {
	if (p->is_valid()) {
	  dout(20) << __func__ << " release 0x" << std::hex << p->offset
		   << "~" << p->length << std::dec << dendl;
	  fm->release(p->offset, p->length, txn);
	  injected = true;
	  break;
	}
	++p;
      }
    }
  }
  ceph_assert(injected);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_legacy_omap()
{
  dout(1) << __func__ << dendl;
  per_pool_omap = OMAP_BULK;
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->rmkey(PREFIX_SUPER, "per_pool_omap");
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
{
  dout(1) << __func__ << " "
	  << cid << " " << oid
	  << dendl;
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{ c->lock }; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
  }
  o->onode.clear_flag(
    bluestore_onode_t::FLAG_PERPG_OMAP |
    bluestore_onode_t::FLAG_PERPOOL_OMAP |
    bluestore_onode_t::FLAG_PGMETA_OMAP);
  txn = db->get_transaction();
  _record_onode(o, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
{
  BlueStoreRepairer repairer;
  repairer.fix_statfs(db, key, new_statfs);
  repairer.apply(db);
}

void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
{
  KeyValueDB::Transaction t = db->get_transaction();
  volatile_statfs v;
  v = new_statfs;
  bufferlist bl;
  v.encode(bl);
  t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
  db->submit_transaction_sync(t);
}
void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
				    coll_t cid2, ghobject_t oid2,
				    uint64_t offset)
{
  OnodeRef o1;
  CollectionRef c1 = _get_collection(cid1);
  ceph_assert(c1);
  {
    std::unique_lock l{c1->lock}; // just to avoid internal asserts
    o1 = c1->get_onode(oid1, false);
    ceph_assert(o1);
    o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
  }
  OnodeRef o2;
  CollectionRef c2 = _get_collection(cid2);
  ceph_assert(c2);
  {
    std::unique_lock l{c2->lock}; // just to avoid internal asserts
    o2 = c2->get_onode(oid2, false);
    ceph_assert(o2);
    o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
  }
  Extent& e1 = *(o1->extent_map.seek_lextent(offset));
  Extent& e2 = *(o2->extent_map.seek_lextent(offset));

  // require onode/extent layout to be the same (and simple)
  // to make things easier
  ceph_assert(o1->onode.extent_map_shards.empty());
  ceph_assert(o2->onode.extent_map_shards.empty());
  ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
  ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
  ceph_assert(e1.logical_offset == e2.logical_offset);
  ceph_assert(e1.length == e2.length);
  ceph_assert(e1.blob_offset == e2.blob_offset);

  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  // along with misreference error this will create space leaks errors
  e2.blob->dirty_blob() = e1.blob->get_blob();
  o2->extent_map.dirty_range(offset, e2.length);
  o2->extent_map.update(txn, false);

  _record_onode(o2, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
                                            int16_t blob_id)
{
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{ c->lock }; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  }

  BlobRef b = c->new_blob();
  b->id = blob_id;
  o->extent_map.spanning_blob_map[blob_id] = b;

  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  _record_onode(o, txn);
  db->submit_transaction_sync(txn);
}
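// Illustrative sketch (not part of BlueStore): the inject_* methods above are
// intended to be driven from test code such as the fsck cases in
// store_test.cc.  A hypothetical test would corrupt the store, then expect
// fsck to count and repair the damage, roughly like so:
//
//   bluestore->inject_leaked(0x10000);     // leak one extent
//   bluestore->umount();
//   ASSERT_EQ(bluestore->fsck(false), 1);  // one error found
//   ASSERT_EQ(bluestore->fsck(true), 1);   // ... and repaired
//   ASSERT_EQ(bluestore->fsck(false), 0);  // clean after repair
//
// The exact error counts depend on the injected fault; the assertions here
// are only an example of the pattern.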
void BlueStore::collect_metadata(map<string,string> *pm)
{
  dout(10) << __func__ << dendl;
  bdev->collect_metadata("bluestore_bdev_", pm);
  if (bluefs) {
    (*pm)["bluefs"] = "1";
    // this value is for backward compatibility only
    (*pm)["bluefs_single_shared_device"] = \
      stringify((int)bluefs_layout.single_shared_device());
    (*pm)["bluefs_dedicated_db"] = \
      stringify((int)bluefs_layout.dedicated_db);
    (*pm)["bluefs_dedicated_wal"] = \
      stringify((int)bluefs_layout.dedicated_wal);
    bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
  } else {
    (*pm)["bluefs"] = "0";
  }

  // report numa mapping for underlying devices
  int node = -1;
  set<int> nodes;
  set<string> failed;
  int r = get_numa_node(&node, &nodes, &failed);
  if (r >= 0) {
    if (!failed.empty()) {
      (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
    }
    if (!nodes.empty()) {
      dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
      (*pm)["objectstore_numa_nodes"] = stringify(nodes);
    }
    if (node >= 0) {
      (*pm)["objectstore_numa_node"] = stringify(node);
    }
  }
}
int BlueStore::get_numa_node(
  int *final_node,
  set<int> *out_nodes,
  set<string> *out_failed)
{
  int node = -1;
  set<string> devices;
  get_devices(&devices);
  set<int> nodes;
  set<string> failed;
  for (auto& devname : devices) {
    int n;
    BlkDev bdev(devname);
    int r = bdev.get_numa_node(&n);
    if (r < 0) {
      dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
               << dendl;
      failed.insert(devname);
      continue;
    }
    dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
             << dendl;
    nodes.insert(n);
    if (node < 0) {
      node = n;
    }
  }
  if (node >= 0 && nodes.size() == 1 && failed.empty()) {
    *final_node = node;
  }
  if (out_nodes) {
    *out_nodes = nodes;
  }
  if (out_failed) {
    *out_failed = failed;
  }
  return 0;
}
int BlueStore::get_devices(set<string> *ls)
{
  if (bdev) {
    bdev->get_devices(ls);
    if (bluefs) {
      bluefs->get_devices(ls);
    }
    return 0;
  }

  // grumble, we haven't started up yet.
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  r = _minimal_open_bluefs(false);
  if (r < 0)
    goto out_bdev;
  bdev->get_devices(ls);
  if (bluefs) {
    bluefs->get_devices(ls);
  }
  r = 0;
  _minimal_close_bluefs();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return r;
}
void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
{
  buf->reset();

  auto prefix = per_pool_omap == OMAP_BULK ?
    PREFIX_OMAP :
    per_pool_omap == OMAP_PER_POOL ?
      PREFIX_PERPOOL_OMAP :
      PREFIX_PERPG_OMAP;
  buf->omap_allocated =
    db->estimate_prefix_size(prefix, string());

  uint64_t bfree = shared_alloc.a->get_free();

  if (bluefs) {
    buf->internally_reserved = 0;
    // include dedicated db, too, if that isn't the shared device.
    if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
      buf->total += bluefs->get_total(BlueFS::BDEV_DB);
    }
    // call any non-omap bluefs space "internal metadata"
    buf->internal_metadata =
      bluefs->get_used()
      - buf->omap_allocated;
  }

  uint64_t thin_total, thin_avail;
  if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
    buf->total += thin_total;

    // we are limited by both the size of the virtual device and the
    // underlying physical device.
    bfree = std::min(bfree, thin_avail);

    buf->allocated = thin_total - thin_avail;
  } else {
    buf->total += bdev->get_size();
  }
  buf->available = bfree;
}
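// Worked example (illustrative): on a thinly provisioned device reporting
// thin_total = 1 TiB and thin_avail = 256 GiB, with the allocator showing
// bfree = 512 GiB, the code above yields
//
//   buf->total     += 1 TiB
//   buf->available  = min(512 GiB, 256 GiB) = 256 GiB
//   buf->allocated  = 1 TiB - 256 GiB       = 768 GiB
//
// i.e. free space is bounded by whichever is smaller: the allocator's view
// of the virtual device or the backing physical device.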
int BlueStore::statfs(struct store_statfs_t *buf,
                      osd_alert_list_t* alerts)
{
  if (alerts) {
    alerts->clear();
    _log_alerts(*alerts);
  }
  _get_statfs_overall(buf);
  {
    std::lock_guard l(vstatfs_lock);
    buf->allocated = vstatfs.allocated();
    buf->data_stored = vstatfs.stored();
    buf->data_compressed = vstatfs.compressed();
    buf->data_compressed_original = vstatfs.compressed_original();
    buf->data_compressed_allocated = vstatfs.compressed_allocated();
  }

  dout(20) << __func__ << " " << *buf << dendl;
  return 0;
}
int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
                           bool *out_per_pool_omap)
{
  dout(20) << __func__ << " pool " << pool_id << dendl;

  if (!per_pool_stat_collection) {
    dout(20) << __func__ << " not supported in legacy mode " << dendl;
    return -ENOTSUP;
  }
  buf->reset();

  {
    std::lock_guard l(vstatfs_lock);
    osd_pools[pool_id].publish(buf);
  }

  string key_prefix;
  _key_encode_u64(pool_id, &key_prefix);
  *out_per_pool_omap = per_pool_omap != OMAP_BULK;
  if (*out_per_pool_omap) {
    auto prefix = per_pool_omap == OMAP_PER_POOL ?
      PREFIX_PERPOOL_OMAP :
      PREFIX_PERPG_OMAP;
    buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
  }

  dout(10) << __func__ << *buf << dendl;
  return 0;
}
void BlueStore::_check_legacy_statfs_alert()
{
  string s;
  if (!per_pool_stat_collection &&
      cct->_conf->bluestore_warn_on_legacy_statfs) {
    s = "legacy statfs reporting detected, "
        "suggest to run store repair to get consistent statistic reports";
  }
  std::lock_guard l(qlock);
  legacy_statfs_alert = s;
}
void BlueStore::_check_no_per_pg_or_pool_omap_alert()
{
  string per_pg, per_pool;
  if (per_pool_omap != OMAP_PER_PG) {
    if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
      per_pg = "legacy (not per-pg) omap detected, "
        "suggest to run store repair to benefit from faster PG removal";
    }
    if (per_pool_omap != OMAP_PER_POOL) {
      if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
        per_pool = "legacy (not per-pool) omap detected, "
          "suggest to run store repair to benefit from per-pool omap usage statistics";
      }
    }
  }
  std::lock_guard l(qlock);
  no_per_pg_omap_alert = per_pg;
  no_per_pool_omap_alert = per_pool;
}
BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
{
  std::shared_lock l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}
void BlueStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
  // _reap_collections and this in the same thread,
  // so no need a lock.
  removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
  list<CollectionRef> removed_colls;
  {
    // _queue_reap_collection and this in the same thread.
    // So no need a lock.
    if (!removed_collections.empty())
      removed_colls.swap(removed_collections);
    else
      return;
  }

  list<CollectionRef>::iterator p = removed_colls.begin();
  while (p != removed_colls.end()) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
    if (c->onode_map.map_any([&](Onode* o) {
          ceph_assert(!o->exists);
          if (o->flushing_count.load()) {
            dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
                     << " flush_txns " << o->flushing_count << dendl;
            return true;
          }
          return false;
        })) {
      ++p;
      continue;
    }
    c->onode_map.clear();
    p = removed_colls.erase(p);
    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
  }
  if (removed_colls.empty()) {
    dout(10) << __func__ << " all reaped" << dendl;
  } else {
    removed_collections.splice(removed_collections.begin(), removed_colls);
  }
}
void BlueStore::_update_cache_logger()
{
  uint64_t num_onodes = 0;
  uint64_t num_pinned_onodes = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_buffers = 0;
  uint64_t num_buffer_bytes = 0;
  for (auto c : onode_cache_shards) {
    c->add_stats(&num_onodes, &num_pinned_onodes);
  }
  for (auto c : buffer_cache_shards) {
    c->add_stats(&num_extents, &num_blobs,
                 &num_buffers, &num_buffer_bytes);
  }
  logger->set(l_bluestore_onodes, num_onodes);
  logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
  logger->set(l_bluestore_extents, num_extents);
  logger->set(l_bluestore_blobs, num_blobs);
  logger->set(l_bluestore_buffers, num_buffers);
  logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
}
ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
{
  return _get_collection(cid);
}
ObjectStore::CollectionHandle BlueStore::create_new_collection(
  const coll_t& cid)
{
  std::unique_lock l{coll_lock};
  auto c = ceph::make_ref<Collection>(
    this,
    onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
    buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
    cid);
  new_coll_map[cid] = c;
  _osr_attach(c.get());
  return c;
}
void BlueStore::set_collection_commit_queue(
  const coll_t& cid,
  ContextQueue *commit_queue)
{
  if (commit_queue) {
    std::shared_lock l(coll_lock);
    if (coll_map.count(cid)) {
      coll_map[cid]->commit_queue = commit_queue;
    } else if (new_coll_map.count(cid)) {
      new_coll_map[cid]->commit_queue = commit_queue;
    }
  }
}
bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return false;

  bool r = true;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      r = false;
  }

  return r;
}
int BlueStore::stat(
  CollectionHandle &c_,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      return -ENOENT;
    st->st_size = o->onode.size;
    st->st_blksize = 4096;
    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
    st->st_nlink = 1;
  }

  int r = 0;
  if (_debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  return r;
}
int BlueStore::set_collection_opts(
  CollectionHandle& ch,
  const pool_opts_t& opts)
{
  Collection *c = static_cast<Collection *>(ch.get());
  dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
  if (!c->exists)
    return -ENOENT;
  std::unique_lock l{c->lock};
  c->pool_opts = opts;
  return 0;
}
int BlueStore::read(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (offset == length && offset == 0)
      length = o->onode.size;

    r = _do_read(c, o, offset, length, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
             cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
void BlueStore::_read_cache(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  int read_cache_policy,
  ready_regions_t& ready_regions,
  blobs2read_t& blobs2read)
{
  // build blob-wise list of stuff to read (that isn't cached)
  unsigned left = length;
  uint64_t pos = offset;
  auto lp = o->extent_map.seek_lextent(offset);
  while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
        break;
      }
      dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
               << std::dec << dendl;
      pos += hole;
      left -= hole;
    }
    BlobRef& bptr = lp->blob;
    unsigned l_off = pos - lp->logical_offset;
    unsigned b_off = l_off + lp->blob_offset;
    unsigned b_len = std::min(left, lp->length - l_off);

    ready_regions_t cache_res;
    interval_set<uint32_t> cache_interval;
    bptr->shared_blob->bc.read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
      read_cache_policy);
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need 0x" << b_off << "~" << b_len
             << " cache has 0x" << cache_interval
             << std::dec << dendl;

    auto pc = cache_res.begin();
    uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
          pc->first == b_off) {
        l = pc->second.length();
        ready_regions[pos] = std::move(pc->second);
        dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        ++pc;
      } else {
        l = b_len;
        if (pc != cache_res.end()) {
          ceph_assert(pc->first > b_off);
          l = pc->first - b_off;
        }
        dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        // align the read on chunk boundaries and merge with the
        // previous request against the same blob where possible
        uint64_t r_off = b_off;
        uint64_t r_len = l;
        uint64_t front = r_off % chunk_size;
        if (front) {
          r_off -= front;
          r_len += front;
        }
        unsigned tail = r_len % chunk_size;
        if (tail) {
          r_len += chunk_size - tail;
        }
        bool merged = false;
        regions2read_t& r2r = blobs2read[bptr];
        if (r2r.size()) {
          read_req_t& pre = r2r.back();
          if (r_off <= (pre.r_off + pre.r_len)) {
            front += (r_off - pre.r_off);
            pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
            pre.regs.emplace_back(region_t(pos, b_off, l, front));
            merged = true;
          }
        }
        if (!merged) {
          read_req_t req(r_off, r_len);
          req.regs.emplace_back(region_t(pos, b_off, l, front));
          r2r.emplace_back(std::move(req));
        }
      }
      pos += l;
      b_off += l;
      left -= l;
      b_len -= l;
    }
    ++lp;
  }
}
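// Worked example (illustrative) of the chunk alignment performed above:
// with chunk_size = 0x1000 and a cache miss at b_off = 0x1234, l = 0x800,
//
//   front = 0x1234 % 0x1000 = 0x234  ->  r_off = 0x1000, r_len = 0xa34
//   tail  = 0xa34  % 0x1000 = 0xa34  ->  r_len += 0x1000 - 0xa34 = 0x1000
//
// so the device read becomes 0x1000~0x1000, and region_t::front records how
// far into that aligned read the caller's data actually starts.  Adjacent
// requests against the same blob whose aligned ranges touch are merged into
// a single read_req_t rather than issued separately.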
int BlueStore::_prepare_read_ioc(
  blobs2read_t& blobs2read,
  vector<bufferlist>* compressed_blob_bls,
  IOContext* ioc)
{
  for (auto& p : blobs2read) {
    const BlobRef& bptr = p.first;
    regions2read_t& r2r = p.second;
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need " << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
      if (compressed_blob_bls->empty()) {
        // ensure we avoid any reallocation on subsequent blobs
        compressed_blob_bls->reserve(blobs2read.size());
      }
      compressed_blob_bls->push_back(bufferlist());
      bufferlist& bl = compressed_blob_bls->back();
      auto r = bptr->get_blob().map(
        0, bptr->get_blob().get_ondisk_length(),
        [&](uint64_t offset, uint64_t length) {
          int r = bdev->aio_read(offset, length, &bl, ioc);
          if (r < 0)
            return r;
          return 0;
        });
      if (r < 0) {
        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
        if (r == -EIO) {
          // propagate EIO to caller
          return r;
        }
        ceph_assert(r == 0);
      }
    } else {
      // read the pieces
      for (auto& req : r2r) {
        dout(20) << __func__ << " region 0x" << std::hex
                 << req.regs.front().logical_offset
                 << ": 0x" << req.regs.front().blob_xoffset
                 << " reading 0x" << req.r_off
                 << "~" << req.r_len << std::dec
                 << dendl;

        // read it
        auto r = bptr->get_blob().map(
          req.r_off, req.r_len,
          [&](uint64_t offset, uint64_t length) {
            int r = bdev->aio_read(offset, length, &req.bl, ioc);
            if (r < 0)
              return r;
            return 0;
          });
        if (r < 0) {
          derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
               << dendl;
          if (r == -EIO) {
            // propagate EIO to caller
            return r;
          }
          ceph_assert(r == 0);
        }
        ceph_assert(req.bl.length() == req.r_len);
      }
    }
  }
  return 0;
}
int BlueStore::_generate_read_result_bl(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  ready_regions_t& ready_regions,
  vector<bufferlist>& compressed_blob_bls,
  blobs2read_t& blobs2read,
  bool buffered,
  bool* csum_error,
  bufferlist& bl)
{
  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    const BlobRef& bptr = b2r_it->first;
    regions2read_t& r2r = b2r_it->second;
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need 0x" << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      ceph_assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
                       r2r.front().regs.front().logical_offset) < 0) {
        *csum_error = true;
        return -EIO;
      }
      bufferlist raw_bl;
      auto r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
        return r;
      if (buffered) {
        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
                                       raw_bl);
      }
      for (auto& req : r2r) {
        for (auto& r : req.regs) {
          ready_regions[r.logical_offset].substr_of(
            raw_bl, r.blob_xoffset, r.length);
        }
      }
    } else {
      for (auto& req : r2r) {
        if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
                         req.regs.front().logical_offset) < 0) {
          *csum_error = true;
          return -EIO;
        }
        if (buffered) {
          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
                                         req.r_off, req.bl);
        }

        // prune and keep result
        for (const auto& r : req.regs) {
          ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
        }
      }
    }
    ++b2r_it;
  }

  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
  uint64_t pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": data from 0x" << pr->first << "~" << pr->second.length()
               << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
    } else {
      uint64_t l = length - pos;
      if (pr != pr_end) {
        ceph_assert(pr->first > pos + offset);
        l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": zeros for 0x" << (pos + offset) << "~" << l
               << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
  }
  ceph_assert(bl.length() == length);
  ceph_assert(pos == length);
  ceph_assert(pr == pr_end);
  return 0;
}
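// Illustrative note: ready_regions is keyed by logical offset, so the
// assembly loop above walks it in order and zero-fills any gap.  E.g. for a
// read of 0x0~0x3000 where only 0x0~0x1000 and 0x2000~0x1000 exist on disk,
// the result is data, then 0x1000 of zeros, then data -- always exactly
// `length` bytes, as the trailing asserts verify.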
int BlueStore::_do_read(
  Collection *c,
  OnodeRef o,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;
  bl.clear();

  if (offset >= o->onode.size) {
    return r;
  }

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  auto start = mono_clock::now();
  o->extent_map.fault_range(db, offset, length);
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  // for deep-scrub, we only read dirty cache and bypass clean cache in
  // order to read underlying block device in case there are silent disk errors.
  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
  }

  // build blob-wise list of stuff to read (that isn't cached)
  ready_regions_t ready_regions;
  blobs2read_t blobs2read;
  _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);

  // read raw blob data.
  start = mono_clock::now(); // for the sake of simplicity
                             // measure the whole block below.
                             // The error isn't that much...
  vector<bufferlist> compressed_blob_bls;
  IOContext ioc(cct, NULL, true); // allow EIO
  r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
  // we always issue aio for reading, so errors other than EIO are not allowed
  if (r < 0)
    return r;

  int64_t num_ios = blobs2read.size();
  if (ioc.has_pending_aios()) {
    num_ios = ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  bool csum_error = false;
  r = _generate_read_result_bl(o, offset, length, ready_regions,
                               compressed_blob_bls, blobs2read,
                               buffered, &csum_error, bl);
  if (csum_error) {
    // Handles spurious read errors caused by a kernel bug.
    // We sometimes get all-zero pages as a result of the read under
    // high memory pressure. Retrying the failing read succeeds in most
    // cases.
    // See also: http://tracker.ceph.com/issues/22464
    if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
      return -EIO;
    }
    return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
  }
  r = bl.length();
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
            << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
    stringstream s;
    s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
    _set_spurious_read_errors_alert(s.str());
  }
  return r;
}
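// Illustrative note on the retry path above: a checksum failure does not
// immediately surface as EIO; the read is retried up to
// bluestore_retry_disk_reads times (a small default, 3 in common builds) to
// paper over transient all-zero reads from the kernel.  Persistent
// corruption still fails: once the retry budget is exhausted, -EIO is
// returned, and the mismatch has already been logged by _verify_csum().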
int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
                            uint64_t logical_offset) const
{
  int bad;
  uint64_t bad_csum;
  auto start = mono_clock::now();
  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
  if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
      (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
    derr << __func__ << " injecting bluestore checksum verification error" << dendl;
    bad = blob_xoffset;
    r = -1;
    bad_csum = 0xDEADBEEF;
  }
  if (r < 0) {
    if (r == -1) {
      PExtentVector pex;
      blob->map(
        bad,
        blob->get_csum_chunk_size(),
        [&](uint64_t offset, uint64_t length) {
          pex.emplace_back(bluestore_pextent_t(offset, length));
          return 0;
        });
      derr << __func__ << " bad "
           << Checksummer::get_csum_type_string(blob->csum_type)
           << "/0x" << std::hex << blob->get_csum_chunk_size()
           << " checksum at blob offset 0x" << bad
           << ", got 0x" << bad_csum << ", expected 0x"
           << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
           << ", device location " << pex
           << ", logical extent 0x" << std::hex
           << (logical_offset + bad - blob_xoffset) << "~"
           << blob->get_csum_chunk_size() << std::dec
           << ", object " << o->oid
           << dendl;
    } else {
      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
    }
  }
  log_latency(__func__,
    l_bluestore_csum_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  if (cct->_conf->bluestore_ignore_data_csum) {
    return 0;
  }
  return r;
}
int BlueStore::_decompress(bufferlist& source, bufferlist* result)
{
  int r = 0;
  auto start = mono_clock::now();
  auto i = source.cbegin();
  bluestore_compression_header_t chdr;
  decode(chdr, i);
  int alg = int(chdr.type);
  CompressorRef cp = compressor;
  if (!cp || (int)cp->get_type() != alg) {
    cp = Compressor::create(cct, alg);
  }

  if (!cp.get()) {
    // if compressor isn't available - error, because cannot return
    // decompressed data?

    const char* alg_name = Compressor::get_comp_alg_name(alg);
    derr << __func__ << " can't load decompressor " << alg_name << dendl;
    _set_compression_alert(false, alg_name);
    r = -EIO;
  } else {
    r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
    if (r < 0) {
      derr << __func__ << " decompression failed with exit code " << r << dendl;
      r = -EIO;
    }
  }
  log_latency(__func__,
    l_bluestore_decompress_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
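// Minimal usage sketch (illustrative): an on-disk compressed blob begins
// with an encoded bluestore_compression_header_t, so a caller holding the
// blob's raw bytes recovers the logical data with just
//
//   bufferlist raw;
//   int r = _decompress(compressed_bl, &raw);  // raw holds chdr.length bytes
//
// Any failure (unknown algorithm, codec error) is collapsed to -EIO above,
// because there is no way to hand back partially decompressed data.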
// this stores fiemap into interval_set, other variations
// use it internally
int BlueStore::_fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  interval_set<uint64_t>& destset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      return -ENOENT;
    }
    _dump_onode<30>(cct, *o);

    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
             << " size 0x" << o->onode.size << std::dec << dendl;

    boost::intrusive::set<Extent>::iterator ep, eend;
    if (offset >= o->onode.size)
      goto out;

    if (offset + length > o->onode.size) {
      length = o->onode.size - offset;
    }

    o->extent_map.fault_range(db, offset, length);
    eend = o->extent_map.extent_map.end();
    ep = o->extent_map.seek_lextent(offset);
    while (length > 0) {
      dout(20) << __func__ << " offset " << offset << dendl;
      if (ep != eend && ep->logical_offset + ep->length <= offset) {
        ++ep;
        continue;
      }

      uint64_t x_len = length;
      if (ep != eend && ep->logical_offset <= offset) {
        uint64_t x_off = offset - ep->logical_offset;
        x_len = std::min(x_len, ep->length - x_off);
        dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
                 << x_len << std::dec << " blob " << ep->blob << dendl;
        destset.insert(offset, x_len);
        length -= x_len;
        offset += x_len;
        if (x_off + x_len == ep->length)
          ++ep;
        continue;
      }
      if (ep != eend &&
          ep->logical_offset > offset &&
          ep->logical_offset - offset < x_len) {
        x_len = ep->logical_offset - offset;
      }
      offset += x_len;
      length -= x_len;
    }
  }

 out:
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size = 0x(" << destset << ")" << std::dec << dendl;
  return 0;
}
int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    encode(m, bl);
  }
  return r;
}
int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  map<uint64_t, uint64_t>& destmap)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    destmap = std::move(m).detach();
  }
  return r;
}
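// Usage sketch (illustrative): both fiemap() overloads are thin wrappers
// over _fiemap().  A client wanting the allocated ranges of an object as
// offset/length pairs might do (obj_size is a hypothetical caller variable):
//
//   map<uint64_t, uint64_t> extents;
//   int r = store->fiemap(ch, oid, 0, obj_size, extents);
//   for (auto& [off, len] : extents) {
//     // [off, off+len) is backed by data; holes are simply absent
//   }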
int BlueStore::readv(
  CollectionHandle &c_,
  const ghobject_t& oid,
  interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " fiemap " << m
           << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (m.empty()) {
      r = 0;
      goto out;
    }

    r = _do_readv(c, o, m, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
             cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " fiemap " << m << std::dec
           << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
int BlueStore::_do_readv(
  Collection *c,
  OnodeRef o,
  const interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " fiemap " << m << std::hex
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }
  // this method must be idempotent since we may call it several times
  // before we finally read the expected result.
  bl.clear();

  // call fiemap first!
  ceph_assert(m.range_start() <= o->onode.size);
  ceph_assert(m.range_end() <= o->onode.size);
  auto start = mono_clock::now();
  o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  IOContext ioc(cct, NULL, true); // allow EIO
  vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
  raw_results.reserve(m.num_intervals());
  int i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    raw_results.push_back({});
    _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
                std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
    r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
    // we always issue aio for reading, so errors other than EIO are not allowed
    if (r < 0)
      return r;
  }

  auto num_ios = m.size();
  if (ioc.has_pending_aios()) {
    num_ios = ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  ceph_assert(raw_results.size() == (size_t)m.num_intervals());
  i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    bool csum_error = false;
    bufferlist t;
    r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
                                 std::get<0>(raw_results[i]),
                                 std::get<1>(raw_results[i]),
                                 std::get<2>(raw_results[i]),
                                 buffered, &csum_error, t);
    if (csum_error) {
      // Handles spurious read errors caused by a kernel bug.
      // We sometimes get all-zero pages as a result of the read under
      // high memory pressure. Retrying the failing read succeeds in most
      // cases.
      // See also: http://tracker.ceph.com/issues/22464
      if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
        return -EIO;
      }
      return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
    }
    bl.claim_append(t);
  }
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read fiemap " << m
            << " failed " << retry_count << " times before succeeding"
            << dendl;
  }
  return bl.length();
}
int BlueStore::dump_onode(CollectionHandle &c_,
  const ghobject_t& oid,
  const string& section_name,
  Formatter *f)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    // FIXME minor: actually the next line isn't enough to
    // load shared blobs. Leaving as is for now..
    //
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);

    _dump_onode<0>(cct, *o);
    f->open_object_section(section_name.c_str());
    o->dump(f);
    f->close_section();
    r = 0;
  }
 out:
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::getattr(
  CollectionHandle &c_,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);
    mempool::bluestore_cache_meta::string k(name);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (!o->onode.attrs.count(k)) {
      r = -ENODATA;
      goto out;
    }
    value = o->onode.attrs[k];
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
           << " = " << r << dendl;
  return r;
}
int BlueStore::getattrs(
  CollectionHandle &c_,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    for (auto& i : o->onode.attrs) {
      aset.emplace(i.first.c_str(), i.second);
    }
    r = 0;
  }

 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::list_collections(vector<coll_t>& ls)
{
  std::shared_lock l(coll_lock);
  ls.reserve(coll_map.size());
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}
bool BlueStore::collection_exists(const coll_t& c)
{
  std::shared_lock l(coll_lock);
  return coll_map.count(c);
}
int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
                          &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
         << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
  return 0;
}
int BlueStore::collection_bits(CollectionHandle& ch)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  Collection *c = static_cast<Collection *>(ch.get());
  std::shared_lock l(c->lock);
  dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}
int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, false, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
int BlueStore::collection_list_legacy(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, true, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
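// Usage sketch (illustrative): collection_list() is paginated via the
// *pnext cursor, so enumerating a large collection looks like:
//
//   ghobject_t cursor;  // default-constructed == start of collection
//   while (cursor != ghobject_t::get_max()) {
//     vector<ghobject_t> batch;
//     store->collection_list(ch, cursor, ghobject_t::get_max(), 100,
//                            &batch, &cursor);
//     // process batch ...
//   }
//
// _collection_list() below implements this walk over the PREFIX_OBJ
// keyspace, visiting the temp namespace before the non-temp one.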
int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
{
  if (!c->exists)
    return -ENOENT;

  auto start_time = mono_clock::now();
  int r = 0;
  ghobject_t static_next;
  std::unique_ptr<CollectionListIterator> it;
  ghobject_t coll_range_temp_start, coll_range_temp_end;
  ghobject_t coll_range_start, coll_range_end;
  bool set_next = false;
  ghobject_t pend;
  bool temp;

  if (!pnext)
    pnext = &static_next;

  if (start.is_max() || start.hobj.is_max()) {
    goto out;
  }
  get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
                 &coll_range_temp_end, &coll_range_start, &coll_range_end);
  dout(20) << __func__
           << " range " << coll_range_temp_start
           << " to " << coll_range_temp_end
           << " and " << coll_range_start
           << " to " << coll_range_end
           << " start " << start << dendl;
  if (legacy) {
    it = std::make_unique<SimpleCollectionListIterator>(
      cct, db->get_iterator(PREFIX_OBJ));
  } else {
    it = std::make_unique<SortedCollectionListIterator>(
      db->get_iterator(PREFIX_OBJ));
  }
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(coll_range_temp_start);
    temp = true;
  } else {
    if (start.hobj.is_temp()) {
      temp = true;
      ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
    } else {
      temp = false;
      ceph_assert(start >= coll_range_start && start < coll_range_end);
    }
    dout(20) << __func__ << " temp=" << (int)temp << dendl;
    it->lower_bound(start);
  }
  if (end.hobj.is_max()) {
    pend = temp ? coll_range_temp_end : coll_range_end;
  } else {
    if (end.hobj.is_temp()) {
      if (temp) {
        pend = end;
      } else {
        goto out;
      }
    } else {
      pend = temp ? coll_range_temp_end : end;
    }
  }
  dout(20) << __func__ << " pend " << pend << dendl;
  while (true) {
    if (!it->valid() || it->is_ge(pend)) {
      if (!it->valid())
        dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
        dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
      if (temp) {
        if (end.hobj.is_temp()) {
          if (it->valid() && it->is_lt(coll_range_temp_end)) {
            *pnext = it->oid();
            set_next = true;
          }
          break;
        }
        dout(30) << __func__ << " switch to non-temp namespace" << dendl;
        temp = false;
        it->upper_bound(coll_range_start);
        if (end.hobj.is_max())
          pend = coll_range_end;
        else
          pend = end;
        dout(30) << __func__ << " pend " << pend << dendl;
        continue;
      }
      if (it->valid() && it->is_lt(coll_range_end)) {
        *pnext = it->oid();
        set_next = true;
      }
      break;
    }
    dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = it->oid();
      set_next = true;
      break;
    }
    ls->push_back(it->oid());
    it->next();
  }
out:
  if (!set_next) {
    *pnext = ghobject_t::get_max();
  }
  log_latency_fn(
    __func__,
    l_bluestore_clist_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_collection_list_age,
    [&] (const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid =" << c->cid
           << " start " << start << " end " << end
           << " max " << max;
      return ostr.str();
    });
  return r;
}
int BlueStore::omap_get(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  bufferlist *header,      ///< [out] omap header
  map<string, bufferlist> *out  /// < [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  return _omap_get(c, oid, header, out);
}
int BlueStore::_omap_get(
  Collection *c,           ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  bufferlist *header,      ///< [out] omap header
  map<string, bufferlist> *out  /// < [out] Key to value map
  )
{
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  r = _onode_omap_get(o, header, out);
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::_onode_omap_get(
  const OnodeRef &o,           ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out /// < [out] Key to value map
)
{
  int r = 0;
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_header(&head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
        dout(30) << __func__ << " got header" << dendl;
        *header = it->value();
      } else if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        string user_key;
        o->decode_omap_key(it->key(), &user_key);
        dout(20) << __func__ << " got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        (*out)[user_key] = it->value();
      }
      it->next();
    }
  }
 out:
  return r;
}
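// Illustrative note on the key layout used above: every omap entry for an
// onode lives in one KV prefix (PREFIX_OMAP, PREFIX_PERPOOL_OMAP or
// PREFIX_PERPG_OMAP, per o->get_omap_prefix()) under a key built from the
// onode's numeric prefix plus the user key, conceptually
//
//   <pool/hash/nid encoding> + user_key
//
// get_omap_header()/get_omap_tail() bracket that range, so a lower_bound on
// the header followed by iteration until the tail visits exactly this
// object's header and keys, in user-key order.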
int BlueStore::omap_get_header(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  bufferlist *header,      ///< [out] omap header
  bool allow_eio           ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    string head;
    o->get_omap_header(&head);
    if (db->get(o->get_omap_prefix(), head, header) >= 0) {
      dout(30) << __func__ << " got header" << dendl;
    } else {
      dout(30) << __func__ << " no header" << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  set<string> *keys        ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  auto start1 = mono_clock::now();
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      }
      string user_key;
      o->decode_omap_key(it->key(), &user_key);
      dout(20) << __func__ << " got " << pretty_binary_string(it->key())
               << " -> " << user_key << dendl;
      keys->insert(user_key);
      it->next();
    }
  }
 out:
  c->store->log_latency(
    __func__,
    l_bluestore_omap_get_keys_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const set<string> &keys,     ///< [in] Keys to get
  map<string, bufferlist> *out ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
        dout(30) << __func__ << " got " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
        out->insert(make_pair(*p, val));
      }
    }
  }
 out:
  c->store->log_latency(
    __func__,
    l_bluestore_omap_get_values_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
#ifdef WITH_SEASTAR
int BlueStore::omap_get_values(
  CollectionHandle &c_,         ///< [in] Collection containing oid
  const ghobject_t &oid,        ///< [in] Object containing omap
  const std::optional<string> &start_after, ///< [in] Keys to get
  map<string, bufferlist> *output ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
    if (!iter) {
      r = -ENOENT;
      goto out;
    }
    iter->upper_bound(*start_after);
    for (; iter->valid(); iter->next()) {
      output->insert(make_pair(iter->key(), iter->value()));
    }
  }

out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
#endif
int BlueStore::omap_check_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  const set<string> &keys, ///< [in] Keys to check
  set<string> *out         ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
        dout(30) << __func__ << " have " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
        out->insert(*p);
      } else {
        dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,   ///< [in] collection
  const ghobject_t &oid   ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  std::shared_lock l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  o->flush();
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}
// -----------------

uint64_t BlueStore::_get_ondisk_reserved() const {
  ceph_assert(min_alloc_size);
  return round_up_to(
    std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
}
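// Worked example (illustrative): with SUPER_RESERVED = 8192 and
// min_alloc_size = 0x10000 (64 KiB), max(8192, 0x10000) = 0x10000 and
// round_up_to(0x10000, 0x10000) = 0x10000, so exactly the first allocation
// unit is kept clear of user data.  With min_alloc_size = 4096 the result
// is 8192, i.e. two units.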
void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  ceph_assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}
int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      nid_max = v;
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(1) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      blobid_max = v;
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
      dout(1) << __func__ << " freelist_type " << freelist_type << dendl;
    } else {
      ceph_abort_msg("Not Support extent freelist manager");
    }
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.cbegin();
      try {
        decode(ondisk_format, p);
      } catch (ceph::buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        ceph_assert(!r);
        auto p = bl.cbegin();
        try {
          decode(compat_ondisk_format, p);
        } catch (ceph::buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(1) << __func__ << " ondisk_format " << ondisk_format
            << " compat_ondisk_format " << compat_ondisk_format
            << dendl;
  }

  if (latest_ondisk_format < compat_ondisk_format) {
    derr << __func__ << " compat_ondisk_format is "
         << compat_ondisk_format << " but we only understand version "
         << latest_ondisk_format << dendl;
    return -EPERM;
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t val;
      decode(val, p);
      min_alloc_size = val;
      min_alloc_size_order = ctz(val);
      ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
  }
  dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
          << std::dec << dendl;

  _set_per_pool_omap();

  _open_statfs();
  _set_alloc_sizes();
  _set_throttle_params();

  _set_csum();
  _set_compression();
  _set_blob_size();

  _validate_bdev();
  return 0;
}
int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  if (ondisk_format < latest_ondisk_format) {
    ceph_assert(ondisk_format > 0);
    ceph_assert(ondisk_format < latest_ondisk_format);

    KeyValueDB::Transaction t = db->get_transaction();
    if (ondisk_format == 1) {
      // changes:
      // - super: added ondisk_format
      // - super: added min_readable_ondisk_format
      // - super: added min_compat_ondisk_format
      // - super: added min_alloc_size
      // - super: removed min_min_alloc_size
      {
        bufferlist bl;
        db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
        auto p = bl.cbegin();
        try {
          uint64_t val;
          decode(val, p);
          min_alloc_size = val;
        } catch (ceph::buffer::error& e) {
          derr << __func__ << " failed to read min_min_alloc_size" << dendl;
          return -EIO;
        }
        t->set(PREFIX_SUPER, "min_alloc_size", bl);
        t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
      }
      ondisk_format = 2;
    }
    if (ondisk_format == 2) {
      // changes:
      // - onode has FLAG_PERPOOL_OMAP.  Note that we do not know that *all*
      //   onodes are using the per-pool prefix until a repair is run; at that
      //   point the per_pool_omap=1 key will be set.
      // - super: added per_pool_omap key, which indicates that *all* objects
      //   are using the new prefix and key format
      ondisk_format = 3;
    }
    if (ondisk_format == 3) {
      // changes:
      // - FreelistManager keeps meta within bdev label
      int r = _write_out_fm_meta(0);
      ceph_assert(r == 0);
      ondisk_format = 4;
    }
    // This to be the last operation
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    ceph_assert(r == 0);
  }
  dout(1) << __func__ << " done" << dendl;
  return 0;
}
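// Illustrative note: the upgrade above is a chain of if-blocks rather than
// if/else, so a store several versions behind walks every step in one pass
// (1 -> 2 -> 3 -> 4), accumulating all key changes in a single transaction
// that is committed only after _prepare_ondisk_format_super() records the
// final version.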
void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid) {
    ceph_assert(o->exists);
    return;
  }
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
  o->exists = true;
}
uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}
void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}
BlueStore::TransContext *BlueStore::_txc_create(
  Collection *c, OpSequencer *osr,
  list<Context*> *on_commits,
  TrackedOpRef osd_op)
{
  TransContext *txc = new TransContext(cct, c, osr, on_commits);
  txc->t = db->get_transaction();

#ifdef WITH_BLKIN
  if (osd_op && osd_op->pg_trace) {
    txc->trace.init("TransContext", &trace_endpoint,
                    &osd_op->pg_trace);
    txc->trace.event("txc create");
    txc->trace.keyval("txc seq", txc->seq);
  }
#endif

  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}
void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // one "io" for the kv commit
  auto ios = 1 + txc->ioc.get_num_ios();
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  txc->ios = ios;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}
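// Worked example (illustrative): with throttle_cost_per_io = 670000 (the
// HDD default deduced from bluestore_throttle_cost_per_io_hdd), a txc
// carrying 2 data aios and 4 KiB of payload costs
//
//   (1 + 2) * 670000 + 4096 = 2014096
//
// where the extra "1" accounts for the eventual kv commit io.  This cost is
// what the BlueStore throttles charge on submit and release on completion.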
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());

  bufferlist bl;
  txc->statfs_delta.encode(bl);
  if (per_pool_stat_collection) {
    string key;
    get_pool_stat_key(txc->osd_pool_id, &key);
    txc->t->merge(PREFIX_STAT, key, bl);

    std::lock_guard l(vstatfs_lock);
    auto& stats = osd_pools[txc->osd_pool_id];
    stats += txc->statfs_delta;

    vstatfs += txc->statfs_delta; //non-persistent in this mode
  } else {
    txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);

    std::lock_guard l(vstatfs_lock);
    vstatfs += txc->statfs_delta;
  }
  txc->statfs_delta.reset();
}

void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
             << " " << txc->get_state_name() << dendl;
    switch (txc->get_state()) {
    case TransContext::STATE_PREPARE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
        txc->set_state(TransContext::STATE_AIO_WAIT);
#ifdef WITH_BLKIN
        if (txc->trace) {
          txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
        }
#endif
        txc->had_ios = true;
        _txc_aio_submit(txc);
        return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      {
        mono_clock::duration lat = throttle.log_state_latency(
          *txc, logger, l_bluestore_state_aio_wait_lat);
        if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
          dout(0) << __func__ << " slow aio_wait, txc = " << txc
                  << ", latency = " << lat
                  << dendl;
        }
      }

      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      ceph_assert(ceph_mutex_is_locked(txc->osr->qlock));  // see _txc_finish_io
      if (txc->had_ios) {
        ++txc->osr->txc_with_unstable_io;
      }
      throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
      txc->set_state(TransContext::STATE_KV_QUEUED);
      if (cct->_conf->bluestore_sync_submit_transaction) {
        if (txc->last_nid >= nid_max ||
            txc->last_blobid >= blobid_max) {
          dout(20) << __func__
                   << " last_{nid,blobid} exceeds max, submit via kv thread"
                   << dendl;
        } else if (txc->osr->kv_committing_serially) {
          dout(20) << __func__ << " prior txc submitted via kv thread, us too"
                   << dendl;
          // note: this is starvation-prone. once we have a txc in a busy
          // sequencer that is committing serially it is possible to keep
          // submitting new transactions fast enough that we get stuck doing
          // so. the alternative is to block here... fixme?
        } else if (txc->osr->txc_with_unstable_io) {
          dout(20) << __func__ << " prior txc(s) with unstable ios "
                   << txc->osr->txc_with_unstable_io.load() << dendl;
        } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
                   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
                   == 0) {
          dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
                   << dendl;
        } else {
          _txc_apply_kv(txc, true);
        }
      }
      {
        std::lock_guard l(kv_lock);
        kv_queue.push_back(txc);
        if (!kv_sync_in_progress) {
          kv_sync_in_progress = true;
          kv_cond.notify_one();
        }
        if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
          kv_queue_unsubmitted.push_back(txc);
          ++txc->osr->kv_committing_serially;
        }
        if (txc->had_ios) {
          kv_ios++;
        }
        kv_throttle_costs += txc->cost;
      }
      return;
    case TransContext::STATE_KV_SUBMITTED:
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
        txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
        _deferred_queue(txc);
        return;
      }
      txc->set_state(TransContext::STATE_FINISHING);
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
      txc->set_state(TransContext::STATE_FINISHING);
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
           << " state " << txc->get_state_name() << dendl;
      ceph_abort_msg("unexpected txc state");
      return;
    }
  }
}
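
// For reference, the common path through the state machine above is:
//
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED
//     -> KV_DONE -> FINISHING -> DONE
//
// with a detour for transactions carrying deferred writes:
//
//   KV_DONE -> DEFERRED_QUEUED -> DEFERRED_CLEANUP -> FINISHING -> DONE
//
// Transitions are driven either by the fall-throughs in the switch above
// or by external events (aio completion via _txc_finish_io, kv commit via
// the kv threads, deferred aio completion via _deferred_aio_finish).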

void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard l(osr->qlock);
  txc->set_state(TransContext::STATE_IO_DONE);
  txc->ioc.release_running_aios();
  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->get_state() < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
               << p->get_state_name() << dendl;
      return;
    }
    if (p->get_state() > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
           p->get_state() == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters) {
    osr->qcond.notify_all();
  }
}

void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
           << " onodes " << txc->onodes
           << " shared_blobs " << txc->shared_blobs
           << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    _record_onode(o, t);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      encode(*(sb->persistent), bl);
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is " << bl.length() << " " << *sb << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}

void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
  os_apply_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
}

// For every object we maintain a <zone_num+oid, offset> tuple in the
// key-value store.  When a new object is written to a zone, we insert the
// corresponding tuple into the database.  When an object is truncated, we
// remove the corresponding tuple.  When an object is overwritten, we remove
// the old tuple and insert a new tuple corresponding to the new location of
// the object.  The cleaner can then identify live objects within the zone
// <zone_num> by enumerating all the keys starting with the <zone_num> prefix.
void BlueStore::_zoned_update_cleaning_metadata(TransContext *txc) {
  for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) {
    std::string key;
    get_object_key(cct, o->oid, &key);
    for (auto offset : offsets) {
      if (offset > 0) {
        bufferlist offset_bl;
        encode(offset, offset_bl);
        txc->t->set(_zoned_get_prefix(offset), key, offset_bl);
      } else {
        txc->t->rmkey(_zoned_get_prefix(-offset), key);
      }
    }
  }
}
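
// The sign of each stored offset encodes the operation: a positive value
// records a new object location (set), a negative value marks an old
// location whose tuple must be removed (rmkey). For example (illustrative
// offsets), an object overwritten at 0x80000000 that previously lived at
// 0x40000000 would appear in zoned_onode_to_offset_map as
// { +0x80000000, -0x40000000 }, yielding one set() under the new zone's
// prefix and one rmkey() under the old zone's prefix.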

std::string BlueStore::_zoned_get_prefix(uint64_t offset) {
  uint64_t zone_num = offset / bdev->get_zone_size();
  std::string zone_key;
  _key_encode_u64(zone_num, &zone_key);
  return PREFIX_ZONED_CL_INFO + zone_key;
}
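
// Resulting key layout (sketch, illustrative numbers): with
// PREFIX_ZONED_CL_INFO = "G" and a 256 MiB zone size, an offset of
// 0x30000000 (768 MiB) falls in zone 3, so the cleaner-metadata key for
// an object there is
//
//   "G" + _key_encode_u64(3) + <object key>
//
// _key_encode_u64 stores the zone number big-endian, so all keys of a
// zone sort contiguously and can be enumerated with one prefix scan.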

// For now, to avoid interface changes we piggyback zone_size (in MiB) and the
// first sequential zone number onto min_alloc_size and pass it to functions
// Allocator::create and FreelistManager::create.
uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) {
  uint64_t zone_size = bdev->get_zone_size();
  uint64_t zone_size_mb = zone_size / (1024 * 1024);
  uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size;
  min_alloc_size |= (zone_size_mb << 32);
  min_alloc_size |= (first_seq_zone << 48);
  return min_alloc_size;
}
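
// The matching unpack on the consumer side would look like this
// (hypothetical helper; the actual consumers open-code the shifts):
//
//   void unpack_zoned_params(uint64_t packed,
//                            uint64_t *min_alloc_size,
//                            uint64_t *zone_size,
//                            uint64_t *first_seq_zone) {
//     *min_alloc_size = packed & 0xffffffff;                      // low 32 bits
//     *zone_size      = ((packed >> 32) & 0xffff) * 1024 * 1024;  // MiB -> bytes
//     *first_seq_zone = packed >> 48;                             // high 16 bits
//   }
//
// This only works while min_alloc_size fits in 32 bits and zone_size (in
// MiB) and the first sequential zone number each fit in 16 bits.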

int BlueStore::_zoned_check_config_settings() {
  if (cct->_conf->bluestore_allocator != "zoned") {
    dout(1) << __func__ << " The drive is HM-SMR but "
            << cct->_conf->bluestore_allocator << " allocator is specified. "
            << "Only the zoned allocator can be used with an HM-SMR drive." << dendl;
    return -EINVAL;
  }

  // At least for now we want to use large min_alloc_size with HM-SMR drives.
  // Populating used_blocks bitset on a debug build of ceph-osd takes about 5
  // minutes with a 14 TB HM-SMR drive and 4 KiB min_alloc_size.
  if (min_alloc_size < 64 * 1024) {
    dout(1) << __func__ << " The drive is HM-SMR but min_alloc_size is "
            << min_alloc_size << ". "
            << "Please set it to at least 64 KiB." << dendl;
    return -EINVAL;
  }

  // We don't want to defer writes with HM-SMR because that would violate
  // the sequential write requirement.
  if (prefer_deferred_size) {
    dout(1) << __func__ << " The drive is HM-SMR but prefer_deferred_size is "
            << prefer_deferred_size << ". "
            << "Please set it to 0." << dendl;
    return -EINVAL;
  }
  return 0;
}

void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
           << " allocated 0x" << txc->allocated
           << " released 0x" << txc->released
           << std::dec << dendl;

  // We have to handle the case where we allocate *and* deallocate the
  // same region in this transaction.  The freelist doesn't like that.
  // (Actually, the only thing that cares is the BitmapFreelistManager
  // debug check. But that's important.)
  interval_set<uint64_t> tmp_allocated, tmp_released;
  interval_set<uint64_t> *pallocated = &txc->allocated;
  interval_set<uint64_t> *preleased = &txc->released;
  if (!txc->allocated.empty() && !txc->released.empty()) {
    interval_set<uint64_t> overlap;
    overlap.intersection_of(txc->allocated, txc->released);
    if (!overlap.empty()) {
      tmp_allocated = txc->allocated;
      tmp_allocated.subtract(overlap);
      tmp_released = txc->released;
      tmp_released.subtract(overlap);
      dout(20) << __func__ << " overlap 0x" << std::hex << overlap
               << ", new allocated 0x" << tmp_allocated
               << " released 0x" << tmp_released << std::dec
               << dendl;
      pallocated = &tmp_allocated;
      preleased = &tmp_released;
    }
  }

  // update freelist with non-overlap sets
  for (interval_set<uint64_t>::iterator p = pallocated->begin();
       p != pallocated->end();
       ++p) {
    fm->allocate(p.get_start(), p.get_len(), t);
  }
  for (interval_set<uint64_t>::iterator p = preleased->begin();
       p != preleased->end();
       ++p) {
    dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
             << "~" << p.get_len() << std::dec << dendl;
    fm->release(p.get_start(), p.get_len(), t);
  }

  if (bdev->is_smr()) {
    _zoned_update_cleaning_metadata(txc);
  }

  _txc_update_store_statfs(txc);
}
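
// Example of the overlap handling above (illustrative extents): if a txc
// allocated [0x10000, 0x30000) and released [0x20000, 0x40000), the common
// region [0x20000, 0x30000) is dropped from both sides before the freelist
// is touched:
//
//   interval_set<uint64_t> a, r, overlap;
//   a.insert(0x10000, 0x20000);      // allocated: offset, length
//   r.insert(0x20000, 0x20000);      // released
//   overlap.intersection_of(a, r);   // -> [0x20000, 0x30000)
//   a.subtract(overlap);             // -> [0x10000, 0x20000)
//   r.subtract(overlap);             // -> [0x30000, 0x40000)
//
// so fm->allocate() and fm->release() never see the same region twice
// within one transaction.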

void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
{
  ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
  {
#if defined(WITH_LTTNG)
    auto start = mono_clock::now();
#endif

#ifdef WITH_BLKIN
    if (txc->trace) {
      txc->trace.event("db async submit");
    }
#endif

    int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
    ceph_assert(r == 0);
    txc->set_state(TransContext::STATE_KV_SUBMITTED);
    if (txc->osr->kv_submitted_waiters) {
      std::lock_guard l(txc->osr->qlock);
      txc->osr->qcond.notify_all();
    }

#if defined(WITH_LTTNG)
    if (txc->tracing) {
      tracepoint(
        bluestore,
        transaction_kv_submit_latency,
        txc->osr->get_sequencer_id(),
        txc->seq,
        sync_submit_transaction,
        ceph::to_seconds<double>(mono_clock::now() - start));
    }
#endif
  }

  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
               << dendl;
      if (--o->flushing_count == 0 && o->waiting_count.load()) {
        std::lock_guard l(o->flush_lock);
        o->flush_cond.notify_all();
      }
    }
  }
}

void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;
  throttle.complete_kv(*txc);
  {
    std::lock_guard l(txc->osr->qlock);
    txc->set_state(TransContext::STATE_KV_DONE);
    if (txc->ch->commit_queue) {
      txc->ch->commit_queue->queue(txc->oncommits);
    } else {
      finisher.queue(txc->oncommits);
    }
  }
  throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
  log_latency_fn(
    __func__,
    l_bluestore_commit_lat,
    mono_clock::now() - txc->start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) {
      return ", txc = " + stringify(txc);
    }
  );
}

void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->finish_write(txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  bool empty = false;
  bool submit_deferred = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard l(osr->qlock);
    txc->set_state(TransContext::STATE_DONE);
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
               << dendl;
      if (txc->get_state() != TransContext::STATE_DONE) {
        if (txc->get_state() == TransContext::STATE_PREPARE &&
            deferred_aggressive) {
          // for _osr_drain_preceding()
          notify = true;
        }
        if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
            osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
          submit_deferred = true;
        }
        break;
      }

      osr->q.pop_front();
      releasing_txc.push_back(*txc);
    }

    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }

    // only drain()/drain_preceding() need wakeup,
    // other cases use kv_submitted_waiters
    if (notify || empty) {
      osr->qcond.notify_all();
    }
  }

  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
    throttle.complete(*txc);
    delete txc;
  }

  if (submit_deferred) {
    // we're pinning memory; flush!  we could be more fine-grained here but
    // i'm not sure it's worth the bother.
    deferred_try_submit();
  }

  if (empty && osr->zombie) {
    std::lock_guard l(zombie_osr_lock);
    if (zombie_osr_set.erase(osr->cid)) {
      dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    } else {
      dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
               << dendl;
    }
  }
}

void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // it's expected we're called with lazy_release_lock already taken!
  if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
    int r = 0;
    if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
      r = bdev->queue_discard(txc->released);
      if (r == 0) {
        dout(10) << __func__ << "(queued) " << txc << " " << std::hex
                 << txc->released << std::dec << dendl;
        goto out;
      }
    } else if (cct->_conf->bdev_enable_discard) {
      for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
        bdev->discard(p.get_start(), p.get_len());
      }
    }
    dout(10) << __func__ << "(sync) " << txc << " " << std::hex
             << txc->released << std::dec << dendl;
    shared_alloc.a->release(txc->released);
  }

out:
  txc->allocated.clear();
  txc->released.clear();
}

void BlueStore::_osr_attach(Collection *c)
{
  // note: caller has RWLock on coll_map
  auto q = coll_map.find(c->cid);
  if (q != coll_map.end()) {
    c->osr = q->second->osr;
    ldout(cct, 10) << __func__ << " " << c->cid
                   << " reusing osr " << c->osr << " from existing coll "
                   << q->second << dendl;
  } else {
    std::lock_guard l(zombie_osr_lock);
    auto p = zombie_osr_set.find(c->cid);
    if (p == zombie_osr_set.end()) {
      c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " fresh osr " << c->osr << dendl;
    } else {
      c->osr = p->second;
      zombie_osr_set.erase(p);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " resurrecting zombie osr " << c->osr << dendl;
      c->osr->zombie = false;
    }
  }
}

void BlueStore::_osr_register_zombie(OpSequencer *osr)
{
  std::lock_guard l(zombie_osr_lock);
  dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
  osr->zombie = true;
  auto i = zombie_osr_set.emplace(osr->cid, osr);
  // this is either a new insertion or the same osr is already there
  ceph_assert(i.second || i.first->second == osr);
}

void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    osr->deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      osr->deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain(OpSequencer *osr)
{
  dout(10) << __func__ << " " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    osr->deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      osr->deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain();
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  vector<OpSequencerRef> zombies;
  {
    std::shared_lock l(coll_lock);
    for (auto& i : coll_map) {
      s.insert(i.second->osr);
    }
  }
  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& i : zombie_osr_set) {
      s.insert(i.second);
      zombies.push_back(i.second);
    }
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& osr : zombies) {
      if (zombie_osr_set.erase(osr->cid)) {
        dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
        ceph_assert(osr->q.empty());
      } else if (osr->zombie) {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " already reaped" << dendl;
        ceph_assert(osr->q.empty());
      } else {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " resurrected" << dendl;
      }
    }
  }

  dout(10) << __func__ << " done" << dendl;
}

void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  finisher.start();
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}

void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{kv_lock};
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock l{kv_finalize_lock};
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  ceph_assert(removed_collections.empty());
  {
    std::lock_guard l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  finisher.wait_for_empty();
  finisher.stop();
  dout(10) << __func__ << " stopped" << dendl;
}

void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
  std::unique_lock l{kv_lock};
  ceph_assert(!kv_sync_started);
  kv_sync_started = true;
  kv_cond.notify_all();

  auto t0 = mono_clock::now();
  timespan twait = ceph::make_timespan(0);
  size_t kv_submitted = 0;

  while (true) {
    auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
    auto observation_period =
      ceph::make_timespan(period);
    auto elapsed = mono_clock::now() - t0;
    if (period && elapsed >= observation_period) {
      dout(5) << __func__ << " utilization: idle "
              << twait << " of " << elapsed
              << ", submitted: " << kv_submitted
              << dendl;
      t0 = mono_clock::now();
      twait = ceph::make_timespan(0);
      kv_submitted = 0;
    }
    ceph_assert(kv_committing.empty());
    if (kv_queue.empty() &&
        ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
         !deferred_aggressive)) {
      if (kv_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      auto t = mono_clock::now();
      kv_sync_in_progress = false;
      kv_cond.wait(l);
      twait += mono_clock::now() - t;

      dout(20) << __func__ << " wake" << dendl;
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      uint64_t aios = 0, costs = 0;

      dout(20) << __func__ << " committing " << kv_queue.size()
               << " submitting " << kv_queue_unsubmitted.size()
               << " deferred done " << deferred_done_queue.size()
               << " stable " << deferred_stable_queue.size()
               << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      aios = kv_ios;
      costs = kv_throttle_costs;
      kv_ios = 0;
      kv_throttle_costs = 0;
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable. that means that if we do have done deferred
      // txcs AND we are not on a single device, we need to force a flush.
      if (bluefs && bluefs_layout.single_shared_device()) {
        if (aios) {
          force_flush = true;
        } else if (kv_committing.empty() && deferred_stable.empty()) {
          force_flush = true;  // there's nothing else to commit!
        } else if (deferred_aggressive) {
          force_flush = true;
        }
      } else {
        if (aios || !deferred_done.empty()) {
          force_flush = true;
        } else {
          dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
        }
      }

      if (force_flush) {
        dout(20) << __func__ << " num_aios=" << aios
                 << " force_flush=" << (int)force_flush
                 << ", flushing, deferred done->stable" << dendl;
        // flush/barrier on block device
        bdev->flush();

        // if we flush then deferred done are now deferred stable
        deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
                               deferred_done.end());
        deferred_done.clear();
      }
      auto after_flush = mono_clock::now();

      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max?  note that this covers both the
      // case where we are approaching the max and the case we passed
      // it.  in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
        bufferlist bl;
        encode(new_nid_max, bl);
        t->set(PREFIX_SUPER, "nid_max", bl);
        dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
        bufferlist bl;
        encode(new_blobid_max, bl);
        t->set(PREFIX_SUPER, "blobid_max", bl);
        dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }

      for (auto txc : kv_committing) {
        throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
        if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
          ++kv_submitted;
          _txc_apply_kv(txc, false);
          --txc->osr->kv_committing_serially;
        } else {
          ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
        }
        if (txc->had_ios) {
          --txc->osr->txc_with_unstable_io;
        }
      }

      // release throttle *before* we commit.  this allows new ops
      // to be prepared and enter pipeline while we are waiting on
      // the kv commit sync/flush.  then hopefully on the next
      // iteration there will already be ops awake.  otherwise, we
      // end up going to sleep, and then wake up when the very first
      // transaction is ready for commit.
      throttle.release_kv_throttle(costs);

      // cleanup sync deferred keys
      for (auto b : deferred_stable) {
        for (auto& txc : b->txcs) {
          bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
          ceph_assert(wt.released.empty()); // only kraken did this
          string key;
          get_deferred_key(wt.seq, &key);
          synct->rm_single_key(PREFIX_DEFERRED, key);
        }
      }

#if defined(WITH_LTTNG)
      auto sync_start = mono_clock::now();
#endif
      // submit synct synchronously (block and wait for it to commit)
      int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
      ceph_assert(r == 0);

#ifdef WITH_BLKIN
      for (auto txc : kv_committing) {
        if (txc->trace) {
          txc->trace.event("db sync submit");
          txc->trace.keyval("kv_committing size", kv_committing.size());
        }
      }
#endif

      int committing_size = kv_committing.size();
      int deferred_size = deferred_stable.size();

#if defined(WITH_LTTNG)
      double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
      for (auto txc : kv_committing) {
        if (txc->tracing) {
          tracepoint(
            bluestore,
            transaction_kv_sync_latency,
            txc->osr->get_sequencer_id(),
            txc->seq,
            kv_committing.size(),
            deferred_done.size(),
            deferred_stable.size(),
            sync_latency);
        }
      }
#endif

      {
        std::unique_lock m{kv_finalize_lock};
        if (kv_committing_to_finalize.empty()) {
          kv_committing_to_finalize.swap(kv_committing);
        } else {
          kv_committing_to_finalize.insert(
            kv_committing_to_finalize.end(),
            kv_committing.begin(),
            kv_committing.end());
          kv_committing.clear();
        }
        if (deferred_stable_to_finalize.empty()) {
          deferred_stable_to_finalize.swap(deferred_stable);
        } else {
          deferred_stable_to_finalize.insert(
            deferred_stable_to_finalize.end(),
            deferred_stable.begin(),
            deferred_stable.end());
          deferred_stable.clear();
        }
        if (!kv_finalize_in_progress) {
          kv_finalize_in_progress = true;
          kv_finalize_cond.notify_one();
        }
      }

      if (new_nid_max) {
        nid_max = new_nid_max;
        dout(10) << __func__ << " nid_max now " << nid_max << dendl;
      }
      if (new_blobid_max) {
        blobid_max = new_blobid_max;
        dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
      }

      {
        auto finish = mono_clock::now();
        ceph::timespan dur_flush = after_flush - start;
        ceph::timespan dur_kv = finish - after_flush;
        ceph::timespan dur = finish - start;
        dout(20) << __func__ << " committed " << committing_size
                 << " cleaned " << deferred_size
                 << " in " << dur
                 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
                 << dendl;
        log_latency("kv_flush",
          l_bluestore_kv_flush_lat,
          dur_flush,
          cct->_conf->bluestore_log_op_age);
        log_latency("kv_commit",
          l_bluestore_kv_commit_lat,
          dur_kv,
          cct->_conf->bluestore_log_op_age);
        log_latency("kv_sync",
          l_bluestore_kv_sync_lat,
          dur,
          cct->_conf->bluestore_log_op_age);
      }

      l.lock();
      // previously deferred "done" are now "stable" by virtue of this
      // commit cycle.
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_sync_started = false;
}
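
// Worked example of the preallocation above (assuming the default
// bluestore_nid_prealloc of 1024; state values are illustrative): with
// nid_last = 9700 and nid_max = 10240, 9700 + 512 = 10212 < 10240, so no
// update happens. Once nid_last reaches 9729, 9729 + 512 > 10240 and the
// thread persists new_nid_max = 9729 + 1024 = 10753 in the earlier txn.
// New nids keep being handed out from memory while the higher ceiling
// commits, which is why _txc_state_proc diverts to the kv thread whenever
// txc->last_nid >= nid_max.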

void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
        deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_in_progress = false;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
        TransContext *txc = kv_committed.front();
        ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle.should_submit_deferred()) {
          deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
                  (uint64_t)(shared_alloc.a->get_fragmentation() * 1000));

      log_latency("kv_final",
        l_bluestore_kv_final_lat,
        mono_clock::now() - start,
        cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}

void BlueStore::_zoned_cleaner_start() {
  dout(10) << __func__ << dendl;

  zoned_cleaner_thread.create("bstore_zcleaner");
}

void BlueStore::_zoned_cleaner_stop() {
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{zoned_cleaner_lock};
    while (!zoned_cleaner_started) {
      zoned_cleaner_cond.wait(l);
    }
    zoned_cleaner_stop = true;
    zoned_cleaner_cond.notify_all();
  }
  zoned_cleaner_thread.join();
  {
    std::lock_guard l{zoned_cleaner_lock};
    zoned_cleaner_stop = false;
  }
  dout(10) << __func__ << " done" << dendl;
}

void BlueStore::_zoned_cleaner_thread() {
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l{zoned_cleaner_lock};
  ceph_assert(!zoned_cleaner_started);
  zoned_cleaner_started = true;
  zoned_cleaner_cond.notify_all();
  std::deque<uint64_t> zones_to_clean;
  while (true) {
    if (zoned_cleaner_queue.empty()) {
      if (zoned_cleaner_stop) {
        break;
      }
      dout(20) << __func__ << " sleep" << dendl;
      zoned_cleaner_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      zones_to_clean.swap(zoned_cleaner_queue);
      l.unlock();
      while (!zones_to_clean.empty()) {
        _zoned_clean_zone(zones_to_clean.front());
        zones_to_clean.pop_front();
      }
      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  zoned_cleaner_started = false;
}

void BlueStore::_zoned_clean_zone(uint64_t zone_num) {
  dout(10) << __func__ << " cleaning zone " << zone_num << dendl;
}

bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}

void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;

  DeferredBatch *tmp;
  txc->osr->deferred_lock.lock();
  {
    if (!txc->osr->deferred_pending) {
      tmp = new DeferredBatch(cct, txc->osr.get());
    } else {
      tmp = txc->osr->deferred_pending;
    }
  }

  tmp->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
    }
  }

  {
    ++deferred_queue_size;
    txc->osr->deferred_pending = tmp;
    // the condition "tmp->txcs.size() == 1" means deferred_pending was
    // originally empty, so we should add the osr to deferred_queue.
    if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
      deferred_lock.lock();
      deferred_queue.push_back(*txc->osr);
      deferred_lock.unlock();
    }

    if (deferred_aggressive &&
        !txc->osr->deferred_running) {
      _deferred_submit_unlock(txc->osr.get());
    } else {
      txc->osr->deferred_lock.unlock();
    }
  }
}

void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  vector<OpSequencerRef> osrs;

  {
    std::lock_guard l(deferred_lock);
    osrs.reserve(deferred_queue.size());
    for (auto& osr : deferred_queue) {
      osrs.push_back(&osr);
    }
  }

  for (auto& osr : osrs) {
    osr->deferred_lock.lock();
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
        _deferred_submit_unlock(osr.get());
      } else {
        osr->deferred_lock.unlock();
        dout(20) << __func__ << " osr " << osr << " already has running"
                 << dendl;
      }
    } else {
      osr->deferred_lock.unlock();
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }

  {
    std::lock_guard l(deferred_lock);
    deferred_last_submitted = ceph_clock_now();
  }
}

void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  osr->deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf()->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_deferred_write_ops);
          logger->inc(l_bluestore_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          ceph_assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}

struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};

void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    osr->deferred_lock.lock();
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      {
        deferred_lock.lock();
        auto q = deferred_queue.iterator_to(*osr);
        deferred_queue.erase(q);
        deferred_lock.unlock();
      }
      osr->deferred_lock.unlock();
    } else {
      osr->deferred_lock.unlock();
      if (deferred_aggressive) {
        dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
        finisher.queue(new C_DeferredTrySubmit(this));
      } else {
        dout(20) << __func__ << " leaving queued, more pending" << dendl;
      }
    }
  }

  {
    uint64_t costs = 0;
    {
      for (auto& i : b->txcs) {
        TransContext *txc = &i;
        throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
        txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
        costs += txc->cost;
      }
    }
    throttle.release_deferred_throttle(costs);
  }

  {
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);

    // in the normal case, do not bother waking up the kv thread; it will
    // catch us on the next commit anyway.
    if (deferred_aggressive && !kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
}

int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (ceph::buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(ch.get(), osr, nullptr);
    txc->deferred_txn = deferred_txn;
    txc->set_state(TransContext::STATE_KV_DONE);
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}

// ---------------------------
// transactions

int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE(cct);
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection*>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
                                  &on_commit, op);

  // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
  // submission to happen atomically because if I/O submission happens in a
  // different order than I/O allocation, we end up issuing non-sequential
  // writes to the drive.  This is a temporary solution until ZONE APPEND
  // support matures in the kernel.  For more information please see:
  // https://www.usenix.org/conference/vault20/presentation/bjorling
  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.lock();
  }
  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc encode finished");
  }
#endif

  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();

  if (!throttle.try_start_transaction(
        *db,
        *txc,
        tstart)) {
    // ensure we do not block here because of deferred writes
    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
             << dendl;
    ++deferred_aggressive;
    deferred_try_submit();
    {
      // wake up any previously finished deferred events
      std::lock_guard l(kv_lock);
      if (!kv_sync_in_progress) {
        kv_sync_in_progress = true;
        kv_cond.notify_one();
      }
    }
    throttle.finish_start_transaction(*db, *txc, tstart);
    --deferred_aggressive;
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  if (bdev->is_smr()) {
    atomic_alloc_and_submit_lock.unlock();
  }

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

#ifdef WITH_BLKIN
  if (txc->trace) {
    txc->trace.event("txc applied");
  }
#endif

  log_latency("submit_transact",
    l_bluestore_submit_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
    l_bluestore_throttle_lat,
    tend - tstart,
    cct->_conf->bluestore_log_op_age);
  return 0;
}
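
// Typical caller-side usage (sketch; error handling and op tracking
// omitted):
//
//   ObjectStore::Transaction t;
//   t.write(cid, oid, 0, bl.length(), bl);  // stage an object write
//   vector<ObjectStore::Transaction> tls;
//   tls.push_back(std::move(t));
//   store->queue_transactions(ch, tls);     // returns once prepared;
//                                           // commit is signaled via the
//                                           // collected on_commit contexts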

void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}

void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;


    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
                  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
        const coll_t &cid = i.get_cid(op->cid);
        r = _remove_collection(txc, cid, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
        ceph_assert(!c);
        const coll_t &cid = i.get_cid(op->cid);
        r = _create_collection(txc, cid, op->split_bits, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
        uint32_t bits = op->split_bits;
        uint32_t rem = op->split_rem;
        r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
        uint32_t bits = op->split_bits;
        r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
        uint32_t type = op->hint;
        bufferlist hint;
        i.decode_bl(hint);
        auto hiter = hint.cbegin();
        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
          uint32_t pg_num;
          uint64_t num_objs;
          decode(pg_num, hiter);
          decode(num_objs, hiter);
          dout(10) << __func__ << " collection hint objects is a no-op, "
                   << " pg_num " << pg_num << " num_objects " << num_objs
                   << dendl;
        } else {
          // Ignore the hint
          dout(10) << __func__ << " unknown collection hint " << type << dendl;
        }
        continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
           << " not handled on operation " << op->op
           << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
        op->op == Transaction::OP_CREATE ||
        op->op == Transaction::OP_WRITE ||
        op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    std::unique_lock l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
               << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_CREATE:
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        uint32_t fadvise_flags = i.get_fadvise_flags();
        bufferlist bl;
        i.decode_bl(bl);
        r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
        // deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
        uint64_t off = op->off;
        r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
        r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
        string name = i.decode_string();
        bufferptr bp;
        i.decode_bp(bp);
        r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
        map<string, bufferptr> aset;
        i.decode_attrset(aset);
        r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
        string name = i.decode_string();
        r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
        r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        uint64_t srcoff = op->off;
        uint64_t len = op->len;
        uint64_t dstoff = op->dest_off;
        r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
        ceph_assert(op->cid == op->dest_cid);
        const ghobject_t& noid = i.get_oid(op->dest_oid);
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          no = c->get_onode(noid, false);
        }
        r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
        r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
        bufferlist aset_bl;
        i.decode_attrset_bl(&aset_bl);
        r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
        bufferlist keys_bl;
        i.decode_keyset_bl(&keys_bl);
        r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
        string first, last;
        first = i.decode_string();
        last = i.decode_string();
        r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
        bufferlist bl;
        i.decode_bl(bl);
        r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
        r = _set_alloc_hint(txc, c, o,
                            op->expected_object_size,
                            op->expected_write_size,
                            op->hint);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
                            op->op == Transaction::OP_CLONE ||
                            op->op == Transaction::OP_CLONERANGE2 ||
                            op->op == Transaction::OP_COLL_ADD ||
                            op->op == Transaction::OP_SETATTR ||
                            op->op == Transaction::OP_SETATTRS ||
                            op->op == Transaction::OP_RMATTR ||
                            op->op == Transaction::OP_OMAP_SETKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
                            op->op == Transaction::OP_OMAP_SETHEADER))
        // -ENOENT is usually okay
        ok = true;
      if (r == -ENODATA)
        ok = true;

      if (!ok) {
        const char *msg = "unexpected error code";

        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
                             op->op == Transaction::OP_CLONE ||
                             op->op == Transaction::OP_CLONERANGE2))
          msg = "ENOENT on clone suggests osd bug";

        if (r == -ENOSPC)
          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
          // by partially applying transactions.
          msg = "ENOSPC from bluestore, misconfigured cluster";

        if (r == -ENOTEMPTY) {
          msg = "ENOTEMPTY suggests garbage data in osd data dir";
        }

        derr << __func__ << " error " << cpp_strerror(r)
             << " not handled on operation " << op->op
             << " (op " << pos << ", counting from 0)"
             << dendl;
        derr << msg << dendl;
        _dump_transaction<0>(cct, t);
        ceph_abort_msg("unexpected error");
      }
    }
  }
}

// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}

void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
           << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->begin().copy(front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
           << back_pad << " on front/back, now 0x" << *offset << "~"
           << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}
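
// Worked example (illustrative numbers): padding a 0x500-byte write at
// offset 0x1234 with chunk_size 0x1000:
//
//   front_pad  = 0x1234 % 0x1000            = 0x234
//   front_copy = min(0x1000 - 0x234, 0x500) = 0x500  (whole payload fits)
//   back_pad   = 0x1000 - (0x500 + 0x234)   = 0x8cc
//
// yielding one chunk-aligned buffer at offset 0x1000 of length 0x1000:
// 0x234 zero bytes, the 0x500 payload bytes, then 0x8cc zero bytes, with
// l_bluestore_write_pad_bytes increased by 0xb00.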
13204 void BlueStore::_do_write_small(
13208 uint64_t offset
, uint64_t length
,
13209 bufferlist::iterator
& blp
,
13210 WriteContext
*wctx
)
13212 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
13213 << std::dec
<< dendl
;
13214 ceph_assert(length
< min_alloc_size
);
13216 uint64_t end_offs
= offset
+ length
;
13218 logger
->inc(l_bluestore_write_small
);
13219 logger
->inc(l_bluestore_write_small_bytes
, length
);
13222 blp
.copy(length
, bl
);
13224 auto max_bsize
= std::max(wctx
->target_blob_size
, min_alloc_size
);
13225 auto min_off
= offset
>= max_bsize
? offset
- max_bsize
: 0;
13226 uint32_t alloc_len
= min_alloc_size
;
13227 auto offset0
= p2align
<uint64_t>(offset
, alloc_len
);
13231 // search suitable extent in both forward and reverse direction in
13232 // [offset - target_max_blob_size, offset + target_max_blob_size] range
13233 // then check if blob can be reused via can_reuse_blob func or apply
13234 // direct/deferred write (the latter for extents including or higher
13235 // than 'offset' only).
13236 o
->extent_map
.fault_range(db
, min_off
, offset
+ max_bsize
- min_off
);
13238 // On zoned devices, the first goal is to support non-overwrite workloads,
13239 // such as RGW, with large, aligned objects. Therefore, for user writes
13240 // _do_write_small should not trigger. OSDs, however, write and update a tiny
13241 // amount of metadata, such as OSD maps, to disk. For those cases, we
13242 // temporarily just pad them to min_alloc_size and write them to a new place
13243 // on every update.
13244 if (bdev
->is_smr()) {
13245 BlobRef b
= c
->new_blob();
13246 uint64_t b_off
= p2phase
<uint64_t>(offset
, alloc_len
);
13247 uint64_t b_off0
= b_off
;
13248 _pad_zeros(&bl
, &b_off0
, min_alloc_size
);
13249 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
13250 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
, false, true);
13254 // Look for an existing mutable blob we can use.
13255 auto begin
= o
->extent_map
.extent_map
.begin();
13256 auto end
= o
->extent_map
.extent_map
.end();
13257 auto ep
= o
->extent_map
.seek_lextent(offset
);
13260 if (ep
->blob_end() <= offset
) {
13264 auto prev_ep
= end
;
13270 boost::container::flat_set
<const bluestore_blob_t
*> inspected_blobs
;
13271 // We don't want to have more blobs than min alloc units fit
13272 // into 2 max blobs
13273 size_t blob_threshold
= max_blob_size
/ min_alloc_size
* 2 + 1;
13274 bool above_blob_threshold
= false;
13276 inspected_blobs
.reserve(blob_threshold
);
13278 uint64_t max_off
= 0;
13279 auto start_ep
= ep
;
13280 auto end_ep
= ep
; // exclusively
13282 any_change
= false;
13284 if (ep
!= end
&& ep
->logical_offset
< offset
+ max_bsize
) {
13285 BlobRef b
= ep
->blob
;
13286 if (!above_blob_threshold
) {
13287 inspected_blobs
.insert(&b
->get_blob());
13288 above_blob_threshold
= inspected_blobs
.size() >= blob_threshold
;
13290 max_off
= ep
->logical_end();
13291 auto bstart
= ep
->blob_start();
13293 dout(20) << __func__
<< " considering " << *b
13294 << " bstart 0x" << std::hex
<< bstart
<< std::dec
<< dendl
;
13295 if (bstart
>= end_offs
) {
13296 dout(20) << __func__
<< " ignoring distant " << *b
<< dendl
;
13297 } else if (!b
->get_blob().is_mutable()) {
13298 dout(20) << __func__
<< " ignoring immutable " << *b
<< dendl
;
13299 } else if (ep
->logical_offset
% min_alloc_size
!=
13300 ep
->blob_offset
% min_alloc_size
) {
13301 dout(20) << __func__
<< " ignoring offset-skewed " << *b
<< dendl
;
13303 uint64_t chunk_size
= b
->get_blob().get_chunk_size(block_size
);
13304 // can we pad our head/tail out with zeros?
13305 uint64_t head_pad
, tail_pad
;
13306 head_pad
= p2phase(offset
, chunk_size
);
13307 tail_pad
= p2nphase(end_offs
, chunk_size
);
13308 if (head_pad
|| tail_pad
) {
13309 o
->extent_map
.fault_range(db
, offset
- head_pad
,
13310 end_offs
- offset
+ head_pad
+ tail_pad
);
13313 o
->extent_map
.has_any_lextents(offset
- head_pad
, chunk_size
)) {
13316 if (tail_pad
&& o
->extent_map
.has_any_lextents(end_offs
, tail_pad
)) {
13320 uint64_t b_off
= offset
- head_pad
- bstart
;
13321 uint64_t b_len
= length
+ head_pad
+ tail_pad
;
        // direct write into unused blocks of an existing mutable blob?
        if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
            b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b->get_blob().is_unused(b_off, b_len) &&
            b->get_blob().is_allocated(b_off, b_len)) {
          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << " write to unused 0x" << std::hex
                   << b_off << "~" << b_len
                   << " pad 0x" << head_pad << " + 0x" << tail_pad
                   << std::dec << " of mutable " << *b << dendl;
          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            if (b_len <= prefer_deferred_size) {
              dout(20) << __func__ << " deferring small 0x" << std::hex
                       << b_len << std::dec << " unused write via deferred" << dendl;
              bluestore_deferred_op_t *op = _get_deferred_op(txc);
              op->op = bluestore_deferred_op_t::OP_WRITE;
              b->get_blob().map(
                b_off, b_len,
                [&](uint64_t offset, uint64_t length) {
                  op->extents.emplace_back(bluestore_pextent_t(offset, length));
                  return 0;
                });
              op->data = bl;
            } else {
              b->get_blob().map_bl(
                b_off, bl,
                [&](uint64_t offset, bufferlist& t) {
                  bdev->aio_write(offset, t,
                                  &txc->ioc, wctx->buffered);
                });
            }
          }
          b->dirty_blob().calc_csum(b_off, bl);
          dout(20) << __func__ << " lex old " << *ep << dendl;
          Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
                                                 b,
                                                 &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);

          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
        // read some data to fill out the chunk?
        uint64_t head_read = p2phase(b_off, chunk_size);
        uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
        if ((head_read || tail_read) &&
            (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
            head_read + tail_read < min_alloc_size) {
          b_off -= head_read;
          b_len += head_read + tail_read;
        } else {
          head_read = tail_read = 0;
        }
        // chunk-aligned deferred overwrite?
        if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b_off % chunk_size == 0 &&
            b_len % chunk_size == 0 &&
            b->get_blob().is_allocated(b_off, b_len)) {

          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << " reading head 0x" << std::hex << head_read
                   << " and tail 0x" << tail_read << std::dec << dendl;
          if (head_read) {
            bufferlist head_bl;
            int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
                             head_bl, 0);
            ceph_assert(r >= 0 && r <= (int)head_read);
            size_t zlen = head_read - r;
            if (zlen) {
              head_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            head_bl.claim_append(bl);
            bl.swap(head_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          if (tail_read) {
            bufferlist tail_bl;
            int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
                             tail_bl, 0);
            ceph_assert(r >= 0 && r <= (int)tail_read);
            size_t zlen = tail_read - r;
            if (zlen) {
              tail_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            bl.claim_append(tail_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          logger->inc(l_bluestore_write_small_pre_read);
          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          b->dirty_blob().calc_csum(b_off, bl);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            bluestore_deferred_op_t *op = _get_deferred_op(txc);
            op->op = bluestore_deferred_op_t::OP_WRITE;
            int r = b->get_blob().map(
              b_off, b_len,
              [&](uint64_t offset, uint64_t length) {
                op->extents.emplace_back(bluestore_pextent_t(offset, length));
                return 0;
              });
            ceph_assert(r == 0);
            op->data = std::move(bl);
            dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
                     << b_len << std::dec << " of mutable " << *b
                     << " at " << op->extents << dendl;
          }

          Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
                                                 b, &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          logger->inc(l_bluestore_write_deferred);
          return;
        }
        // try to reuse blob if we can
        if (b->can_reuse_blob(min_alloc_size,
                              max_bsize,
                              offset0 - bstart,
                              &alloc_len)) {
          ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                    // fit into reused blob
          // Need to check for pending writes desiring to
          // reuse the same pextent. The rationale is that during GC two chunks
          // from garbage blobs (compressed?) can share logical space within the
          // same AU. That in turn might be caused by an unaligned len in
          // clone_range2. Hence the second write would fail in an attempt to
          // reuse the blob at do_alloc_write().
          if (!wctx->has_conflict(b,
                                  offset0,
                                  offset0 + alloc_len,
                                  min_alloc_size)) {

            // we can't reuse pad_head/pad_tail since they might be truncated
            // due to existent extents
            uint64_t b_off = offset - bstart;
            uint64_t b_off0 = b_off;
            _pad_zeros(&bl, &b_off0, chunk_size);

            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off0 << "~" << bl.length() << ")"
                     << " (0x" << b_off << "~" << length << ")"
                     << std::dec << dendl;

            o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
            wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                        false, false);
            logger->inc(l_bluestore_write_small_unused);
            return;
          }
        }
      }
      ++ep;
      end_ep = ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)
    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      start_ep = prev_ep;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
                            max_bsize,
                            offset0 - bstart,
                            &alloc_len)) {
        ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                  // fit into reused blob
        // Need to check for pending writes desiring to
        // reuse the same pextent. The rationale is that during GC two chunks
        // from garbage blobs (compressed?) can share logical space within the
        // same AU. That in turn might be caused by an unaligned len in
        // clone_range2. Hence the second write would fail in an attempt to
        // reuse the blob at do_alloc_write().
        if (!wctx->has_conflict(b,
                                offset0,
                                offset0 + alloc_len,
                                min_alloc_size)) {

          uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
          uint64_t b_off = offset - bstart;
          uint64_t b_off0 = b_off;
          _pad_zeros(&bl, &b_off0, chunk_size);

          dout(20) << __func__ << " reuse blob " << *b << std::hex
                   << " (0x" << b_off0 << "~" << bl.length() << ")"
                   << " (0x" << b_off << "~" << length << ")"
                   << std::dec << dendl;

          o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
          wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                      false, false);
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
      }
      if (prev_ep != begin) {
        --prev_ep;
        any_change = true;
      } else {
        prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);
  if (above_blob_threshold) {
    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
             << " " << std::hex << min_off << "~" << max_off << std::dec
             << dendl;
    ceph_assert(start_ep != end_ep);
    for (auto ep = start_ep; ep != end_ep; ++ep) {
      dout(20) << __func__ << " inserting for GC "
               << std::hex << ep->logical_offset << "~" << ep->length
               << std::dec << dendl;

      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
    }
    // insert newly written extent to GC
    wctx->extents_to_gc.union_insert(offset, length);
    dout(20) << __func__ << " inserting (last) for GC "
             << std::hex << offset << "~" << length
             << std::dec << dendl;
  }
  BlobRef b = c->new_blob();
  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
              min_alloc_size != block_size, // use 'unused' bitmap when alloc
                                            // granularity doesn't match the
                                            // disk one only
              true);

  return;
}
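// NOTE (editorial, illustrative; not in the original source):
// BigDeferredWriteContext captures what is needed to turn a big overwrite
// into a deferred write against an existing blob: the chunk-aligned blob
// offset (b_off), how much of the payload lands in this blob (used), and how
// much must be read back to fill out the head/tail chunks
// (head_read/tail_read).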
bool BlueStore::BigDeferredWriteContext::can_defer(
    BlueStore::extent_map_t::iterator ep,
    uint64_t prefer_deferred_size,
    uint64_t block_size,
    uint64_t offset,
    uint64_t l)
{
  bool res = false;
  auto& blob = ep->blob->get_blob();
  if (offset >= ep->blob_start() &&
      blob.is_mutable()) {
    off = offset;
    b_off = offset - ep->blob_start();
    uint64_t chunk_size = blob.get_chunk_size(block_size);
    uint64_t ondisk = blob.get_ondisk_length();
    used = std::min(l, ondisk - b_off);

    // will read some data to fill out the chunk?
    head_read = p2phase<uint64_t>(b_off, chunk_size);
    tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
    b_off -= head_read;

    ceph_assert(b_off % chunk_size == 0);
    ceph_assert(blob_aligned_len() % chunk_size == 0);

    res = blob_aligned_len() <= prefer_deferred_size &&
      blob_aligned_len() <= ondisk &&
      blob.is_allocated(b_off, blob_aligned_len());
    if (res) {
      blob_ref = ep->blob;
      blob_start = ep->blob_start();
    }
  }
  return res;
}
bool BlueStore::BigDeferredWriteContext::apply_defer()
{
  int r = blob_ref->get_blob().map(
    b_off, blob_aligned_len(),
    [&](const bluestore_pextent_t& pext,
        uint64_t offset,
        uint64_t length) {
      // apply deferred if the overwrite breaks blob continuity only.
      // if it totally overlaps some pextent - fall back to a regular write
      if (pext.offset < offset ||
          pext.end() > offset + length) {
        res_extents.emplace_back(bluestore_pextent_t(offset, length));
        return 0;
      }
      return -1;
    });
  return r >= 0;
}
void BlueStore::_do_write_big_apply_deferred(
    TransContext* txc,
    CollectionRef& c,
    OnodeRef o,
    BlueStore::BigDeferredWriteContext& dctx,
    bufferlist::iterator& blp,
    WriteContext* wctx)
{
  bufferlist bl;
  dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
           << " and tail 0x" << dctx.tail_read << std::dec << dendl;
  if (dctx.head_read) {
    int r = _do_read(c.get(), o,
                     dctx.off - dctx.head_read,
                     dctx.head_read,
                     bl,
                     0);
    ceph_assert(r >= 0 && r <= (int)dctx.head_read);
    size_t zlen = dctx.head_read - r;
    if (zlen) {
      bl.append_zero(zlen);
      logger->inc(l_bluestore_write_pad_bytes, zlen);
    }
    logger->inc(l_bluestore_write_penalty_read_ops);
  }
  blp.copy(dctx.used, bl);

  if (dctx.tail_read) {
    bufferlist tail_bl;
    int r = _do_read(c.get(), o,
                     dctx.off + dctx.used, dctx.tail_read,
                     tail_bl, 0);
    ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
    size_t zlen = dctx.tail_read - r;
    if (zlen) {
      tail_bl.append_zero(zlen);
      logger->inc(l_bluestore_write_pad_bytes, zlen);
    }
    bl.claim_append(tail_bl);
    logger->inc(l_bluestore_write_penalty_read_ops);
  }
  auto& b0 = dctx.blob_ref;
  _buffer_cache_write(txc, b0, dctx.b_off, bl,
                      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

  b0->dirty_blob().calc_csum(dctx.b_off, bl);

  Extent* le = o->extent_map.set_lextent(c, dctx.off,
                                         dctx.off - dctx.blob_start, dctx.used,
                                         b0, &wctx->old_extents);

  // in fact this is a no-op for big writes, but it is left here to maintain
  // uniformity and so it isn't missed by some future refactor.
  b0->dirty_blob().mark_used(le->blob_offset, le->length);
  txc->statfs_delta.stored() += le->length;

  if (!g_conf()->bluestore_debug_omit_block_device_write) {
    bluestore_deferred_op_t* op = _get_deferred_op(txc);
    op->op = bluestore_deferred_op_t::OP_WRITE;
    op->extents.swap(dctx.res_extents);
    op->data = std::move(bl);
  }
}
void BlueStore::_do_write_big(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " target_blob_size 0x" << wctx->target_blob_size << std::dec
           << " compress " << (int)wctx->compress
           << dendl;
  logger->inc(l_bluestore_write_big);
  logger->inc(l_bluestore_write_big_bytes, length);
  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
  while (length > 0) {
    bool new_blob = false;
    uint32_t l = std::min(max_bsize, length);
    BlobRef b;
    uint32_t b_off = 0;

    // attempting to reuse existing blob
    if (!wctx->compress) {
      auto end = o->extent_map.extent_map.end();

      if (prefer_deferred_size_snapshot &&
          l <= prefer_deferred_size_snapshot * 2) {
        // A single write that spans two adjacent existing blobs can result
        // in up to two deferred blocks of 'prefer_deferred_size'.
        // So we try to minimize the number of resulting blobs
        // and preserve 2 blobs rather than inserting one more in between:
        // e.g. writing 0x10000~20000 over existing blobs
        // (0x0~20000 and 0x20000~20000) is better (from the subsequent read
        // performance point of view) done as two deferred writes to the
        // existing blobs than as 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
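        // NOTE (editorial, illustrative; not in the original source): the
        // head_info/tail_info pair below models exactly that case: head_info
        // covers the prefix that lands in the first existing blob, tail_info
        // covers whatever remains in the following blob; deferral happens
        // only if the two together cover the whole write.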
        // look for an existing mutable blob we can write into
        auto ep = o->extent_map.seek_lextent(offset);
        auto ep_next = end;
        BigDeferredWriteContext head_info, tail_info;

        bool will_defer = ep != end ?
          head_info.can_defer(ep,
                              prefer_deferred_size_snapshot,
                              block_size,
                              offset,
                              l) :
          false;
        auto offset_next = offset + head_info.used;
        auto remaining = l - head_info.used;
        if (will_defer && remaining) {
          will_defer = false;
          if (remaining <= prefer_deferred_size_snapshot) {
            ep_next = o->extent_map.seek_lextent(offset_next);
            // check if we can defer remaining totally
            will_defer = ep_next == end ?
              false :
              tail_info.can_defer(ep_next,
                                  prefer_deferred_size_snapshot,
                                  block_size,
                                  offset_next,
                                  remaining);
            will_defer = will_defer && remaining == tail_info.used;
          }
        }
        if (will_defer) {
          dout(20) << __func__ << " " << *(head_info.blob_ref)
                   << " deferring big " << std::hex
                   << " (0x" << head_info.b_off << "~"
                   << head_info.blob_aligned_len() << ")"
                   << std::dec << " write via deferred"
                   << dendl;
          if (remaining) {
            dout(20) << __func__ << " " << *(tail_info.blob_ref)
                     << " deferring big " << std::hex
                     << " (0x" << tail_info.b_off << "~"
                     << tail_info.blob_aligned_len() << ")"
                     << std::dec << " write via deferred"
                     << dendl;
          }

          will_defer = head_info.apply_defer();
          if (!will_defer) {
            dout(20) << __func__
                     << " deferring big fell back, head isn't continuous"
                     << dendl;
          } else if (remaining) {
            will_defer = tail_info.apply_defer();
            if (!will_defer) {
              dout(20) << __func__
                       << " deferring big fell back, tail isn't continuous"
                       << dendl;
            }
          }
        }
        if (will_defer) {
          _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
          if (remaining) {
            _do_write_big_apply_deferred(txc, c, o, tail_info,
                                         blp, wctx);
          }
          offset += l;
          length -= l;
          logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
          logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
          continue;
        }
      }
      o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);

      // seek again as punch_hole could invalidate ep
      auto ep = o->extent_map.seek_lextent(offset);
      auto begin = o->extent_map.extent_map.begin();
      auto prev_ep = end;
      if (ep != begin) {
        prev_ep = ep;
        --prev_ep;
      }

      dout(20) << __func__ << " no deferred" << dendl;

      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
      // search for a suitable extent in both the forward and reverse
      // direction in the
      // [offset - target_max_blob_size, offset + target_max_blob_size] range,
      // then check if the blob can be reused via can_reuse_blob().
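      // NOTE (editorial, illustrative; not in the original source): with
      // max_bsize = 0x10000 and offset = 0x30000, the scan window is
      // [0x20000, 0x40000]: ep walks forward from the write position and
      // prev_ep walks backward, stopping as soon as a reusable blob is found.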
      bool any_change;
      do {
        any_change = false;
        if (ep != end && ep->logical_offset < offset + max_bsize) {
          dout(20) << __func__ << " considering " << *ep << dendl;
          dout(20) << __func__ << " considering " << *(ep->blob)
                   << " bstart 0x" << std::hex << ep->blob_start()
                   << std::dec << dendl;

          if (offset >= ep->blob_start() &&
              ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                       offset - ep->blob_start(),
                                       &l)) {
            b = ep->blob;
            b_off = offset - ep->blob_start();
            prev_ep = end; // to avoid check below
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
          } else {
            ++ep;
            any_change = true;
          }
        }

        if (prev_ep != end && prev_ep->logical_offset >= min_off) {
          dout(20) << __func__ << " considering rev " << *prev_ep << dendl;
          dout(20) << __func__ << " considering reverse " << *(prev_ep->blob)
                   << " bstart 0x" << std::hex << prev_ep->blob_start()
                   << std::dec << dendl;
          if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                            offset - prev_ep->blob_start(),
                                            &l)) {
            b = prev_ep->blob;
            b_off = offset - prev_ep->blob_start();
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
          } else if (prev_ep != begin) {
            --prev_ep;
            any_change = true;
          } else {
            prev_ep = end; // to avoid useless first extent re-check
          }
        }
      } while (b == nullptr && any_change);
    } else {
      o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
    } // if (!wctx->compress)

    if (b == nullptr) {
      b = c->new_blob();
      b_off = 0;
      new_blob = true;
    }
    bufferlist t;
    blp.copy(l, t);
    wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
    offset += l;
    length -= l;
    logger->inc(l_bluestore_write_big_blobs);
  }
}
int BlueStore::_do_alloc_write(
  TransContext *txc,
  CollectionRef coll,
  OnodeRef o,
  WriteContext *wctx)
{
  dout(20) << __func__ << " txc " << txc
           << " " << wctx->writes.size() << " blobs"
           << dendl;
  if (wctx->writes.empty()) {
    return 0;
  }

  CompressorRef c;
  double crr = 0;
  if (wctx->compress) {
    c = select_option(
      "compression_algorithm",
      compressor,
      [&]() {
        string val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
          CompressorRef cp = compressor;
          if (!cp || cp->get_type_name() != val) {
            cp = Compressor::create(cct, val);
            if (!cp) {
              if (_set_compression_alert(false, val.c_str())) {
                derr << __func__ << " unable to initialize " << val.c_str()
                     << " compressor" << dendl;
              }
            }
          }
          return boost::optional<CompressorRef>(cp);
        }
        return boost::optional<CompressorRef>();
      });

    crr = select_option(
      "compression_required_ratio",
      cct->_conf->bluestore_compression_required_ratio,
      [&]() {
        double val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
          return boost::optional<double>(val);
        }
        return boost::optional<double>();
      });
  }

  // checksum
  int64_t csum = csum_type.load();
  csum = select_option(
    "csum_type",
    csum,
    [&]() {
      int64_t val;
      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
        return boost::optional<int64_t>(val);
      }
      return boost::optional<int64_t>();
    });
  // compress (as needed) and calc needed space
  uint64_t need = 0;
  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  for (auto& wi : wctx->writes) {
    if (c && wi.blob_length > min_alloc_size) {
      auto start = mono_clock::now();

      // compress
      ceph_assert(wi.b_off == 0);
      ceph_assert(wi.blob_length == wi.bl.length());

      // FIXME: memory alignment here is bad
      bufferlist t;
      boost::optional<int32_t> compressor_message;
      int r = c->compress(wi.bl, t, compressor_message);
      uint64_t want_len_raw = wi.blob_length * crr;
      uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
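      // NOTE (editorial, illustrative; not in the original source): with the
      // default bluestore_compression_required_ratio of 0.875, a 0x10000
      // (64 KiB) blob must compress to at most 0xE000 (56 KiB); p2roundup
      // then lifts that target to a min_alloc_size multiple before the
      // comparisons below.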
      bool rejected = false;
      uint64_t compressed_len = t.length();
      // do an approximate (fast) estimation for resulting blob size
      // that doesn't take header overhead into account
      uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
      if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
        bluestore_compression_header_t chdr;
        chdr.type = c->get_type();
        chdr.length = t.length();
        chdr.compressor_message = compressor_message;
        encode(chdr, wi.compressed_bl);
        wi.compressed_bl.claim_append(t);

        compressed_len = wi.compressed_bl.length();
        result_len = p2roundup(compressed_len, min_alloc_size);
        if (result_len <= want_len && result_len < wi.blob_length) {
          // Cool. We compressed at least as much as we were hoping to.
          // pad out to min_alloc_size
          wi.compressed_bl.append_zero(result_len - compressed_len);
          wi.compressed_len = compressed_len;
          wi.compressed = true;
          logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
          dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
                   << " -> 0x" << compressed_len << " => 0x" << result_len
                   << " with " << c->get_type()
                   << std::dec << dendl;
          txc->statfs_delta.compressed() += compressed_len;
          txc->statfs_delta.compressed_original() += wi.blob_length;
          txc->statfs_delta.compressed_allocated() += result_len;
          logger->inc(l_bluestore_compress_success_count);
          need += result_len;
        } else {
          rejected = true;
        }
      } else if (r != 0) {
        dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
                << " bytes compressed using " << c->get_type_name()
                << std::dec
                << " failed with errcode = " << r
                << ", leaving uncompressed"
                << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
        need += wi.blob_length;
      } else {
        rejected = true;
      }

      if (rejected) {
        dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
                 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
                 << " with " << c->get_type()
                 << ", which is more than required 0x" << want_len_raw
                 << " -> 0x" << want_len
                 << ", leaving uncompressed"
                 << std::dec << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
        need += wi.blob_length;
      }
      log_latency("compress@_do_alloc_write",
                  l_bluestore_compress_lat,
                  mono_clock::now() - start,
                  cct->_conf->bluestore_log_op_age);
    } else {
      need += wi.blob_length;
    }
  }
  PExtentVector prealloc;
  prealloc.reserve(2 * wctx->writes.size());
  int64_t prealloc_left = 0;
  prealloc_left = shared_alloc.a->allocate(
    need, min_alloc_size, need,
    0, &prealloc);
  if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
    derr << __func__ << " failed to allocate 0x" << std::hex << need
         << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
         << " min_alloc_size 0x" << min_alloc_size
         << " available 0x " << shared_alloc.a->get_free()
         << std::dec << dendl;
    if (prealloc.size()) {
      shared_alloc.a->release(prealloc);
    }
    return -ENOSPC;
  }
  _collect_allocation_stats(need, min_alloc_size, prealloc.size());

  if (bdev->is_smr()) {
    std::deque<uint64_t> zones_to_clean;
    if (shared_alloc.a->zoned_get_zones_to_clean(&zones_to_clean)) {
      std::lock_guard l{zoned_cleaner_lock};
      zoned_cleaner_queue.swap(zones_to_clean);
      zoned_cleaner_cond.notify_one();
    }
  }

  dout(20) << __func__ << " prealloc " << prealloc << dendl;
  auto prealloc_pos = prealloc.begin();
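  // NOTE (editorial, illustrative; not in the original source): allocate()
  // may satisfy the request with several physical extents; prealloc_pos
  // walks that list below while each blob is carved out of it in write order.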
  for (auto& wi : wctx->writes) {
    BlobRef b = wi.b;
    bluestore_blob_t& dblob = b->dirty_blob();
    uint64_t b_off = wi.b_off;
    bufferlist *l = &wi.bl;
    uint64_t final_length = wi.blob_length;
    uint64_t csum_length = wi.blob_length;
    if (wi.compressed) {
      final_length = wi.compressed_bl.length();
      csum_length = final_length;
      unsigned csum_order = ctz(csum_length);
      l = &wi.compressed_bl;
      dblob.set_compressed(wi.blob_length, wi.compressed_len);
      if (csum != Checksummer::CSUM_NONE) {
        dout(20) << __func__
                 << " initialize csum setting for compressed blob " << *b
                 << " csum_type " << Checksummer::get_csum_type_string(csum)
                 << " csum_order " << csum_order
                 << " csum_length 0x" << std::hex << csum_length
                 << " blob_length 0x" << wi.blob_length
                 << " compressed_length 0x" << wi.compressed_len << std::dec
                 << dendl;
        dblob.init_csum(csum, csum_order, csum_length);
      }
    } else if (wi.new_blob) {
      unsigned csum_order;
      // initialize newly created blob only
      ceph_assert(dblob.is_mutable());
      if (l->length() != wi.blob_length) {
        // hrm, maybe we could do better here, but let's not bother.
        dout(20) << __func__ << " forcing csum_order to block_size_order "
                 << block_size_order << dendl;
        csum_order = block_size_order;
      } else {
        csum_order = std::min(wctx->csum_order, ctz(l->length()));
      }
      // try to align blob with max_blob_size to improve
      // its reuse ratio, e.g. in case of reverse write
      uint32_t suggested_boff =
        (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
      if ((suggested_boff % (1 << csum_order)) == 0 &&
          suggested_boff + final_length <= max_bsize &&
          suggested_boff > b_off) {
        dout(20) << __func__ << " forcing blob_offset to 0x"
                 << std::hex << suggested_boff << std::dec << dendl;
        ceph_assert(suggested_boff >= b_off);
        csum_length += suggested_boff - b_off;
        b_off = suggested_boff;
      }
      if (csum != Checksummer::CSUM_NONE) {
        dout(20) << __func__
                 << " initialize csum setting for new blob " << *b
                 << " csum_type " << Checksummer::get_csum_type_string(csum)
                 << " csum_order " << csum_order
                 << " csum_length 0x" << std::hex << csum_length << std::dec
                 << dendl;
        dblob.init_csum(csum, csum_order, csum_length);
;
14120 int64_t left
= final_length
;
14122 ceph_assert(prealloc_left
> 0);
14123 if (prealloc_pos
->length
<= left
) {
14124 prealloc_left
-= prealloc_pos
->length
;
14125 left
-= prealloc_pos
->length
;
14126 txc
->statfs_delta
.allocated() += prealloc_pos
->length
;
14127 extents
.push_back(*prealloc_pos
);
14130 extents
.emplace_back(prealloc_pos
->offset
, left
);
14131 prealloc_pos
->offset
+= left
;
14132 prealloc_pos
->length
-= left
;
14133 prealloc_left
-= left
;
14134 txc
->statfs_delta
.allocated() += left
;
14139 for (auto& p
: extents
) {
14140 txc
->allocated
.insert(p
.offset
, p
.length
);
14142 dblob
.allocated(p2align(b_off
, min_alloc_size
), final_length
, extents
);
    dout(20) << __func__ << " blob " << *b << dendl;
    if (dblob.has_csum()) {
      dblob.calc_csum(b_off, *l);
    }

    if (wi.mark_unused) {
      ceph_assert(!dblob.is_compressed());
      auto b_end = b_off + wi.bl.length();
      if (b_off) {
        dblob.add_unused(0, b_off);
      }
      uint64_t llen = dblob.get_logical_length();
      if (b_end < llen) {
        dblob.add_unused(b_end, llen - b_end);
      }
    }

    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
                                           b_off + (wi.b_off0 - wi.b_off),
                                           wi.length0,
                                           wi.b,
                                           nullptr);
    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
    txc->statfs_delta.stored() += le->length;
    dout(20) << __func__ << " lex " << *le << dendl;
    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
                        wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
    // queue io
    if (!g_conf()->bluestore_debug_omit_block_device_write) {
      if (l->length() <= prefer_deferred_size.load()) {
        dout(20) << __func__ << " deferring 0x" << std::hex
                 << l->length() << std::dec << " write via deferred" << dendl;
        bluestore_deferred_op_t *op = _get_deferred_op(txc);
        op->op = bluestore_deferred_op_t::OP_WRITE;
        int r = b->get_blob().map(
          b_off, l->length(),
          [&](uint64_t offset, uint64_t length) {
            op->extents.emplace_back(bluestore_pextent_t(offset, length));
            return 0;
          });
        ceph_assert(r == 0);
        op->data = *l;
        logger->inc(l_bluestore_write_deferred);
      } else {
        b->get_blob().map_bl(
          b_off, *l,
          [&](uint64_t offset, bufferlist& t) {
            bdev->aio_write(offset, t, &txc->ioc, false);
          });
        logger->inc(l_bluestore_write_new);
      }
    }
  }
  ceph_assert(prealloc_pos == prealloc.end());
  ceph_assert(prealloc_left == 0);
  return 0;
}
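// NOTE (editorial, illustrative; not in the original source): writes at or
// below prefer_deferred_size ride through the deferred-write queue (the KV
// WAL) and are flushed to their final location later, while larger writes go
// straight to the device via aio_write; the threshold trades write
// amplification against the latency of small direct writes.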
void BlueStore::_wctx_finish(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  WriteContext *wctx,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  auto oep = wctx->old_extents.begin();
  while (oep != wctx->old_extents.end()) {
    auto &lo = *oep;
    oep = wctx->old_extents.erase(oep);
    dout(20) << __func__ << " lex_old " << lo.e << dendl;
    BlobRef b = lo.e.blob;
    const bluestore_blob_t& blob = b->get_blob();
    if (blob.is_compressed()) {
      if (lo.blob_empty) {
        txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
      }
      txc->statfs_delta.compressed_original() -= lo.e.length;
    }
    auto& r = lo.r;
    txc->statfs_delta.stored() -= lo.e.length;
    if (!r.empty()) {
      dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
      if (blob.is_shared()) {
        PExtentVector final;
        c->load_shared_blob(b->shared_blob);
        bool unshare = false;
        bool* unshare_ptr =
          !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
        for (auto e : r) {
          b->shared_blob->put_ref(
            e.offset, e.length, &final,
            unshare_ptr);
        }
        if (unshare) {
          ceph_assert(maybe_unshared_blobs);
          maybe_unshared_blobs->insert(b->shared_blob.get());
        }
        dout(20) << __func__ << " shared_blob release " << final
                 << " from " << *b->shared_blob << dendl;
        txc->write_shared_blob(b->shared_blob);
        r.clear();
        r.swap(final);
      }
    }
    // we can't invalidate our logical extents as we drop them because
    // other lextents (either in our onode or others) may still
    // reference them. but we can throw out anything that is no
    // longer allocated. Note that this will leave behind edge bits
    // that are no longer referenced but not deallocated (until they
    // age out of the cache naturally).
    b->discard_unallocated(c.get());
    for (auto e : r) {
      dout(20) << __func__ << " release " << e << dendl;
      txc->released.insert(e.offset, e.length);
      txc->statfs_delta.allocated() -= e.length;
      if (blob.is_compressed()) {
        txc->statfs_delta.compressed_allocated() -= e.length;
      }
    }

    if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
      dout(20) << __func__ << " spanning_blob_map removing empty " << *b
               << dendl;
      o->extent_map.spanning_blob_map.erase(b->id);
    }
  }
}
void BlueStore::_do_write_data(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  WriteContext *wctx)
{
  uint64_t end = offset + length;
  bufferlist::iterator p = bl.begin();

  if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
      (length != min_alloc_size)) {
    // we fall within the same block
    _do_write_small(txc, c, o, offset, length, p, wctx);
  } else {
    uint64_t head_offset, head_length;
    uint64_t middle_offset, middle_length;
    uint64_t tail_offset, tail_length;

    head_offset = offset;
    head_length = p2nphase(offset, min_alloc_size);

    tail_offset = p2align(end, min_alloc_size);
    tail_length = p2phase(end, min_alloc_size);

    middle_offset = head_offset + head_length;
    middle_length = length - head_length - tail_length;

    if (head_length) {
      _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
    }

    if (middle_length) {
      _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
    }

    if (tail_length) {
      _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
    }
  }
}
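// NOTE (editorial, illustrative; not in the original source): with
// min_alloc_size = 0x1000, a write of 0x1800~0x3000 splits into a head
// 0x1800~0x800 and a tail 0x4000~0x800 (both handled by _do_write_small)
// plus a block-aligned middle 0x2000~0x2000 (handled by _do_write_big).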
void BlueStore::_choose_write_options(
  CollectionRef& c,
  OnodeRef o,
  uint32_t fadvise_flags,
  WriteContext *wctx)
{
  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered write" << dendl;
    wctx->buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_write &&
             (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                               CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered write" << dendl;
    wctx->buffered = true;
  }

  // apply basic csum block size
  wctx->csum_order = block_size_order;

  // compression parameters
  unsigned alloc_hints = o->onode.alloc_hint_flags;
  auto cm = select_option(
    "compression_mode",
    comp_mode.load(),
    [&]() {
      string val;
      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
        return boost::optional<Compressor::CompressionMode>(
          Compressor::get_comp_mode_type(val));
      }
      return boost::optional<Compressor::CompressionMode>();
    });

  wctx->compress = (cm != Compressor::COMP_NONE) &&
    ((cm == Compressor::COMP_FORCE) ||
     (cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
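  // NOTE (editorial, illustrative; not in the original source): 'force'
  // compresses unconditionally, 'aggressive' compresses unless the client
  // hinted INCOMPRESSIBLE, 'passive' compresses only when the client hinted
  // COMPRESSIBLE, and 'none' disables compression entirely.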
  if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
      (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
                      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {

    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;

    if (o->onode.expected_write_size) {
      wctx->csum_order = std::max(min_alloc_size_order,
                                  (uint8_t)ctz(o->onode.expected_write_size));
    } else {
      wctx->csum_order = min_alloc_size_order;
    }

    if (wctx->compress) {
      wctx->target_blob_size = select_option(
        "compression_max_blob_size",
        comp_max_blob_size.load(),
        [&]() {
          int64_t val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
            return boost::optional<uint64_t>((uint64_t)val);
          }
          return boost::optional<uint64_t>();
        });
    }
  } else {
    if (wctx->compress) {
      wctx->target_blob_size = select_option(
        "compression_min_blob_size",
        comp_min_blob_size.load(),
        [&]() {
          int64_t val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
            return boost::optional<uint64_t>((uint64_t)val);
          }
          return boost::optional<uint64_t>();
        });
    }
  }

  uint64_t max_bsize = max_blob_size.load();
  if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
    wctx->target_blob_size = max_bsize;
  }

  // set the min blob size floor at 2x the min_alloc_size, or else we
  // won't be able to allocate a smaller extent for the compressed
  // data.
  if (wctx->compress &&
      wctx->target_blob_size < min_alloc_size * 2) {
    wctx->target_blob_size = min_alloc_size * 2;
  }

  dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
           << " target_blob_size 0x" << std::hex << wctx->target_blob_size
           << " compress=" << (int)wctx->compress
           << " buffered=" << (int)wctx->buffered
           << std::dec << dendl;
}
int BlueStore::_do_gc(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  const WriteContext& wctx,
  uint64_t *dirty_start,
  uint64_t *dirty_end)
{
  bool dirty_range_updated = false;
  WriteContext wctx_gc;
  wctx_gc.fork(wctx); // make a clone for garbage collection

  auto & extents_to_collect = wctx.extents_to_gc;
  for (auto it = extents_to_collect.begin();
       it != extents_to_collect.end();
       ++it) {
    bufferlist bl;
    auto offset = (*it).first;
    auto length = (*it).second;
    dout(20) << __func__ << " processing " << std::hex
             << offset << "~" << length << std::dec
             << dendl;
    int r = _do_read(c.get(), o, offset, length, bl, 0);
    ceph_assert(r == (int)length);

    _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
    logger->inc(l_bluestore_gc_merged, length);

    if (*dirty_start > offset) {
      *dirty_start = offset;
      dirty_range_updated = true;
    }

    if (*dirty_end < offset + length) {
      *dirty_end = offset + length;
      dirty_range_updated = true;
    }
  }
  if (dirty_range_updated) {
    o->extent_map.fault_range(db, *dirty_start, *dirty_end);
  }

  dout(30) << __func__ << " alloc write" << dendl;
  int r = _do_alloc_write(txc, c, o, &wctx_gc);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    return r;
  }

  _wctx_finish(txc, c, o, &wctx_gc);
  return 0;
}
int BlueStore::_do_write(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  uint32_t fadvise_flags)
{
  int r = 0;

  dout(20) << __func__
           << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length
           << " - have 0x" << o->onode.size
           << " (" << std::dec << o->onode.size << ")"
           << " bytes" << std::hex
           << " fadvise_flags 0x" << fadvise_flags
           << " alloc_hint 0x" << o->onode.alloc_hint_flags
           << " expected_object_size " << o->onode.expected_object_size
           << " expected_write_size " << o->onode.expected_write_size
           << std::dec << dendl;
  _dump_onode<30>(cct, *o);

  if (length == 0) {
    return 0;
  }

  uint64_t end = offset + length;

  GarbageCollector gc(c->store->cct);
  int64_t benefit = 0;
  auto dirty_start = offset;
  auto dirty_end = end;

  WriteContext wctx;
  _choose_write_options(c, o, fadvise_flags, &wctx);
  o->extent_map.fault_range(db, offset, length);
  _do_write_data(txc, c, o, offset, length, bl, &wctx);
  r = _do_alloc_write(txc, c, o, &wctx);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    goto out;
  }

  if (wctx.extents_to_gc.empty() ||
      wctx.extents_to_gc.range_start() > offset ||
      wctx.extents_to_gc.range_end() < offset + length) {
    benefit = gc.estimate(offset,
                          length,
                          o->extent_map,
                          wctx.old_extents,
                          min_alloc_size);
  }

  if (bdev->is_smr()) {
    if (wctx.old_extents.empty()) {
      txc->zoned_note_new_object(o);
    } else {
      int64_t old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
      txc->zoned_note_updated_object(o, old_ondisk_offset);
    }
  }

  // NB: _wctx_finish() will empty old_extents
  // so we must do gc estimation before that
  _wctx_finish(txc, c, o, &wctx);
  if (end > o->onode.size) {
    dout(20) << __func__ << " extending size to 0x" << std::hex << end
             << std::dec << dendl;
    o->onode.size = end;
  }

  if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
    wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
    dout(20) << __func__
             << " perform garbage collection for compressed extents, "
             << "expected benefit = " << benefit << " AUs" << dendl;
  }
  if (!wctx.extents_to_gc.empty()) {
    dout(20) << __func__ << " perform garbage collection" << dendl;

    r = _do_gc(txc, c, o,
               wctx,
               &dirty_start, &dirty_end);
    if (r < 0) {
      derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
           << dendl;
      goto out;
    }
    dout(20) << __func__ << " gc range is " << std::hex << dirty_start
             << "~" << dirty_end - dirty_start << std::dec << dendl;
  }
  o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
  o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);

  r = 0;

 out:
  return r;
}
int BlueStore::_write(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& o,
                      uint64_t offset, size_t length,
                      bufferlist& bl,
                      uint32_t fadvise_flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_zero(TransContext *txc,
                     CollectionRef& c,
                     OnodeRef& o,
                     uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_zero(txc, c, o, offset, length);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_do_zero(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;

  _dump_onode<30>(cct, *o);

  WriteContext wctx;
  o->extent_map.fault_range(db, offset, length);
  o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
  o->extent_map.dirty_range(offset, length);
  _wctx_finish(txc, c, o, &wctx);

  if (length > 0 && offset + length > o->onode.size) {
    o->onode.size = offset + length;
    dout(20) << __func__ << " extending size to " << offset + length
             << dendl;
  }
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
void BlueStore::_do_truncate(
  TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec << dendl;

  _dump_onode<30>(cct, *o);

  if (offset == o->onode.size)
    return;

  WriteContext wctx;
  if (offset < o->onode.size) {
    uint64_t length = o->onode.size - offset;
    o->extent_map.fault_range(db, offset, length);
    o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
    o->extent_map.dirty_range(offset, length);
    _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);

    // if we have shards past EOF, ask for a reshard
    if (!o->onode.extent_map_shards.empty() &&
        o->onode.extent_map_shards.back().offset >= offset) {
      dout(10) << __func__ << " request reshard past EOF" << dendl;
      if (offset) {
        o->extent_map.request_reshard(offset - 1, offset + length);
      } else {
        o->extent_map.request_reshard(0, length);
      }
    }
  }

  o->onode.size = offset;

  if (bdev->is_smr()) {
    // On zoned devices, we currently support only removing an object or
    // truncating it to zero size, both of which fall through this code path.
    ceph_assert(offset == 0 && !wctx.old_extents.empty());
    int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
    txc->zoned_note_truncated_object(o, ondisk_offset);
  }

  txc->write_onode(o);
}
int BlueStore::_truncate(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << dendl;
  int r = 0;
  if (offset >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _do_truncate(txc, c, o, offset);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_do_remove(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o)
{
  set<SharedBlob*> maybe_unshared_blobs;
  bool is_gen = !o->oid.is_no_gen();
  _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
  }
  o->exists = false;
  string key;
  for (auto &s : o->extent_map.shards) {
    dout(20) << __func__ << " removing shard 0x" << std::hex
             << s.shard_info->offset << std::dec << dendl;
    generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
      [&](const string& final_key) {
        txc->t->rmkey(PREFIX_OBJ, final_key);
      });
  }
  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
  txc->note_removed_object(o);
  o->extent_map.clear();
  o->onode = bluestore_onode_t();
  _debug_obj_on_delete(o->oid);

  if (!is_gen || maybe_unshared_blobs.empty()) {
    return 0;
  }

  // see if we can unshare blobs still referenced by the head
  dout(10) << __func__ << " gen and maybe_unshared_blobs "
           << maybe_unshared_blobs << dendl;
  ghobject_t nogen = o->oid;
  nogen.generation = ghobject_t::NO_GEN;
  OnodeRef h = c->get_onode(nogen, false);

  if (!h || !h->exists) {
    return 0;
  }

  dout(20) << __func__ << " checking for unshareable blobs on " << h
           << " " << h->oid << dendl;
  map<SharedBlob*,bluestore_extent_ref_map_t> expect;
  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        sb->loaded &&
        maybe_unshared_blobs.count(sb)) {
      if (b.is_compressed()) {
        expect[sb].get(0, b.get_ondisk_length());
      } else {
        b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
            expect[sb].get(off, len);
            return 0;
          });
      }
    }
  }

  vector<SharedBlob*> unshared_blobs;
  unshared_blobs.reserve(maybe_unshared_blobs.size());
  for (auto& p : expect) {
    dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
    if (p.first->persistent->ref_map == p.second) {
      SharedBlob *sb = p.first;
      dout(20) << __func__ << " unsharing " << *sb << dendl;
      unshared_blobs.push_back(sb);
      txc->unshare_blob(sb);
      uint64_t sbid = c->make_blob_unshared(sb);
      string key;
      get_shared_blob_key(sbid, &key);
      txc->t->rmkey(PREFIX_SHARED_BLOB, key);
    }
  }

  if (unshared_blobs.empty()) {
    return 0;
  }

  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        std::find(unshared_blobs.begin(), unshared_blobs.end(),
                  sb) != unshared_blobs.end()) {
      dout(20) << __func__ << " unsharing " << e << dendl;
      bluestore_blob_t& blob = e.blob->dirty_blob();
      blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
      h->extent_map.dirty_range(e.logical_offset, 1);
    }
  }
  txc->write_onode(h);

  return 0;
}
int BlueStore::_remove(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " onode " << o.get()
           << " txc " << txc << dendl;

  auto start_time = mono_clock::now();
  int r = _do_remove(txc, c, o);
  log_latency_fn(
    __func__,
    l_bluestore_remove_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_op_age,
    [&](const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid =" << c->cid
           << " oid =" << o->oid;
      return ostr.str();
    });

  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_setattr(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        const string& name,
                        bufferptr& val)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << dendl;
  int r = 0;
  if (val.is_partial()) {
    auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
                                                       val.length());
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  } else {
    auto& b = o->onode.attrs[name.c_str()] = val;
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << " = " << r << dendl;
  return r;
}
int BlueStore::_setattrs(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         const map<string,bufferptr>& aset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << dendl;
  int r = 0;
  for (map<string,bufferptr>::const_iterator p = aset.begin();
       p != aset.end(); ++p) {
    if (p->second.is_partial()) {
      auto& b = o->onode.attrs[p->first.c_str()] =
        bufferptr(p->second.c_str(), p->second.length());
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
    } else {
      auto& b = o->onode.attrs[p->first.c_str()] = p->second;
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
    }
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattr(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o,
                       const string& name)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << dendl;
  int r = 0;
  auto it = o->onode.attrs.find(name.c_str());
  if (it == o->onode.attrs.end())
    goto out;

  o->onode.attrs.erase(it);
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattrs(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;

  if (o->onode.attrs.empty())
    goto out;

  o->onode.attrs.clear();
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
{
  const string& omap_prefix = o->get_omap_prefix();
  string prefix, tail;
  o->get_omap_header(&prefix);
  o->get_omap_tail(&tail);
  txc->t->rm_range_keys(omap_prefix, prefix, tail);
  txc->t->rmkey(omap_prefix, tail);
  dout(20) << __func__ << " remove range start: "
           << pretty_binary_string(prefix) << " end: "
           << pretty_binary_string(tail) << dendl;
}
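// NOTE (editorial, illustrative; not in the original source): all omap keys
// of an object share a common prefix derived from the onode, so clearing
// omap is a single rm_range_keys() from the header key up to the tail
// sentinel key, plus removal of the tail key itself.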
int BlueStore::_omap_clear(TransContext *txc,
                           CollectionRef& c,
                           OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
    o->onode.clear_omap_flag();
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_setkeys(TransContext *txc,
                             CollectionRef& c,
                             OnodeRef& o,
                             bufferlist &bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  auto p = bl.cbegin();
  __u32 num;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags();
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  string final_key;
  o->get_omap_key(string(), &final_key);
  size_t base_key_len = final_key.size();
  decode(num, p);
  while (num--) {
    string key;
    bufferlist value;
    decode(key, p);
    decode(value, p);
    final_key.resize(base_key_len); // keep prefix
    final_key += key;
    dout(20) << __func__ << " " << pretty_binary_string(final_key)
             << " <- " << key << dendl;
    txc->t->set(prefix, final_key, value);
  }
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_setheader(TransContext *txc,
                               CollectionRef& c,
                               OnodeRef& o,
                               bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  string key;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags();
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  o->get_omap_header(&key);
  txc->t->set(prefix, key, bl);
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkeys(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& o,
                            bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  auto p = bl.cbegin();
  __u32 num;
  string final_key;

  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    decode(num, p);
    while (num--) {
      string key;
      decode(key, p);
      final_key.resize(base_key_len); // keep prefix
      final_key += key;
      dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
               << " <- " << key << dendl;
      txc->t->rmkey(prefix, final_key);
    }
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkey_range(TransContext *txc,
                                 CollectionRef& c,
                                 OnodeRef& o,
                                 const string& first, const string& last)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  string key_first, key_last;
  int r = 0;
  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->flush();
    o->get_omap_key(first, &key_first);
    o->get_omap_key(last, &key_last);
    txc->t->rm_range_keys(prefix, key_first, key_last);
    dout(20) << __func__ << " remove range start: "
             << pretty_binary_string(key_first) << " end: "
             << pretty_binary_string(key_last) << dendl;
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_set_alloc_hint(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& o,
  uint64_t expected_object_size,
  uint64_t expected_write_size,
  uint32_t flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << dendl;
  int r = 0;
  o->onode.expected_object_size = expected_object_size;
  o->onode.expected_write_size = expected_write_size;
  o->onode.alloc_hint_flags = flags;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << " = " << r << dendl;
  return r;
}
int BlueStore::_clone(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& oldo,
                      OnodeRef& newo)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << dendl;
  int r = 0;
  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
    derr << __func__ << " mismatched hash on " << oldo->oid
         << " and " << newo->oid << dendl;
    return -EINVAL;
  }

  _assign_nid(txc, newo);

  // clone data
  oldo->flush();
  _do_truncate(txc, c, newo, 0);
  if (cct->_conf->bluestore_clone_cow) {
    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
  }

  // clone attrs
  newo->onode.attrs = oldo->onode.attrs;

  // clone omap
  if (newo->onode.has_omap()) {
    dout(20) << __func__ << " clearing old omap data" << dendl;
    newo->flush();
    _do_omap_clear(txc, newo);
    newo->onode.clear_omap_flag();
  }
  if (oldo->onode.has_omap()) {
    dout(20) << __func__ << " copying omap data" << dendl;
    if (newo->oid.is_pgmeta()) {
      newo->onode.set_omap_flags_pgmeta();
    } else {
      newo->onode.set_omap_flags();
    }
    const string& prefix = newo->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    oldo->get_omap_header(&head);
    oldo->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        dout(30) << __func__ << " got header/data "
                 << pretty_binary_string(it->key()) << dendl;
        string key;
        newo->rewrite_omap_key(it->key(), &key);
        txc->t->set(prefix, key, it->value());
      }
      it->next();
    }
    string new_tail;
    bufferlist new_tail_value;
    newo->get_omap_tail(&new_tail);
    txc->t->set(prefix, new_tail, new_tail_value);
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_do_clone_range(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& oldo,
  OnodeRef& newo,
  uint64_t srcoff,
  uint64_t length,
  uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid
           << " 0x" << std::hex << srcoff << "~" << length << " -> "
           << " 0x" << dstoff << "~" << length << std::dec << dendl;
  oldo->extent_map.fault_range(db, srcoff, length);
  newo->extent_map.fault_range(db, dstoff, length);
  _dump_onode<30>(cct, *oldo);
  _dump_onode<30>(cct, *newo);

  oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
  _dump_onode<30>(cct, *oldo);
  _dump_onode<30>(cct, *newo);
  return 0;
}
int BlueStore::_clone_range(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& oldo,
                            OnodeRef& newo,
                            uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec << dendl;
  int r = 0;

  if (srcoff + length >= OBJECT_MAX_SIZE ||
      dstoff + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
    goto out;
  }
  if (srcoff + length > oldo->onode.size) {
    r = -EINVAL;
    goto out;
  }

  _assign_nid(txc, newo);

  if (length > 0) {
    if (cct->_conf->bluestore_clone_cow) {
      _do_zero(txc, c, newo, dstoff, length);
      _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
    } else {
      bufferlist bl;
      r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
      if (r < 0)
        goto out;
      r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
      if (r < 0)
        goto out;
    }
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_rename(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& oldo,
                       OnodeRef& newo,
                       const ghobject_t& new_oid)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << new_oid << dendl;
  int r;
  ghobject_t old_oid = oldo->oid;
  mempool::bluestore_cache_meta::string new_okey;

  if (newo) {
    if (newo->exists) {
      r = -EEXIST;
      goto out;
    }
    ceph_assert(txc->onodes.count(newo) == 0);
  }

  txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());

  // rewrite shards
  {
    oldo->extent_map.fault_range(db, 0, oldo->onode.size);
    get_object_key(cct, new_oid, &new_okey);
    string key;
    for (auto &s : oldo->extent_map.shards) {
      generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
        [&](const string& final_key) {
          txc->t->rmkey(PREFIX_OBJ, final_key);
        });
      s.dirty = true;
    }
  }

  newo = oldo;
  txc->write_onode(newo);

  // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
  // Onode in the old slot
  c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
  r = 0;

  // hold a ref to the new Onode in the old name position, to ensure we don't
  // drop it from the cache before this txc commits (or else someone may come
  // along and read newo's metadata via the old name).
  txc->note_modified_object(oldo);

 out:
  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
           << new_oid << " = " << r << dendl;
  return r;
}

int BlueStore::_create_collection(
  TransContext *txc,
  const coll_t &cid,
  unsigned bits,
  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
  int r;
  bufferlist bl;

  {
    std::unique_lock l(coll_lock);
    if (*c) {
      r = -EEXIST;
      goto out;
    }
    auto p = new_coll_map.find(cid);
    ceph_assert(p != new_coll_map.end());
    *c = p->second;
    (*c)->cnode.bits = bits;
    coll_map[cid] = *c;
    new_coll_map.erase(p);
  }
  encode((*c)->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(cid), bl);
  r = 0;

 out:
  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
  return r;
}

int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
                                  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << dendl;
  int r;

  (*c)->flush_all_but_last();
  {
    std::unique_lock l(coll_lock);
    if (!*c) {
      r = -ENOENT;
      goto out;
    }
    size_t nonexistent_count = 0;
    ceph_assert((*c)->exists);
    if ((*c)->onode_map.map_any([&](Onode* o) {
          if (o->exists) {
            dout(1) << __func__ << " " << o->oid << " " << o
                    << " exists in onode_map" << dendl;
            return true;
          }
          ++nonexistent_count;
          return false;
        })) {
      r = -ENOTEMPTY;
      goto out;
    }
    vector<ghobject_t> ls;
    ghobject_t next;
    // Enumerate onodes in db, up to nonexistent_count + 1,
    // then check if all of them are marked as non-existent.
    // Bypass the check if (next != ghobject_t::get_max()).
    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
                         nonexistent_count + 1, false, &ls, &next);
    if (r >= 0) {
      // If true, the collection has more objects than nonexistent_count,
      // so bypass the check.
      bool exists = (!next.is_max());
      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
        dout(10) << __func__ << " oid " << *it << dendl;
        auto onode = (*c)->onode_map.lookup(*it);
        exists = !onode || onode->exists;
        if (exists) {
          dout(1) << __func__ << " " << *it
                  << " exists in db, "
                  << (!onode ? "not present in ram" : "present in ram")
                  << dendl;
        }
      }
      if (!exists) {
        _do_remove_collection(txc, c);
        r = 0;
      } else {
        dout(10) << __func__ << " " << cid
                 << " is non-empty" << dendl;
        r = -ENOTEMPTY;
      }
    }
  }
 out:
  dout(10) << __func__ << " " << cid << " = " << r << dendl;
  return r;
}

void BlueStore::_do_remove_collection(TransContext *txc,
                                      CollectionRef *c)
{
  coll_map.erase((*c)->cid);
  txc->removed_collections.push_back(*c);
  (*c)->exists = false;
  _osr_register_zombie((*c)->osr.get());
  txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
  c->reset();
}

int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  std::unique_lock l(c->lock);
  std::unique_lock l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer.  this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split.  leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // the destination should initially be empty.
  ceph_assert(d->onode_map.empty());
  ceph_assert(d->shared_blob_set.empty());
  ceph_assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits.  note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  ceph_assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}
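
// Worked example (hypothetical pgids, for illustration only): when pg 1.0
// splits at bits=2, objects whose hash has low bits 0b00 stay in the parent
// while those with low bits 0b10 move to child 1.2; split_cache() migrates
// the matching cached onodes and shared blobs, and both cnodes record
// bits=2 so later hash-range checks mask consistently.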

int BlueStore::_merge_collection(
  TransContext *txc,
  CollectionRef *c,
  CollectionRef& d,
  unsigned bits)
{
  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
           << " bits " << bits << dendl;
  std::unique_lock l((*c)->lock);
  std::unique_lock l2(d->lock);
  int r;

  coll_t cid = (*c)->cid;

  // flush all previous deferred writes on the source collection to ensure
  // that all deferred writes complete before we merge as the target
  // collection's sequencer may need to order new ops after those writes.
  _osr_drain((*c)->osr.get());

  // move any cached items (onodes and referenced shared blobs) that now
  // belong to the target collection.  leave everything else behind.

  spg_t pgid, dest_pgid;
  bool is_pg = cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // adjust bits.  note that this will be redundant for all but the first
  // merge call for the parent/target.
  d->cnode.bits = bits;

  // behavior depends on target (d) bits, so do this after that is updated.
  (*c)->split_cache(d.get());

  // remove source collection
  {
    std::unique_lock l3(coll_lock);
    _do_remove_collection(txc, c);
  }

  r = 0;

  bufferlist bl;
  encode(d->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(d->cid), bl);

  dout(10) << __func__ << " " << cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

void BlueStore::log_latency(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  const char* info) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << info
            << dendl;
  }
}

void BlueStore::log_latency_fn(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  std::function<string (const ceph::timespan& lat)> fn) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << fn(l)
            << dendl;
  }
}

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  pending_kv_ios += txc.ios;
  if (txc.deferred_txn) {
    pending_deferred_ios += txc.ios;
  }

  uint64_t started = 0;
  uint64_t completed = 0;
  if (should_trace(&started, &completed)) {
    txc.tracing = true;
    uint64_t rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate;
    db.get_property(
      "rocksdb.base-level",
      &rocksdb_base_level);
    db.get_property(
      "rocksdb.estimate-pending-compaction-bytes",
      &rocksdb_estimate_pending_compaction_bytes);
    db.get_property(
      "rocksdb.cur-size-all-mem-tables",
      &rocksdb_cur_size_all_mem_tables);
    db.get_property(
      "rocksdb.compaction-pending",
      &rocksdb_compaction_pending);
    db.get_property(
      "rocksdb.mem-table-flush-pending",
      &rocksdb_mem_table_flush_pending);
    db.get_property(
      "rocksdb.num-running-compactions",
      &rocksdb_num_running_compactions);
    db.get_property(
      "rocksdb.num-running-flushes",
      &rocksdb_num_running_flushes);
    db.get_property(
      "rocksdb.actual-delayed-write-rate",
      &rocksdb_actual_delayed_write_rate);

    tracepoint(
      bluestore,
      transaction_initial_state,
      txc.osr->get_sequencer_id(),
      txc.seq,
      throttle_bytes.get_current(),
      throttle_deferred_bytes.get_current(),
      pending_kv_ios,
      pending_deferred_ios,
      started,
      completed,
      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));

    tracepoint(
      bluestore,
      transaction_initial_state_rocksdb,
      txc.osr->get_sequencer_id(),
      txc.seq,
      rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate);
  }
}
#endif

mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
  TransContext &txc, PerfCounters *logger, int state)
{
  mono_clock::time_point now = mono_clock::now();
  mono_clock::duration lat = now - txc.last_stamp;
  logger->tinc(state, lat);
#if defined(WITH_LTTNG)
  if (txc.tracing &&
      state >= l_bluestore_state_prepare_lat &&
      state <= l_bluestore_state_done_lat) {
    OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
    tracepoint(
      bluestore,
      transaction_state_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      state,
      ceph::to_seconds<double>(lat));
  }
#endif
  txc.last_stamp = now;
  return lat;
}

bool BlueStore::BlueStoreThrottle::try_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  throttle_bytes.get(txc.cost);

  if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
    emit_initial_tracepoint(db, txc, start_throttle_acquire);
    return true;
  } else {
    return false;
  }
}

void BlueStore::BlueStoreThrottle::finish_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  ceph_assert(txc.deferred_txn);
  throttle_deferred_bytes.get(txc.cost);
  emit_initial_tracepoint(db, txc, start_throttle_acquire);
}

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(mono_clock::now() - txc.start));
  }
}
#endif

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(lat));
  }
}
#endif

// DB key/value histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";

int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
                                               const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    std::max<size_t>(value_size,
                     key_hist[prefix][key_slab].val_map[value_slab].max_len);
}

void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}

// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  // globals
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_pgmeta_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  auto start = coarse_mono_clock::now();

  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = std::max(max_key_size, key_size);
    max_value_size = std::max(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPOOL_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PERPG_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PGMETA_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
      num_pgmeta_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  ceph::timespan duration = coarse_mono_clock::now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}

void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}

// For an external caller.
// We use a best-effort policy here, e.g.,
// we don't care if there are still some pinned onodes/data in the cache
// after this command is completed.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}

void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}
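
// Worked example (illustrative): for a 0x1000-byte block, a write covering
// 0x32~0xf00 needs head_pad = 0x32 and tail_pad = 0x1000 - 0xf32 = 0xce;
// after _apply_padding() the bufferlist spans the whole block and
// l_bluestore_write_pad_bytes grows by 0x100.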

void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
               << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
           << " (" << onode_part << " bytes onode + "
           << blob_part << " bytes spanning blobs + "
           << extent_part << " bytes inline extents)"
           << dendl;

  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}

void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!spurious_read_errors_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_SPURIOUS_READ_ERRORS",
      spurious_read_errors_alert);
  }
  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pg_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_PG_OMAP",
      no_per_pg_omap_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
        first = false;
      } else {
        s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}

void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
                                          size_t extents)
{
  alloc_stats_count++;
  alloc_stats_fragments += extents;
  alloc_stats_size += need;
}

void BlueStore::_record_allocation_stats()
{
  // don't care about data consistency,
  // fields can be partially modified while making the tuple
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
          << probe_count << ":"
          << " cnt: " << std::get<0>(t0)
          << " frags: " << std::get<1>(t0)
          << " size: " << std::get<2>(t0)
          << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
            << base + (probe_count % base) << ": "
            << std::get<0>(t)
            << ", " << std::get<1>(t)
            << ", " << std::get<2>(t)
            << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  ++probe_count;

  for (ssize_t i = alloc_stats_history.size() - 1; i > 0; --i) {
    if ((probe_count % (1 << i)) == 0) {
      alloc_stats_history[i] = alloc_stats_history[i - 1];
    }
  }
  alloc_stats_history[0].swap(t0);
}
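
// Rotation trace (illustrative): slot i of alloc_stats_history is refreshed
// every 2^i probes.  E.g. when probe_count reaches 8 the loop copies
// [2] -> [3], then [1] -> [2], then [0] -> [1] (8 is divisible by 8, 4
// and 2), and the fresh probe t0 lands in slot 0.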

// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't call for the second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}
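
// Worked example (illustrative): with granularity = 0x100000 (1 MiB), an
// extent 0x180000~0x100000 yields pos = 1 and
// end_pos = 1 + 0x27ffff / 0x100000 = 3, so bloom filters 1 and 2 are
// examined and only the non-empty ones survive into the reduced vectors.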

bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  std::lock_guard l(lock);
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
{
  std::lock_guard l(lock); // possibly redundant
  ceph_assert(fix_per_pool_omap_txn == nullptr);
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append(stringify(val));
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}

bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  std::lock_guard l(lock); // possibly redundant
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }
  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}

bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  std::lock_guard l(lock);
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}

bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  std::lock_guard l(lock);
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}

bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  std::lock_guard l(lock);
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}

bool BlueStoreRepairer::fix_spanning_blobs(
  KeyValueDB* db,
  std::function<void(KeyValueDB::Transaction)> f)
{
  std::lock_guard l(lock);
  if (!fix_onode_txn) {
    fix_onode_txn = db->get_transaction();
  }
  f(fix_onode_txn);
  ++to_repair_cnt;
  return true;
}

bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  //NB: not for use in multithreading mode!!!
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
    return true;
  }
  return false;
}

unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  //NB: not for use in multithreading mode!!!
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_onode_txn) {
    db->submit_transaction_sync(fix_onode_txn);
    fix_onode_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }
  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}
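
// Usage sketch (informal, mirroring the fsck repair flow): fixes are queued
// from the checking code, then applied once from a single thread:
//
//   repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY, expected_statfs);
//   repairer.preprocess_misreference(db); // single-threaded from here on
//   unsigned fixed = repairer.apply(db);  // submits every pending txn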

// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}
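
// Worked example (illustrative numbers): with db_total = 60 GiB, observed
// LOG+WAL+DB maxima on the DB device of 38 GiB plus 2 GiB of DB-level data
// sitting on the slow device, max_db_use = 40 GiB and the static headroom
// is 20 GiB; SLOW data keeps landing on BDEV_DB while
// min(db_avail4slow, 20 GiB) still exceeds the SLOW bytes already resident
// there.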

void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
}

void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore.  the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0).  the wal device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    }
    else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}
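
// Usage sketch (illustrative): the hint is just a level id smuggled through
// a void*, later consumed by select_prefer_bdev():
//
//   void* h = selector.get_hint_by_dir("db.wal");   // -> LEVEL_WAL
//   uint8_t bdev = selector.select_prefer_bdev(h);  // -> BlueFS::BDEV_WAL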

void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();
  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
       << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
       << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
       << ", db_avail:" << db_avail4slow << std::endl
       << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names{ {
    "DEV/LEV",
    "WAL",
    "DB",
    "SLOW",
    "*",
    "*",
    "REAL",
    "FILES",
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}

// =======================================================