1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2014 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #include <sys/types.h>
21 #include <boost/container/flat_set.hpp>
22 #include "boost/algorithm/string.hpp"
24 #include "include/cpp-btree/btree_set.h"
26 #include "bluestore_common.h"
27 #include "BlueStore.h"
29 #include "include/compat.h"
30 #include "include/intarith.h"
31 #include "include/stringify.h"
32 #include "include/str_map.h"
33 #include "include/util.h"
34 #include "common/errno.h"
35 #include "common/safe_io.h"
36 #include "common/PriorityCache.h"
37 #include "common/RWLock.h"
38 #include "Allocator.h"
39 #include "FreelistManager.h"
41 #include "BlueRocksEnv.h"
42 #include "auth/Crypto.h"
43 #include "common/EventTrace.h"
44 #include "perfglue/heap_profiler.h"
45 #include "common/blkdev.h"
46 #include "common/numa.h"
48 #if defined(WITH_LTTNG)
49 #define TRACEPOINT_DEFINE
50 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
51 #include "tracing/bluestore.h"
52 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
53 #undef TRACEPOINT_DEFINE
55 #define tracepoint(...)
58 #define dout_context cct
59 #define dout_subsys ceph_subsys_bluestore
61 using bid_t
= decltype(BlueStore::Blob::id
);
63 // bluestore_cache_onode
64 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode
, bluestore_onode
,
65 bluestore_cache_onode
);
67 // bluestore_cache_other
68 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer
, bluestore_buffer
,
70 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent
, bluestore_extent
,
72 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob
, bluestore_blob
,
74 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob
, bluestore_shared_blob
,
75 bluestore_SharedBlob
);
78 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext
, bluestore_transcontext
,
83 const string PREFIX_SUPER
= "S"; // field -> value
84 const string PREFIX_STAT
= "T"; // field -> value(int64 array)
85 const string PREFIX_COLL
= "C"; // collection name -> cnode_t
86 const string PREFIX_OBJ
= "O"; // object name -> onode_t
87 const string PREFIX_OMAP
= "M"; // u64 + keyname -> value
88 const string PREFIX_PGMETA_OMAP
= "P"; // u64 + keyname -> value(for meta coll)
89 const string PREFIX_PERPOOL_OMAP
= "m"; // s64 + u64 + keyname -> value
90 const string PREFIX_DEFERRED
= "L"; // id -> deferred_transaction_t
91 const string PREFIX_ALLOC
= "B"; // u64 offset -> u64 length (freelist)
92 const string PREFIX_ALLOC_BITMAP
= "b";// (see BitmapFreelistManager)
93 const string PREFIX_SHARED_BLOB
= "X"; // u64 offset -> shared_blob_t
95 const string BLUESTORE_GLOBAL_STATFS_KEY
= "bluestore_statfs";
97 // write a label in the first block. always use this size. note that
98 // bluefs makes a matching assumption about the location of its
99 // superblock (always the second block of the device).
100 #define BDEV_LABEL_BLOCK_SIZE 4096
102 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
103 #define SUPER_RESERVED 8192
105 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
109 * extent map blob encoding
111 * we use the low bits of the blobid field to indicate some common scenarios
112 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
114 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
115 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
116 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
117 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
118 #define BLOBID_SHIFT_BITS 4
121 * object name key structure
123 * encoded u8: shard + 2^7 (so that it sorts properly)
124 * encoded u64: poolid + 2^63 (so that it sorts properly)
125 * encoded u32: hash (bit reversed)
127 * escaped string: namespace
129 * escaped string: key or object name
130 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
131 * we are done. otherwise, we are followed by the object name.
132 * escaped string: object name (unless '=' above)
135 * encoded u64: generation
138 #define ONODE_KEY_SUFFIX 'o'
147 #define EXTENT_SHARD_KEY_SUFFIX 'x'
150 * string encoding in the key
152 * The key string needs to lexicographically sort the same way that
153 * ghobject_t does. We do this by escaping anything <= to '#' with #
154 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
157 * We use ! as a terminator for strings; this works because it is < #
158 * and will get escaped if it is present in the string.
160 * NOTE: There is a bug in this implementation: due to implicit
161 * character type conversion in comparison it may produce unexpected
162 * ordering. Unfortunately fixing the bug would mean invalidating the
163 * keys in existing deployments. Instead we do additional sorting
164 * where it is needed.
167 static void append_escaped(const string
&in
, S
*out
)
169 char hexbyte
[in
.length() * 3 + 1];
170 char* ptr
= &hexbyte
[0];
171 for (string::const_iterator i
= in
.begin(); i
!= in
.end(); ++i
) {
172 if (*i
<= '#') { // bug: unexpected result for *i > 0x7f
174 *ptr
++ = "0123456789abcdef"[(*i
>> 4) & 0x0f];
175 *ptr
++ = "0123456789abcdef"[*i
& 0x0f];
176 } else if (*i
>= '~') { // bug: unexpected result for *i > 0x7f
178 *ptr
++ = "0123456789abcdef"[(*i
>> 4) & 0x0f];
179 *ptr
++ = "0123456789abcdef"[*i
& 0x0f];
185 out
->append(hexbyte
, ptr
- &hexbyte
[0]);
188 inline unsigned h2i(char c
)
190 if ((c
>= '0') && (c
<= '9')) {
192 } else if ((c
>= 'a') && (c
<= 'f')) {
194 } else if ((c
>= 'A') && (c
<= 'F')) {
197 return 256; // make it always larger than 255
201 static int decode_escaped(const char *p
, string
*out
)
204 char* ptr
= &buff
[0];
205 char* max
= &buff
[252];
206 const char *orig_p
= p
;
207 while (*p
&& *p
!= '!') {
208 if (*p
== '#' || *p
== '~') {
211 hex
= h2i(*p
++) << 4;
224 out
->append(buff
, ptr
-buff
);
229 out
->append(buff
, ptr
-buff
);
234 // some things we encode in binary (as le32 or le64); print the
235 // resulting key strings nicely
237 static string
pretty_binary_string(const S
& in
)
241 out
.reserve(in
.length() * 3);
242 enum { NONE
, HEX
, STRING
} mode
= NONE
;
243 unsigned from
= 0, i
;
244 for (i
=0; i
< in
.length(); ++i
) {
245 if ((in
[i
] < 32 || (unsigned char)in
[i
] > 126) ||
246 (mode
== HEX
&& in
.length() - i
>= 4 &&
247 ((in
[i
] < 32 || (unsigned char)in
[i
] > 126) ||
248 (in
[i
+1] < 32 || (unsigned char)in
[i
+1] > 126) ||
249 (in
[i
+2] < 32 || (unsigned char)in
[i
+2] > 126) ||
250 (in
[i
+3] < 32 || (unsigned char)in
[i
+3] > 126)))) {
251 if (mode
== STRING
) {
252 out
.append(in
.c_str() + from
, i
- from
);
259 if (in
.length() - i
>= 4) {
260 // print a whole u32 at once
261 snprintf(buf
, sizeof(buf
), "%08x",
262 (uint32_t)(((unsigned char)in
[i
] << 24) |
263 ((unsigned char)in
[i
+1] << 16) |
264 ((unsigned char)in
[i
+2] << 8) |
265 ((unsigned char)in
[i
+3] << 0)));
268 snprintf(buf
, sizeof(buf
), "%02x", (int)(unsigned char)in
[i
]);
272 if (mode
!= STRING
) {
279 if (mode
== STRING
) {
280 out
.append(in
.c_str() + from
, i
- from
);
287 static void _key_encode_shard(shard_id_t shard
, T
*key
)
289 key
->push_back((char)((uint8_t)shard
.id
+ (uint8_t)0x80));
292 static const char *_key_decode_shard(const char *key
, shard_id_t
*pshard
)
294 pshard
->id
= (uint8_t)*key
- (uint8_t)0x80;
298 static void get_coll_range(const coll_t
& cid
, int bits
,
299 ghobject_t
*temp_start
, ghobject_t
*temp_end
,
300 ghobject_t
*start
, ghobject_t
*end
)
303 if (cid
.is_pg(&pgid
)) {
304 start
->shard_id
= pgid
.shard
;
305 *temp_start
= *start
;
307 start
->hobj
.pool
= pgid
.pool();
308 temp_start
->hobj
.pool
= -2ll - pgid
.pool();
311 *temp_end
= *temp_start
;
313 uint32_t reverse_hash
= hobject_t::_reverse_bits(pgid
.ps());
314 start
->hobj
.set_bitwise_key_u32(reverse_hash
);
315 temp_start
->hobj
.set_bitwise_key_u32(reverse_hash
);
317 uint64_t end_hash
= reverse_hash
+ (1ull << (32 - bits
));
318 if (end_hash
> 0xffffffffull
)
319 end_hash
= 0xffffffffull
;
321 end
->hobj
.set_bitwise_key_u32(end_hash
);
322 temp_end
->hobj
.set_bitwise_key_u32(end_hash
);
324 start
->shard_id
= shard_id_t::NO_SHARD
;
325 start
->hobj
.pool
= -1ull;
328 start
->hobj
.set_bitwise_key_u32(0);
329 end
->hobj
.set_bitwise_key_u32(0xffffffff);
331 // no separate temp section
336 start
->generation
= 0;
338 temp_start
->generation
= 0;
339 temp_end
->generation
= 0;
342 static void get_shared_blob_key(uint64_t sbid
, string
*key
)
345 _key_encode_u64(sbid
, key
);
348 static int get_key_shared_blob(const string
& key
, uint64_t *sbid
)
350 const char *p
= key
.c_str();
351 if (key
.length() < sizeof(uint64_t))
353 _key_decode_u64(p
, sbid
);
358 static void _key_encode_prefix(const ghobject_t
& oid
, S
*key
)
360 _key_encode_shard(oid
.shard_id
, key
);
361 _key_encode_u64(oid
.hobj
.pool
+ 0x8000000000000000ull
, key
);
362 _key_encode_u32(oid
.hobj
.get_bitwise_key_u32(), key
);
365 static const char *_key_decode_prefix(const char *p
, ghobject_t
*oid
)
367 p
= _key_decode_shard(p
, &oid
->shard_id
);
370 p
= _key_decode_u64(p
, &pool
);
371 oid
->hobj
.pool
= pool
- 0x8000000000000000ull
;
374 p
= _key_decode_u32(p
, &hash
);
376 oid
->hobj
.set_bitwise_key_u32(hash
);
381 #define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
384 static int get_key_object(const S
& key
, ghobject_t
*oid
)
387 const char *p
= key
.c_str();
389 if (key
.length() < ENCODED_KEY_PREFIX_LEN
)
392 p
= _key_decode_prefix(p
, oid
);
394 if (key
.length() == ENCODED_KEY_PREFIX_LEN
)
397 r
= decode_escaped(p
, &oid
->hobj
.nspace
);
403 r
= decode_escaped(p
, &k
);
410 oid
->hobj
.oid
.name
= k
;
411 } else if (*p
== '<' || *p
== '>') {
414 r
= decode_escaped(p
, &oid
->hobj
.oid
.name
);
418 oid
->hobj
.set_key(k
);
424 p
= _key_decode_u64(p
, &oid
->hobj
.snap
.val
);
425 p
= _key_decode_u64(p
, &oid
->generation
);
427 if (*p
!= ONODE_KEY_SUFFIX
) {
432 // if we get something other than a null terminator here,
433 // something goes wrong.
441 static void get_object_key(CephContext
*cct
, const ghobject_t
& oid
, S
*key
)
445 size_t max_len
= ENCODED_KEY_PREFIX_LEN
+
446 (oid
.hobj
.nspace
.length() * 3 + 1) +
447 (oid
.hobj
.get_key().length() * 3 + 1) +
448 1 + // for '<', '=', or '>'
449 (oid
.hobj
.oid
.name
.length() * 3 + 1) +
451 key
->reserve(max_len
);
453 _key_encode_prefix(oid
, key
);
455 append_escaped(oid
.hobj
.nspace
, key
);
457 if (oid
.hobj
.get_key().length()) {
458 // is a key... could be < = or >.
459 append_escaped(oid
.hobj
.get_key(), key
);
460 // (ASCII chars < = and > sort in that order, yay)
461 int r
= oid
.hobj
.get_key().compare(oid
.hobj
.oid
.name
);
463 key
->append(r
> 0 ? ">" : "<");
464 append_escaped(oid
.hobj
.oid
.name
, key
);
471 append_escaped(oid
.hobj
.oid
.name
, key
);
475 _key_encode_u64(oid
.hobj
.snap
, key
);
476 _key_encode_u64(oid
.generation
, key
);
478 key
->push_back(ONODE_KEY_SUFFIX
);
483 int r
= get_key_object(*key
, &t
);
485 derr
<< " r " << r
<< dendl
;
486 derr
<< "key " << pretty_binary_string(*key
) << dendl
;
487 derr
<< "oid " << oid
<< dendl
;
488 derr
<< " t " << t
<< dendl
;
489 ceph_assert(r
== 0 && t
== oid
);
495 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
496 // char lets us quickly test whether it is a shard key without decoding any
497 // of the prefix bytes.
499 static void get_extent_shard_key(const S
& onode_key
, uint32_t offset
,
503 key
->reserve(onode_key
.length() + 4 + 1);
504 key
->append(onode_key
.c_str(), onode_key
.size());
505 _key_encode_u32(offset
, key
);
506 key
->push_back(EXTENT_SHARD_KEY_SUFFIX
);
509 static void rewrite_extent_shard_key(uint32_t offset
, string
*key
)
511 ceph_assert(key
->size() > sizeof(uint32_t) + 1);
512 ceph_assert(*key
->rbegin() == EXTENT_SHARD_KEY_SUFFIX
);
513 _key_encode_u32(offset
, key
->size() - sizeof(uint32_t) - 1, key
);
517 static void generate_extent_shard_key_and_apply(
521 std::function
<void(const string
& final_key
)> apply
)
523 if (key
->empty()) { // make full key
524 ceph_assert(!onode_key
.empty());
525 get_extent_shard_key(onode_key
, offset
, key
);
527 rewrite_extent_shard_key(offset
, key
);
532 int get_key_extent_shard(const string
& key
, string
*onode_key
, uint32_t *offset
)
534 ceph_assert(key
.size() > sizeof(uint32_t) + 1);
535 ceph_assert(*key
.rbegin() == EXTENT_SHARD_KEY_SUFFIX
);
536 int okey_len
= key
.size() - sizeof(uint32_t) - 1;
537 *onode_key
= key
.substr(0, okey_len
);
538 const char *p
= key
.data() + okey_len
;
539 _key_decode_u32(p
, offset
);
543 static bool is_extent_shard_key(const string
& key
)
545 return *key
.rbegin() == EXTENT_SHARD_KEY_SUFFIX
;
548 static void get_deferred_key(uint64_t seq
, string
*out
)
550 _key_encode_u64(seq
, out
);
553 static void get_pool_stat_key(int64_t pool_id
, string
*key
)
556 _key_encode_u64(pool_id
, key
);
559 static int get_key_pool_stat(const string
& key
, uint64_t* pool_id
)
561 const char *p
= key
.c_str();
562 if (key
.length() < sizeof(uint64_t))
564 _key_decode_u64(p
, pool_id
);
568 template <int LogLevelV
>
569 void _dump_extent_map(CephContext
*cct
, const BlueStore::ExtentMap
&em
)
572 for (auto& s
: em
.shards
) {
573 dout(LogLevelV
) << __func__
<< " shard " << *s
.shard_info
574 << (s
.loaded
? " (loaded)" : "")
575 << (s
.dirty
? " (dirty)" : "")
578 for (auto& e
: em
.extent_map
) {
579 dout(LogLevelV
) << __func__
<< " " << e
<< dendl
;
580 ceph_assert(e
.logical_offset
>= pos
);
581 pos
= e
.logical_offset
+ e
.length
;
582 const bluestore_blob_t
& blob
= e
.blob
->get_blob();
583 if (blob
.has_csum()) {
585 unsigned n
= blob
.get_csum_count();
586 for (unsigned i
= 0; i
< n
; ++i
)
587 v
.push_back(blob
.get_csum_item(i
));
588 dout(LogLevelV
) << __func__
<< " csum: " << std::hex
<< v
<< std::dec
591 std::lock_guard
l(e
.blob
->shared_blob
->get_cache()->lock
);
592 for (auto& i
: e
.blob
->shared_blob
->bc
.buffer_map
) {
593 dout(LogLevelV
) << __func__
<< " 0x" << std::hex
<< i
.first
594 << "~" << i
.second
->length
<< std::dec
595 << " " << *i
.second
<< dendl
;
600 template <int LogLevelV
>
601 void _dump_onode(CephContext
*cct
, const BlueStore::Onode
& o
)
603 if (!cct
->_conf
->subsys
.should_gather
<ceph_subsys_bluestore
, LogLevelV
>())
605 dout(LogLevelV
) << __func__
<< " " << &o
<< " " << o
.oid
606 << " nid " << o
.onode
.nid
607 << " size 0x" << std::hex
<< o
.onode
.size
608 << " (" << std::dec
<< o
.onode
.size
<< ")"
609 << " expected_object_size " << o
.onode
.expected_object_size
610 << " expected_write_size " << o
.onode
.expected_write_size
611 << " in " << o
.onode
.extent_map_shards
.size() << " shards"
612 << ", " << o
.extent_map
.spanning_blob_map
.size()
615 for (auto p
= o
.onode
.attrs
.begin();
616 p
!= o
.onode
.attrs
.end();
618 dout(LogLevelV
) << __func__
<< " attr " << p
->first
619 << " len " << p
->second
.length() << dendl
;
621 _dump_extent_map
<LogLevelV
>(cct
, o
.extent_map
);
624 template <int LogLevelV
>
625 void _dump_transaction(CephContext
*cct
, ObjectStore::Transaction
*t
)
627 dout(LogLevelV
) << __func__
<< " transaction dump:\n";
628 JSONFormatter
f(true);
629 f
.open_object_section("transaction");
638 struct Int64ArrayMergeOperator
: public KeyValueDB::MergeOperator
{
639 void merge_nonexistent(
640 const char *rdata
, size_t rlen
, std::string
*new_value
) override
{
641 *new_value
= std::string(rdata
, rlen
);
644 const char *ldata
, size_t llen
,
645 const char *rdata
, size_t rlen
,
646 std::string
*new_value
) override
{
647 ceph_assert(llen
== rlen
);
648 ceph_assert((rlen
% 8) == 0);
649 new_value
->resize(rlen
);
650 const ceph_le64
* lv
= (const ceph_le64
*)ldata
;
651 const ceph_le64
* rv
= (const ceph_le64
*)rdata
;
652 ceph_le64
* nv
= &(ceph_le64
&)new_value
->at(0);
653 for (size_t i
= 0; i
< rlen
>> 3; ++i
) {
654 nv
[i
] = lv
[i
] + rv
[i
];
657 // We use each operator name and each prefix to construct the
658 // overall RocksDB operator name for consistency check at open time.
659 const char *name() const override
{
660 return "int64_array";
667 ostream
& operator<<(ostream
& out
, const BlueStore::Buffer
& b
)
669 out
<< "buffer(" << &b
<< " space " << b
.space
<< " 0x" << std::hex
670 << b
.offset
<< "~" << b
.length
<< std::dec
671 << " " << BlueStore::Buffer::get_state_name(b
.state
);
673 out
<< " " << BlueStore::Buffer::get_flag_name(b
.flags
);
680 * Due to a bug in key string encoding (see a comment for append_escaped)
681 * the KeyValueDB iterator does not lexicographically sort the same
682 * way that ghobject_t does: objects with the same hash may have wrong order.
684 * This is the iterator wrapper that fixes the keys order.
687 class CollectionListIterator
{
689 CollectionListIterator(const KeyValueDB::Iterator
&it
)
692 virtual ~CollectionListIterator() {
695 virtual bool valid() const = 0;
696 virtual const ghobject_t
&oid() const = 0;
697 virtual void lower_bound(const ghobject_t
&oid
) = 0;
698 virtual void upper_bound(const ghobject_t
&oid
) = 0;
699 virtual void next() = 0;
701 virtual int cmp(const ghobject_t
&oid
) const = 0;
703 bool is_ge(const ghobject_t
&oid
) const {
704 return cmp(oid
) >= 0;
707 bool is_lt(const ghobject_t
&oid
) const {
712 KeyValueDB::Iterator m_it
;
715 class SimpleCollectionListIterator
: public CollectionListIterator
{
717 SimpleCollectionListIterator(CephContext
*cct
, const KeyValueDB::Iterator
&it
)
718 : CollectionListIterator(it
), m_cct(cct
) {
721 bool valid() const override
{
722 return m_it
->valid();
725 const ghobject_t
&oid() const override
{
726 ceph_assert(valid());
731 void lower_bound(const ghobject_t
&oid
) override
{
733 get_object_key(m_cct
, oid
, &key
);
735 m_it
->lower_bound(key
);
739 void upper_bound(const ghobject_t
&oid
) override
{
741 get_object_key(m_cct
, oid
, &key
);
743 m_it
->upper_bound(key
);
747 void next() override
{
748 ceph_assert(valid());
754 int cmp(const ghobject_t
&oid
) const override
{
755 ceph_assert(valid());
758 get_object_key(m_cct
, oid
, &key
);
760 return m_it
->key().compare(key
);
772 if (is_extent_shard_key(m_it
->key())) {
777 m_oid
= ghobject_t();
778 int r
= get_key_object(m_it
->key(), &m_oid
);
783 class SortedCollectionListIterator
: public CollectionListIterator
{
785 SortedCollectionListIterator(const KeyValueDB::Iterator
&it
)
786 : CollectionListIterator(it
), m_chunk_iter(m_chunk
.end()) {
789 bool valid() const override
{
790 return m_chunk_iter
!= m_chunk
.end();
793 const ghobject_t
&oid() const override
{
794 ceph_assert(valid());
796 return m_chunk_iter
->first
;
799 void lower_bound(const ghobject_t
&oid
) override
{
801 _key_encode_prefix(oid
, &key
);
803 m_it
->lower_bound(key
);
804 m_chunk_iter
= m_chunk
.end();
805 if (!get_next_chunk()) {
809 if (this->oid().shard_id
!= oid
.shard_id
||
810 this->oid().hobj
.pool
!= oid
.hobj
.pool
||
811 this->oid().hobj
.get_bitwise_key_u32() != oid
.hobj
.get_bitwise_key_u32()) {
815 m_chunk_iter
= m_chunk
.lower_bound(oid
);
816 if (m_chunk_iter
== m_chunk
.end()) {
821 void upper_bound(const ghobject_t
&oid
) override
{
824 if (valid() && this->oid() == oid
) {
829 void next() override
{
830 ceph_assert(valid());
833 if (m_chunk_iter
== m_chunk
.end()) {
838 int cmp(const ghobject_t
&oid
) const override
{
839 ceph_assert(valid());
841 if (this->oid() < oid
) {
844 if (this->oid() > oid
) {
851 std::map
<ghobject_t
, std::string
> m_chunk
;
852 std::map
<ghobject_t
, std::string
>::iterator m_chunk_iter
;
854 bool get_next_chunk() {
855 while (m_it
->valid() && is_extent_shard_key(m_it
->key())) {
859 if (!m_it
->valid()) {
864 int r
= get_key_object(m_it
->key(), &oid
);
869 m_chunk
.insert({oid
, m_it
->key()});
873 } while (m_it
->valid() && is_extent_shard_key(m_it
->key()));
875 if (!m_it
->valid()) {
880 r
= get_key_object(m_it
->key(), &next
);
882 if (next
.shard_id
!= oid
.shard_id
||
883 next
.hobj
.pool
!= oid
.hobj
.pool
||
884 next
.hobj
.get_bitwise_key_u32() != oid
.hobj
.get_bitwise_key_u32()) {
890 m_chunk_iter
= m_chunk
.begin();
895 } // anonymous namespace
899 void BlueStore::GarbageCollector::process_protrusive_extents(
900 const BlueStore::ExtentMap
& extent_map
,
901 uint64_t start_offset
,
903 uint64_t start_touch_offset
,
904 uint64_t end_touch_offset
,
905 uint64_t min_alloc_size
)
907 ceph_assert(start_offset
<= start_touch_offset
&& end_offset
>= end_touch_offset
);
909 uint64_t lookup_start_offset
= p2align(start_offset
, min_alloc_size
);
910 uint64_t lookup_end_offset
= round_up_to(end_offset
, min_alloc_size
);
912 dout(30) << __func__
<< " (hex): [" << std::hex
913 << lookup_start_offset
<< ", " << lookup_end_offset
914 << ")" << std::dec
<< dendl
;
916 for (auto it
= extent_map
.seek_lextent(lookup_start_offset
);
917 it
!= extent_map
.extent_map
.end() &&
918 it
->logical_offset
< lookup_end_offset
;
920 uint64_t alloc_unit_start
= it
->logical_offset
/ min_alloc_size
;
921 uint64_t alloc_unit_end
= (it
->logical_end() - 1) / min_alloc_size
;
923 dout(30) << __func__
<< " " << *it
924 << "alloc_units: " << alloc_unit_start
<< ".." << alloc_unit_end
927 Blob
* b
= it
->blob
.get();
929 if (it
->logical_offset
>=start_touch_offset
&&
930 it
->logical_end() <= end_touch_offset
) {
931 // Process extents within the range affected by
932 // the current write request.
933 // Need to take into account if existing extents
934 // can be merged with them (uncompressed case)
935 if (!b
->get_blob().is_compressed()) {
936 if (blob_info_counted
&& used_alloc_unit
== alloc_unit_start
) {
937 --blob_info_counted
->expected_allocations
; // don't need to allocate
938 // new AU for compressed
939 // data since another
940 // collocated uncompressed
941 // blob already exists
942 dout(30) << __func__
<< " --expected:"
943 << alloc_unit_start
<< dendl
;
945 used_alloc_unit
= alloc_unit_end
;
946 blob_info_counted
= nullptr;
948 } else if (b
->get_blob().is_compressed()) {
950 // additionally we take compressed blobs that were not impacted
951 // by the write into account too
953 affected_blobs
.emplace(
954 b
, BlobInfo(b
->get_referenced_bytes())).first
->second
;
957 (used_alloc_unit
&& used_alloc_unit
== alloc_unit_start
) ? 0 : 1;
958 bi
.expected_allocations
+= alloc_unit_end
- alloc_unit_start
+ adjust
;
959 dout(30) << __func__
<< " expected_allocations="
960 << bi
.expected_allocations
<< " end_au:"
961 << alloc_unit_end
<< dendl
;
963 blob_info_counted
= &bi
;
964 used_alloc_unit
= alloc_unit_end
;
966 ceph_assert(it
->length
<= bi
.referenced_bytes
);
967 bi
.referenced_bytes
-= it
->length
;
968 dout(30) << __func__
<< " affected_blob:" << *b
969 << " unref 0x" << std::hex
<< it
->length
970 << " referenced = 0x" << bi
.referenced_bytes
971 << std::dec
<< dendl
;
972 // NOTE: we can't move specific blob to resulting GC list here
973 // when reference counter == 0 since subsequent extents might
974 // decrement its expected_allocation.
975 // Hence need to enumerate all the extents first.
976 if (!bi
.collect_candidate
) {
977 bi
.first_lextent
= it
;
978 bi
.collect_candidate
= true;
980 bi
.last_lextent
= it
;
982 if (blob_info_counted
&& used_alloc_unit
== alloc_unit_start
) {
983 // don't need to allocate new AU for compressed data since another
984 // collocated uncompressed blob already exists
985 --blob_info_counted
->expected_allocations
;
986 dout(30) << __func__
<< " --expected_allocations:"
987 << alloc_unit_start
<< dendl
;
989 used_alloc_unit
= alloc_unit_end
;
990 blob_info_counted
= nullptr;
994 for (auto b_it
= affected_blobs
.begin();
995 b_it
!= affected_blobs
.end();
997 Blob
* b
= b_it
->first
;
998 BlobInfo
& bi
= b_it
->second
;
999 if (bi
.referenced_bytes
== 0) {
1000 uint64_t len_on_disk
= b_it
->first
->get_blob().get_ondisk_length();
1001 int64_t blob_expected_for_release
=
1002 round_up_to(len_on_disk
, min_alloc_size
) / min_alloc_size
;
1004 dout(30) << __func__
<< " " << *(b_it
->first
)
1005 << " expected4release=" << blob_expected_for_release
1006 << " expected_allocations=" << bi
.expected_allocations
1008 int64_t benefit
= blob_expected_for_release
- bi
.expected_allocations
;
1009 if (benefit
>= g_conf()->bluestore_gc_enable_blob_threshold
) {
1010 if (bi
.collect_candidate
) {
1011 auto it
= bi
.first_lextent
;
1014 if (it
->blob
.get() == b
) {
1015 extents_to_collect
.insert(it
->logical_offset
, it
->length
);
1017 bExit
= it
== bi
.last_lextent
;
1021 expected_for_release
+= blob_expected_for_release
;
1022 expected_allocations
+= bi
.expected_allocations
;
1028 int64_t BlueStore::GarbageCollector::estimate(
1029 uint64_t start_offset
,
1031 const BlueStore::ExtentMap
& extent_map
,
1032 const BlueStore::old_extent_map_t
& old_extents
,
1033 uint64_t min_alloc_size
)
1036 affected_blobs
.clear();
1037 extents_to_collect
.clear();
1038 used_alloc_unit
= boost::optional
<uint64_t >();
1039 blob_info_counted
= nullptr;
1041 uint64_t gc_start_offset
= start_offset
;
1042 uint64_t gc_end_offset
= start_offset
+ length
;
1044 uint64_t end_offset
= start_offset
+ length
;
1046 for (auto it
= old_extents
.begin(); it
!= old_extents
.end(); ++it
) {
1047 Blob
* b
= it
->e
.blob
.get();
1048 if (b
->get_blob().is_compressed()) {
1050 // update gc_start_offset/gc_end_offset if needed
1051 gc_start_offset
= min(gc_start_offset
, (uint64_t)it
->e
.blob_start());
1052 gc_end_offset
= std::max(gc_end_offset
, (uint64_t)it
->e
.blob_end());
1054 auto o
= it
->e
.logical_offset
;
1055 auto l
= it
->e
.length
;
1057 uint64_t ref_bytes
= b
->get_referenced_bytes();
1058 // micro optimization to bypass blobs that have no more references
1059 if (ref_bytes
!= 0) {
1060 dout(30) << __func__
<< " affected_blob:" << *b
1061 << " unref 0x" << std::hex
<< o
<< "~" << l
1062 << std::dec
<< dendl
;
1063 affected_blobs
.emplace(b
, BlobInfo(ref_bytes
));
1067 dout(30) << __func__
<< " gc range(hex): [" << std::hex
1068 << gc_start_offset
<< ", " << gc_end_offset
1069 << ")" << std::dec
<< dendl
;
1071 // enumerate preceeding extents to check if they reference affected blobs
1072 if (gc_start_offset
< start_offset
|| gc_end_offset
> end_offset
) {
1073 process_protrusive_extents(extent_map
,
1080 return expected_for_release
- expected_allocations
;
1083 // LruOnodeCacheShard
1084 struct LruOnodeCacheShard
: public BlueStore::OnodeCacheShard
{
1085 typedef boost::intrusive::list
<
1087 boost::intrusive::member_hook
<
1089 boost::intrusive::list_member_hook
<>,
1090 &BlueStore::Onode::lru_item
> > list_t
;
1094 explicit LruOnodeCacheShard(CephContext
*cct
) : BlueStore::OnodeCacheShard(cct
) {}
1096 void _add(BlueStore::Onode
* o
, int level
) override
1098 if (o
->put_cache()) {
1099 (level
> 0) ? lru
.push_front(*o
) : lru
.push_back(*o
);
1103 ++num
; // we count both pinned and unpinned entries
1104 dout(20) << __func__
<< " " << this << " " << o
->oid
<< " added, num=" << num
<< dendl
;
1106 void _rm(BlueStore::Onode
* o
) override
1108 if (o
->pop_cache()) {
1109 lru
.erase(lru
.iterator_to(*o
));
1111 ceph_assert(num_pinned
);
1116 dout(20) << __func__
<< " " << this << " " << " " << o
->oid
<< " removed, num=" << num
<< dendl
;
1118 void _pin(BlueStore::Onode
* o
) override
1120 lru
.erase(lru
.iterator_to(*o
));
1122 dout(20) << __func__
<< this << " " << " " << " " << o
->oid
<< " pinned" << dendl
;
1124 void _unpin(BlueStore::Onode
* o
) override
1127 ceph_assert(num_pinned
);
1129 dout(20) << __func__
<< this << " " << " " << " " << o
->oid
<< " unpinned" << dendl
;
1131 void _unpin_and_rm(BlueStore::Onode
* o
) override
1134 ceph_assert(num_pinned
);
1139 void _trim_to(uint64_t new_size
) override
1141 if (new_size
>= lru
.size()) {
1142 return; // don't even try
1144 uint64_t n
= lru
.size() - new_size
;
1146 ceph_assert(p
!= lru
.begin());
1148 ceph_assert(num
>= n
);
1151 BlueStore::Onode
*o
= &*p
;
1152 dout(20) << __func__
<< " rm " << o
->oid
<< " "
1153 << o
->nref
<< " " << o
->cached
<< " " << o
->pinned
<< dendl
;
1154 if (p
!= lru
.begin()) {
1157 ceph_assert(n
== 0);
1160 auto pinned
= !o
->pop_cache();
1161 ceph_assert(!pinned
);
1162 o
->c
->onode_map
._remove(o
->oid
);
1165 void move_pinned(OnodeCacheShard
*to
, BlueStore::Onode
*o
) override
1170 ceph_assert(o
->cached
);
1171 ceph_assert(o
->pinned
);
1173 ceph_assert(num_pinned
);
1179 void add_stats(uint64_t *onodes
, uint64_t *pinned_onodes
) override
1182 *pinned_onodes
+= num_pinned
;
1187 BlueStore::OnodeCacheShard
*BlueStore::OnodeCacheShard::create(
1190 PerfCounters
*logger
)
1192 BlueStore::OnodeCacheShard
*c
= nullptr;
1193 // Currently we only implement an LRU cache for onodes
1194 c
= new LruOnodeCacheShard(cct
);
1199 // LruBufferCacheShard
1200 struct LruBufferCacheShard
: public BlueStore::BufferCacheShard
{
1201 typedef boost::intrusive::list
<
1203 boost::intrusive::member_hook
<
1205 boost::intrusive::list_member_hook
<>,
1206 &BlueStore::Buffer::lru_item
> > list_t
;
1209 explicit LruBufferCacheShard(CephContext
*cct
) : BlueStore::BufferCacheShard(cct
) {}
1211 void _add(BlueStore::Buffer
*b
, int level
, BlueStore::Buffer
*near
) override
{
1213 auto q
= lru
.iterator_to(*near
);
1215 } else if (level
> 0) {
1220 buffer_bytes
+= b
->length
;
1223 void _rm(BlueStore::Buffer
*b
) override
{
1224 ceph_assert(buffer_bytes
>= b
->length
);
1225 buffer_bytes
-= b
->length
;
1226 auto q
= lru
.iterator_to(*b
);
1230 void _move(BlueStore::BufferCacheShard
*src
, BlueStore::Buffer
*b
) override
{
1232 _add(b
, 0, nullptr);
1234 void _adjust_size(BlueStore::Buffer
*b
, int64_t delta
) override
{
1235 ceph_assert((int64_t)buffer_bytes
+ delta
>= 0);
1236 buffer_bytes
+= delta
;
1238 void _touch(BlueStore::Buffer
*b
) override
{
1239 auto p
= lru
.iterator_to(*b
);
1243 _audit("_touch_buffer end");
1246 void _trim_to(uint64_t max
) override
1248 while (buffer_bytes
> max
) {
1249 auto i
= lru
.rbegin();
1250 if (i
== lru
.rend()) {
1251 // stop if lru is now empty
1255 BlueStore::Buffer
*b
= &*i
;
1256 ceph_assert(b
->is_clean());
1257 dout(20) << __func__
<< " rm " << *b
<< dendl
;
1258 b
->space
->_rm_buffer(this, b
);
1263 void add_stats(uint64_t *extents
,
1266 uint64_t *bytes
) override
{
1267 *extents
+= num_extents
;
1268 *blobs
+= num_blobs
;
1270 *bytes
+= buffer_bytes
;
1273 void _audit(const char *s
) override
1275 dout(10) << __func__
<< " " << when
<< " start" << dendl
;
1277 for (auto i
= lru
.begin(); i
!= lru
.end(); ++i
) {
1280 if (s
!= buffer_bytes
) {
1281 derr
<< __func__
<< " buffer_size " << buffer_bytes
<< " actual " << s
1283 for (auto i
= lru
.begin(); i
!= lru
.end(); ++i
) {
1284 derr
<< __func__
<< " " << *i
<< dendl
;
1286 ceph_assert(s
== buffer_bytes
);
1288 dout(20) << __func__
<< " " << when
<< " buffer_bytes " << buffer_bytes
1294 // TwoQBufferCacheShard
1296 struct TwoQBufferCacheShard
: public BlueStore::BufferCacheShard
{
1297 typedef boost::intrusive::list
<
1299 boost::intrusive::member_hook
<
1301 boost::intrusive::list_member_hook
<>,
1302 &BlueStore::Buffer::lru_item
> > list_t
;
1303 list_t hot
; ///< "Am" hot buffers
1304 list_t warm_in
; ///< "A1in" newly warm buffers
1305 list_t warm_out
; ///< "A1out" empty buffers we've evicted
1306 uint64_t buffer_bytes
= 0; ///< bytes
1310 BUFFER_WARM_IN
, ///< in warm_in
1311 BUFFER_WARM_OUT
, ///< in warm_out
1312 BUFFER_HOT
, ///< in hot
1316 uint64_t list_bytes
[BUFFER_TYPE_MAX
] = {0}; ///< bytes per type
1319 explicit TwoQBufferCacheShard(CephContext
*cct
) : BufferCacheShard(cct
) {}
1321 void _add(BlueStore::Buffer
*b
, int level
, BlueStore::Buffer
*near
) override
1323 dout(20) << __func__
<< " level " << level
<< " near " << near
1325 << " which has cache_private " << b
->cache_private
<< dendl
;
1327 b
->cache_private
= near
->cache_private
;
1328 switch (b
->cache_private
) {
1329 case BUFFER_WARM_IN
:
1330 warm_in
.insert(warm_in
.iterator_to(*near
), *b
);
1332 case BUFFER_WARM_OUT
:
1333 ceph_assert(b
->is_empty());
1334 warm_out
.insert(warm_out
.iterator_to(*near
), *b
);
1337 hot
.insert(hot
.iterator_to(*near
), *b
);
1340 ceph_abort_msg("bad cache_private");
1342 } else if (b
->cache_private
== BUFFER_NEW
) {
1343 b
->cache_private
= BUFFER_WARM_IN
;
1345 warm_in
.push_front(*b
);
1347 // take caller hint to start at the back of the warm queue
1348 warm_in
.push_back(*b
);
1351 // we got a hint from discard
1352 switch (b
->cache_private
) {
1353 case BUFFER_WARM_IN
:
1354 // stay in warm_in. move to front, even though 2Q doesn't actually
1356 dout(20) << __func__
<< " move to front of warm " << *b
<< dendl
;
1357 warm_in
.push_front(*b
);
1359 case BUFFER_WARM_OUT
:
1360 b
->cache_private
= BUFFER_HOT
;
1361 // move to hot. fall-thru
1363 dout(20) << __func__
<< " move to front of hot " << *b
<< dendl
;
1367 ceph_abort_msg("bad cache_private");
1370 if (!b
->is_empty()) {
1371 buffer_bytes
+= b
->length
;
1372 list_bytes
[b
->cache_private
] += b
->length
;
1374 num
= hot
.size() + warm_in
.size();
1377 void _rm(BlueStore::Buffer
*b
) override
1379 dout(20) << __func__
<< " " << *b
<< dendl
;
1380 if (!b
->is_empty()) {
1381 ceph_assert(buffer_bytes
>= b
->length
);
1382 buffer_bytes
-= b
->length
;
1383 ceph_assert(list_bytes
[b
->cache_private
] >= b
->length
);
1384 list_bytes
[b
->cache_private
] -= b
->length
;
1386 switch (b
->cache_private
) {
1387 case BUFFER_WARM_IN
:
1388 warm_in
.erase(warm_in
.iterator_to(*b
));
1390 case BUFFER_WARM_OUT
:
1391 warm_out
.erase(warm_out
.iterator_to(*b
));
1394 hot
.erase(hot
.iterator_to(*b
));
1397 ceph_abort_msg("bad cache_private");
1399 num
= hot
.size() + warm_in
.size();
1402 void _move(BlueStore::BufferCacheShard
*srcc
, BlueStore::Buffer
*b
) override
1404 TwoQBufferCacheShard
*src
= static_cast<TwoQBufferCacheShard
*>(srcc
);
1407 // preserve which list we're on (even if we can't preserve the order!)
1408 switch (b
->cache_private
) {
1409 case BUFFER_WARM_IN
:
1410 ceph_assert(!b
->is_empty());
1411 warm_in
.push_back(*b
);
1413 case BUFFER_WARM_OUT
:
1414 ceph_assert(b
->is_empty());
1415 warm_out
.push_back(*b
);
1418 ceph_assert(!b
->is_empty());
1422 ceph_abort_msg("bad cache_private");
1424 if (!b
->is_empty()) {
1425 buffer_bytes
+= b
->length
;
1426 list_bytes
[b
->cache_private
] += b
->length
;
1428 num
= hot
.size() + warm_in
.size();
1431 void _adjust_size(BlueStore::Buffer
*b
, int64_t delta
) override
1433 dout(20) << __func__
<< " delta " << delta
<< " on " << *b
<< dendl
;
1434 if (!b
->is_empty()) {
1435 ceph_assert((int64_t)buffer_bytes
+ delta
>= 0);
1436 buffer_bytes
+= delta
;
1437 ceph_assert((int64_t)list_bytes
[b
->cache_private
] + delta
>= 0);
1438 list_bytes
[b
->cache_private
] += delta
;
1442 void _touch(BlueStore::Buffer
*b
) override
{
1443 switch (b
->cache_private
) {
1444 case BUFFER_WARM_IN
:
1445 // do nothing (somewhat counter-intuitively!)
1447 case BUFFER_WARM_OUT
:
1448 // move from warm_out to hot LRU
1449 ceph_abort_msg("this happens via discard hint");
1452 // move to front of hot LRU
1453 hot
.erase(hot
.iterator_to(*b
));
1457 num
= hot
.size() + warm_in
.size();
1458 _audit("_touch_buffer end");
1461 void _trim_to(uint64_t max
) override
1463 if (buffer_bytes
> max
) {
1464 uint64_t kin
= max
* cct
->_conf
->bluestore_2q_cache_kin_ratio
;
1465 uint64_t khot
= max
- kin
;
1467 // pre-calculate kout based on average buffer size too,
1468 // which is typical(the warm_in and hot lists may change later)
1470 uint64_t buffer_num
= hot
.size() + warm_in
.size();
1472 uint64_t avg_size
= buffer_bytes
/ buffer_num
;
1473 ceph_assert(avg_size
);
1474 uint64_t calculated_num
= max
/ avg_size
;
1475 kout
= calculated_num
* cct
->_conf
->bluestore_2q_cache_kout_ratio
;
1478 if (list_bytes
[BUFFER_HOT
] < khot
) {
1479 // hot is small, give slack to warm_in
1480 kin
+= khot
- list_bytes
[BUFFER_HOT
];
1481 } else if (list_bytes
[BUFFER_WARM_IN
] < kin
) {
1482 // warm_in is small, give slack to hot
1483 khot
+= kin
- list_bytes
[BUFFER_WARM_IN
];
1486 // adjust warm_in list
1487 int64_t to_evict_bytes
= list_bytes
[BUFFER_WARM_IN
] - kin
;
1488 uint64_t evicted
= 0;
1490 while (to_evict_bytes
> 0) {
1491 auto p
= warm_in
.rbegin();
1492 if (p
== warm_in
.rend()) {
1493 // stop if warm_in list is now empty
1497 BlueStore::Buffer
*b
= &*p
;
1498 ceph_assert(b
->is_clean());
1499 dout(20) << __func__
<< " buffer_warm_in -> out " << *b
<< dendl
;
1500 ceph_assert(buffer_bytes
>= b
->length
);
1501 buffer_bytes
-= b
->length
;
1502 ceph_assert(list_bytes
[BUFFER_WARM_IN
] >= b
->length
);
1503 list_bytes
[BUFFER_WARM_IN
] -= b
->length
;
1504 to_evict_bytes
-= b
->length
;
1505 evicted
+= b
->length
;
1506 b
->state
= BlueStore::Buffer::STATE_EMPTY
;
1508 warm_in
.erase(warm_in
.iterator_to(*b
));
1509 warm_out
.push_front(*b
);
1510 b
->cache_private
= BUFFER_WARM_OUT
;
1514 dout(20) << __func__
<< " evicted " << byte_u_t(evicted
)
1515 << " from warm_in list, done evicting warm_in buffers"
1520 to_evict_bytes
= list_bytes
[BUFFER_HOT
] - khot
;
1523 while (to_evict_bytes
> 0) {
1524 auto p
= hot
.rbegin();
1525 if (p
== hot
.rend()) {
1526 // stop if hot list is now empty
1530 BlueStore::Buffer
*b
= &*p
;
1531 dout(20) << __func__
<< " buffer_hot rm " << *b
<< dendl
;
1532 ceph_assert(b
->is_clean());
1533 // adjust evict size before buffer goes invalid
1534 to_evict_bytes
-= b
->length
;
1535 evicted
+= b
->length
;
1536 b
->space
->_rm_buffer(this, b
);
1540 dout(20) << __func__
<< " evicted " << byte_u_t(evicted
)
1541 << " from hot list, done evicting hot buffers"
1545 // adjust warm out list too, if necessary
1546 int64_t n
= warm_out
.size() - kout
;
1548 BlueStore::Buffer
*b
= &*warm_out
.rbegin();
1549 ceph_assert(b
->is_empty());
1550 dout(20) << __func__
<< " buffer_warm_out rm " << *b
<< dendl
;
1551 b
->space
->_rm_buffer(this, b
);
1554 num
= hot
.size() + warm_in
.size();
1557 void add_stats(uint64_t *extents
,
1560 uint64_t *bytes
) override
{
1561 *extents
+= num_extents
;
1562 *blobs
+= num_blobs
;
1564 *bytes
+= buffer_bytes
;
1568 void _audit(const char *s
) override
1570 dout(10) << __func__
<< " " << when
<< " start" << dendl
;
1572 for (auto i
= hot
.begin(); i
!= hot
.end(); ++i
) {
1576 uint64_t hot_bytes
= s
;
1577 if (hot_bytes
!= list_bytes
[BUFFER_HOT
]) {
1578 derr
<< __func__
<< " hot_list_bytes "
1579 << list_bytes
[BUFFER_HOT
]
1580 << " != actual " << hot_bytes
1582 ceph_assert(hot_bytes
== list_bytes
[BUFFER_HOT
]);
1585 for (auto i
= warm_in
.begin(); i
!= warm_in
.end(); ++i
) {
1589 uint64_t warm_in_bytes
= s
- hot_bytes
;
1590 if (warm_in_bytes
!= list_bytes
[BUFFER_WARM_IN
]) {
1591 derr
<< __func__
<< " warm_in_list_bytes "
1592 << list_bytes
[BUFFER_WARM_IN
]
1593 << " != actual " << warm_in_bytes
1595 ceph_assert(warm_in_bytes
== list_bytes
[BUFFER_WARM_IN
]);
1598 if (s
!= buffer_bytes
) {
1599 derr
<< __func__
<< " buffer_bytes " << buffer_bytes
<< " actual " << s
1601 ceph_assert(s
== buffer_bytes
);
1604 dout(20) << __func__
<< " " << when
<< " buffer_bytes " << buffer_bytes
1612 BlueStore::BufferCacheShard
*BlueStore::BufferCacheShard::create(
1615 PerfCounters
*logger
)
1617 BufferCacheShard
*c
= nullptr;
1619 c
= new LruBufferCacheShard(cct
);
1620 else if (type
== "2q")
1621 c
= new TwoQBufferCacheShard(cct
);
1623 ceph_abort_msg("unrecognized cache type");
1631 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1633 void BlueStore::BufferSpace::_clear(BufferCacheShard
* cache
)
1635 // note: we already hold cache->lock
1636 ldout(cache
->cct
, 20) << __func__
<< dendl
;
1637 while (!buffer_map
.empty()) {
1638 _rm_buffer(cache
, buffer_map
.begin());
1642 int BlueStore::BufferSpace::_discard(BufferCacheShard
* cache
, uint32_t offset
, uint32_t length
)
1644 // note: we already hold cache->lock
1645 ldout(cache
->cct
, 20) << __func__
<< std::hex
<< " 0x" << offset
<< "~" << length
1646 << std::dec
<< dendl
;
1647 int cache_private
= 0;
1648 cache
->_audit("discard start");
1649 auto i
= _data_lower_bound(offset
);
1650 uint32_t end
= offset
+ length
;
1651 while (i
!= buffer_map
.end()) {
1652 Buffer
*b
= i
->second
.get();
1653 if (b
->offset
>= end
) {
1656 if (b
->cache_private
> cache_private
) {
1657 cache_private
= b
->cache_private
;
1659 if (b
->offset
< offset
) {
1660 int64_t front
= offset
- b
->offset
;
1661 if (b
->end() > end
) {
1662 // drop middle (split)
1663 uint32_t tail
= b
->end() - end
;
1664 if (b
->data
.length()) {
1666 bl
.substr_of(b
->data
, b
->length
- tail
, tail
);
1667 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
);
1668 nb
->maybe_rebuild();
1669 _add_buffer(cache
, nb
, 0, b
);
1671 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, tail
),
1674 if (!b
->is_writing()) {
1675 cache
->_adjust_size(b
, front
- (int64_t)b
->length
);
1679 cache
->_audit("discard end 1");
1683 if (!b
->is_writing()) {
1684 cache
->_adjust_size(b
, front
- (int64_t)b
->length
);
1692 if (b
->end() <= end
) {
1693 // drop entire buffer
1694 _rm_buffer(cache
, i
++);
1698 uint32_t keep
= b
->end() - end
;
1699 if (b
->data
.length()) {
1701 bl
.substr_of(b
->data
, b
->length
- keep
, keep
);
1702 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
);
1703 nb
->maybe_rebuild();
1704 _add_buffer(cache
, nb
, 0, b
);
1706 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, keep
), 0, b
);
1708 _rm_buffer(cache
, i
);
1709 cache
->_audit("discard end 2");
1712 return cache_private
;
1715 void BlueStore::BufferSpace::read(
1716 BufferCacheShard
* cache
,
1719 BlueStore::ready_regions_t
& res
,
1720 interval_set
<uint32_t>& res_intervals
,
1724 res_intervals
.clear();
1725 uint32_t want_bytes
= length
;
1726 uint32_t end
= offset
+ length
;
1729 std::lock_guard
l(cache
->lock
);
1730 for (auto i
= _data_lower_bound(offset
);
1731 i
!= buffer_map
.end() && offset
< end
&& i
->first
< end
;
1733 Buffer
*b
= i
->second
.get();
1734 ceph_assert(b
->end() > offset
);
1737 if (flags
& BYPASS_CLEAN_CACHE
)
1738 val
= b
->is_writing();
1740 val
= b
->is_writing() || b
->is_clean();
1742 if (b
->offset
< offset
) {
1743 uint32_t skip
= offset
- b
->offset
;
1744 uint32_t l
= min(length
, b
->length
- skip
);
1745 res
[offset
].substr_of(b
->data
, skip
, l
);
1746 res_intervals
.insert(offset
, l
);
1749 if (!b
->is_writing()) {
1754 if (b
->offset
> offset
) {
1755 uint32_t gap
= b
->offset
- offset
;
1756 if (length
<= gap
) {
1762 if (!b
->is_writing()) {
1765 if (b
->length
> length
) {
1766 res
[offset
].substr_of(b
->data
, 0, length
);
1767 res_intervals
.insert(offset
, length
);
1770 res
[offset
].append(b
->data
);
1771 res_intervals
.insert(offset
, b
->length
);
1772 if (b
->length
== length
)
1774 offset
+= b
->length
;
1775 length
-= b
->length
;
1781 uint64_t hit_bytes
= res_intervals
.size();
1782 ceph_assert(hit_bytes
<= want_bytes
);
1783 uint64_t miss_bytes
= want_bytes
- hit_bytes
;
1784 cache
->logger
->inc(l_bluestore_buffer_hit_bytes
, hit_bytes
);
1785 cache
->logger
->inc(l_bluestore_buffer_miss_bytes
, miss_bytes
);
1788 void BlueStore::BufferSpace::_finish_write(BufferCacheShard
* cache
, uint64_t seq
)
1790 auto i
= writing
.begin();
1791 while (i
!= writing
.end()) {
1801 ceph_assert(b
->is_writing());
1803 if (b
->flags
& Buffer::FLAG_NOCACHE
) {
1805 ldout(cache
->cct
, 20) << __func__
<< " discard " << *b
<< dendl
;
1806 buffer_map
.erase(b
->offset
);
1808 b
->state
= Buffer::STATE_CLEAN
;
1811 b
->data
.reassign_to_mempool(mempool::mempool_bluestore_cache_data
);
1812 cache
->_add(b
, 1, nullptr);
1813 ldout(cache
->cct
, 20) << __func__
<< " added " << *b
<< dendl
;
1817 cache
->_audit("finish_write end");
1820 void BlueStore::BufferSpace::split(BufferCacheShard
* cache
, size_t pos
, BlueStore::BufferSpace
&r
)
1822 std::lock_guard
lk(cache
->lock
);
1823 if (buffer_map
.empty())
1826 auto p
= --buffer_map
.end();
1828 if (p
->second
->end() <= pos
)
1831 if (p
->second
->offset
< pos
) {
1832 ldout(cache
->cct
, 30) << __func__
<< " cut " << *p
->second
<< dendl
;
1833 size_t left
= pos
- p
->second
->offset
;
1834 size_t right
= p
->second
->length
- left
;
1835 if (p
->second
->data
.length()) {
1837 bl
.substr_of(p
->second
->data
, left
, right
);
1838 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
, 0, bl
),
1839 0, p
->second
.get());
1841 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
, 0, right
),
1842 0, p
->second
.get());
1844 cache
->_adjust_size(p
->second
.get(), -right
);
1845 p
->second
->truncate(left
);
1849 ceph_assert(p
->second
->end() > pos
);
1850 ldout(cache
->cct
, 30) << __func__
<< " move " << *p
->second
<< dendl
;
1851 if (p
->second
->data
.length()) {
1852 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
,
1853 p
->second
->offset
- pos
, p
->second
->data
),
1854 0, p
->second
.get());
1856 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
,
1857 p
->second
->offset
- pos
, p
->second
->length
),
1858 0, p
->second
.get());
1860 if (p
== buffer_map
.begin()) {
1861 _rm_buffer(cache
, p
);
1864 _rm_buffer(cache
, p
--);
1867 ceph_assert(writing
.empty());
1874 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
1876 BlueStore::OnodeRef
BlueStore::OnodeSpace::add(const ghobject_t
& oid
,
1879 std::lock_guard
l(cache
->lock
);
1880 auto p
= onode_map
.find(oid
);
1881 if (p
!= onode_map
.end()) {
1882 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " " << o
1883 << " raced, returning existing " << p
->second
1887 ldout(cache
->cct
, 20) << __func__
<< " " << oid
<< " " << o
<< dendl
;
1889 cache
->_add(o
.get(), 1);
1894 void BlueStore::OnodeSpace::_remove(const ghobject_t
& oid
)
1896 ldout(cache
->cct
, 20) << __func__
<< " " << oid
<< " " << dendl
;
1897 onode_map
.erase(oid
);
1900 BlueStore::OnodeRef
BlueStore::OnodeSpace::lookup(const ghobject_t
& oid
)
1902 ldout(cache
->cct
, 30) << __func__
<< dendl
;
1907 std::lock_guard
l(cache
->lock
);
1908 ceph::unordered_map
<ghobject_t
,OnodeRef
>::iterator p
= onode_map
.find(oid
);
1909 if (p
== onode_map
.end()) {
1910 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " miss" << dendl
;
1912 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " hit " << p
->second
1913 << " " << p
->second
->nref
1914 << " " << p
->second
->cached
1915 << " " << p
->second
->pinned
1917 // This will pin onode and implicitly touch the cache when Onode
1918 // eventually will become unpinned
1920 ceph_assert(!o
->cached
|| o
->pinned
);
1927 cache
->logger
->inc(l_bluestore_onode_hits
);
1929 cache
->logger
->inc(l_bluestore_onode_misses
);
1934 void BlueStore::OnodeSpace::clear()
1936 std::lock_guard
l(cache
->lock
);
1937 ldout(cache
->cct
, 10) << __func__
<< " " << onode_map
.size()<< dendl
;
1938 for (auto &p
: onode_map
) {
1939 cache
->_rm(p
.second
.get());
1944 bool BlueStore::OnodeSpace::empty()
1946 std::lock_guard
l(cache
->lock
);
1947 return onode_map
.empty();
1950 void BlueStore::OnodeSpace::rename(
1952 const ghobject_t
& old_oid
,
1953 const ghobject_t
& new_oid
,
1954 const mempool::bluestore_cache_meta::string
& new_okey
)
1956 std::lock_guard
l(cache
->lock
);
1957 ldout(cache
->cct
, 30) << __func__
<< " " << old_oid
<< " -> " << new_oid
1959 ceph::unordered_map
<ghobject_t
,OnodeRef
>::iterator po
, pn
;
1960 po
= onode_map
.find(old_oid
);
1961 pn
= onode_map
.find(new_oid
);
1962 ceph_assert(po
!= pn
);
1964 ceph_assert(po
!= onode_map
.end());
1965 if (pn
!= onode_map
.end()) {
1966 ldout(cache
->cct
, 30) << __func__
<< " removing target " << pn
->second
1968 cache
->_rm(pn
->second
.get());
1969 onode_map
.erase(pn
);
1971 OnodeRef o
= po
->second
;
1973 // install a non-existent onode at old location
1974 oldo
.reset(new Onode(o
->c
, old_oid
, o
->key
));
1976 cache
->_add(oldo
.get(), 1);
1977 // add at new position and fix oid, key.
1978 // This will pin 'o' and implicitly touch cache
1979 // when it will eventually become unpinned
1980 onode_map
.insert(make_pair(new_oid
, o
));
1981 ceph_assert(o
->pinned
);
1988 bool BlueStore::OnodeSpace::map_any(std::function
<bool(Onode
*)> f
)
1990 std::lock_guard
l(cache
->lock
);
1991 ldout(cache
->cct
, 20) << __func__
<< dendl
;
1992 for (auto& i
: onode_map
) {
1993 if (f(i
.second
.get())) {
2000 template <int LogLevelV
= 30>
2001 void BlueStore::OnodeSpace::dump(CephContext
*cct
)
2003 for (auto& i
: onode_map
) {
2004 ldout(cct
, LogLevelV
) << i
.first
<< " : " << i
.second
2005 << " " << i
.second
->nref
2006 << " " << i
.second
->cached
2007 << " " << i
.second
->pinned
2015 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
2017 #define dout_context coll->store->cct
2019 void BlueStore::SharedBlob::dump(Formatter
* f
) const
2021 f
->dump_bool("loaded", loaded
);
2023 persistent
->dump(f
);
2025 f
->dump_unsigned("sbid_unloaded", sbid_unloaded
);
2029 ostream
& operator<<(ostream
& out
, const BlueStore::SharedBlob
& sb
)
2031 out
<< "SharedBlob(" << &sb
;
2034 out
<< " loaded " << *sb
.persistent
;
2036 out
<< " sbid 0x" << std::hex
<< sb
.sbid_unloaded
<< std::dec
;
2041 BlueStore::SharedBlob::SharedBlob(uint64_t i
, Collection
*_coll
)
2042 : coll(_coll
), sbid_unloaded(i
)
2044 ceph_assert(sbid_unloaded
> 0);
2046 get_cache()->add_blob();
2050 BlueStore::SharedBlob::~SharedBlob()
2052 if (loaded
&& persistent
) {
2057 void BlueStore::SharedBlob::put()
2060 dout(20) << __func__
<< " " << this
2061 << " removing self from set " << get_parent()
2064 auto coll_snap
= coll
;
2066 std::lock_guard
l(coll_snap
->cache
->lock
);
2067 if (coll_snap
!= coll
) {
2070 if (!coll_snap
->shared_blob_set
.remove(this, true)) {
2074 bc
._clear(coll_snap
->cache
);
2075 coll_snap
->cache
->rm_blob();
2081 void BlueStore::SharedBlob::get_ref(uint64_t offset
, uint32_t length
)
2083 ceph_assert(persistent
);
2084 persistent
->ref_map
.get(offset
, length
);
2087 void BlueStore::SharedBlob::put_ref(uint64_t offset
, uint32_t length
,
2091 ceph_assert(persistent
);
2092 persistent
->ref_map
.put(offset
, length
, r
,
2093 unshare
&& !*unshare
? unshare
: nullptr);
2096 void BlueStore::SharedBlob::finish_write(uint64_t seq
)
2099 BufferCacheShard
*cache
= coll
->cache
;
2100 std::lock_guard
l(cache
->lock
);
2101 if (coll
->cache
!= cache
) {
2102 dout(20) << __func__
2103 << " raced with sb cache update, was " << cache
2104 << ", now " << coll
->cache
<< ", retrying"
2108 bc
._finish_write(cache
, seq
);
2116 #define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
2118 template <int LogLevelV
= 30>
2119 void BlueStore::SharedBlobSet::dump(CephContext
*cct
)
2121 std::lock_guard
l(lock
);
2122 for (auto& i
: sb_map
) {
2123 ldout(cct
, LogLevelV
) << i
.first
<< " : " << *i
.second
<< dendl
;
2130 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
2132 void BlueStore::Blob::dump(Formatter
* f
) const
2134 if (is_spanning()) {
2135 f
->dump_unsigned("spanning_id ", id
);
2139 f
->dump_object("shared", *shared_blob
);
2143 ostream
& operator<<(ostream
& out
, const BlueStore::Blob
& b
)
2145 out
<< "Blob(" << &b
;
2146 if (b
.is_spanning()) {
2147 out
<< " spanning " << b
.id
;
2149 out
<< " " << b
.get_blob() << " " << b
.get_blob_use_tracker();
2150 if (b
.shared_blob
) {
2151 out
<< " " << *b
.shared_blob
;
2153 out
<< " (shared_blob=NULL)";
2159 void BlueStore::Blob::discard_unallocated(Collection
*coll
)
2161 if (get_blob().is_shared()) {
2164 if (get_blob().is_compressed()) {
2165 bool discard
= false;
2166 bool all_invalid
= true;
2167 for (auto e
: get_blob().get_extents()) {
2168 if (!e
.is_valid()) {
2171 all_invalid
= false;
2174 ceph_assert(discard
== all_invalid
); // in case of compressed blob all
2175 // or none pextents are invalid.
2177 shared_blob
->bc
.discard(shared_blob
->get_cache(), 0,
2178 get_blob().get_logical_length());
2182 for (auto e
: get_blob().get_extents()) {
2183 if (!e
.is_valid()) {
2184 dout(20) << __func__
<< " 0x" << std::hex
<< pos
2186 << std::dec
<< dendl
;
2187 shared_blob
->bc
.discard(shared_blob
->get_cache(), pos
, e
.length
);
2191 if (get_blob().can_prune_tail()) {
2192 dirty_blob().prune_tail();
2193 used_in_blob
.prune_tail(get_blob().get_ondisk_length());
2194 dout(20) << __func__
<< " pruned tail, now " << get_blob() << dendl
;
2199 void BlueStore::Blob::get_ref(
2204 // Caller has to initialize Blob's logical length prior to increment
2205 // references. Otherwise one is neither unable to determine required
2206 // amount of counters in case of per-au tracking nor obtain min_release_size
2207 // for single counter mode.
2208 ceph_assert(get_blob().get_logical_length() != 0);
2209 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2210 << std::dec
<< " " << *this << dendl
;
2212 if (used_in_blob
.is_empty()) {
2213 uint32_t min_release_size
=
2214 get_blob().get_release_size(coll
->store
->min_alloc_size
);
2215 uint64_t l
= get_blob().get_logical_length();
2216 dout(20) << __func__
<< " init 0x" << std::hex
<< l
<< ", "
2217 << min_release_size
<< std::dec
<< dendl
;
2218 used_in_blob
.init(l
, min_release_size
);
2225 bool BlueStore::Blob::put_ref(
2231 PExtentVector logical
;
2233 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2234 << std::dec
<< " " << *this << dendl
;
2236 bool empty
= used_in_blob
.put(
2241 // nothing to release
2242 if (!empty
&& logical
.empty()) {
2246 bluestore_blob_t
& b
= dirty_blob();
2247 return b
.release_extents(empty
, logical
, r
);
2250 bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size
,
2251 uint32_t target_blob_size
,
2253 uint32_t *length0
) {
2254 ceph_assert(min_alloc_size
);
2255 ceph_assert(target_blob_size
);
2256 if (!get_blob().is_mutable()) {
2260 uint32_t length
= *length0
;
2261 uint32_t end
= b_offset
+ length
;
2263 // Currently for the sake of simplicity we omit blob reuse if data is
2264 // unaligned with csum chunk. Later we can perform padding if needed.
2265 if (get_blob().has_csum() &&
2266 ((b_offset
% get_blob().get_csum_chunk_size()) != 0 ||
2267 (end
% get_blob().get_csum_chunk_size()) != 0)) {
2271 auto blen
= get_blob().get_logical_length();
2272 uint32_t new_blen
= blen
;
2274 // make sure target_blob_size isn't less than current blob len
2275 target_blob_size
= std::max(blen
, target_blob_size
);
2277 if (b_offset
>= blen
) {
2278 // new data totally stands out of the existing blob
2281 // new data overlaps with the existing blob
2282 new_blen
= std::max(blen
, end
);
2284 uint32_t overlap
= 0;
2285 if (new_blen
> blen
) {
2286 overlap
= blen
- b_offset
;
2291 if (!get_blob().is_unallocated(b_offset
, overlap
)) {
2292 // abort if any piece of the overlap has already been allocated
2297 if (new_blen
> blen
) {
2298 int64_t overflow
= int64_t(new_blen
) - target_blob_size
;
2299 // Unable to decrease the provided length to fit into max_blob_size
2300 if (overflow
>= length
) {
2304 // FIXME: in some cases we could reduce unused resolution
2305 if (get_blob().has_unused()) {
2310 new_blen
-= overflow
;
2315 if (new_blen
> blen
) {
2316 dirty_blob().add_tail(new_blen
);
2317 used_in_blob
.add_tail(new_blen
,
2318 get_blob().get_release_size(min_alloc_size
));
2324 void BlueStore::Blob::split(Collection
*coll
, uint32_t blob_offset
, Blob
*r
)
2326 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
2327 << " start " << *this << dendl
;
2328 ceph_assert(blob
.can_split());
2329 ceph_assert(used_in_blob
.can_split());
2330 bluestore_blob_t
&lb
= dirty_blob();
2331 bluestore_blob_t
&rb
= r
->dirty_blob();
2335 &(r
->used_in_blob
));
2337 lb
.split(blob_offset
, rb
);
2338 shared_blob
->bc
.split(shared_blob
->get_cache(), blob_offset
, r
->shared_blob
->bc
);
2340 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
2341 << " finish " << *this << dendl
;
2342 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
2343 << " and " << *r
<< dendl
;
2346 #ifndef CACHE_BLOB_BL
2347 void BlueStore::Blob::decode(
2349 bufferptr::const_iterator
& p
,
2352 bool include_ref_map
)
2354 denc(blob
, p
, struct_v
);
2355 if (blob
.is_shared()) {
2358 if (include_ref_map
) {
2360 used_in_blob
.decode(p
);
2362 used_in_blob
.clear();
2363 bluestore_extent_ref_map_t legacy_ref_map
;
2364 legacy_ref_map
.decode(p
);
2365 for (auto r
: legacy_ref_map
.ref_map
) {
2369 r
.second
.refs
* r
.second
.length
);
void BlueStore::Extent::dump(Formatter* f) const
{
  f->dump_unsigned("logical_offset", logical_offset);
  f->dump_unsigned("length", length);
  f->dump_unsigned("blob_offset", blob_offset);
  f->dump_object("blob", *blob);
}

ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
             << ": 0x" << e.blob_offset << "~" << e.length << std::dec
             << " " << *e.blob;
}

// OldExtent

BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = !b->is_referenced();
  return oe;
}
// ExtentMap

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
#undef dout_context
#define dout_context onode->c->store->cct

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}

void BlueStore::ExtentMap::dump(Formatter* f) const
{
  f->open_array_section("extents");

  for (auto& e : extent_map) {
    f->dump_object("extent", e);
  }
  f->close_section();
}
void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
  CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
  uint64_t& length, uint64_t& dstoff) {

  auto cct = onode->c->store->cct;
  bool inject_21040 =
    cct->_conf->bluestore_debug_inject_bug21040;
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto& e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }

  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << "  src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(b->_assign_blobid(txc), e.blob);
        if (!inject_21040 && !src_dirty) {
          src_dirty = true;
          dirty_range_begin = e.logical_offset;
        } else if (inject_21040 &&
                   dirty_range_begin == 0 && dirty_range_end == 0) {
          dirty_range_begin = e.logical_offset;
        }
        ceph_assert(e.logical_end() > 0);
        // -1 to exclude next potential shard
        dirty_range_end = e.logical_end() - 1;
      } else {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << "    new " << *cb << dendl;
    }

    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }

    Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
      e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << "  dst " << *ne << dendl;
    ++n;
  }
  if ((!inject_21040 && src_dirty) ||
      (inject_21040 && dirty_range_end > dirty_range_begin)) {
    oldo->extent_map.dirty_range(dirty_range_begin,
      dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
}
void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "")
           << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
      ceph_assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << "  inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << "  encode_some needs reshard" << dendl;
            ceph_assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << "  shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf()->bluestore_extent_map_shard_min_size) {
            ceph_assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}
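// Sketch of the persistence decision made above: an unsharded extent map
// is encoded into inline_bl and rides along inside the onode key; once
// the inline encoding would exceed bluestore_extent_map_shard_max_size a
// reshard is requested instead of persisting. Sharded maps write one
// PREFIX_OBJ key per dirty shard through the key-generation helper, e.g.
// (hypothetical offset shown only for illustration):
//   generate_extent_shard_key_and_apply(onode->key, 0x10000, &key,
//     [&](const string& final_key) { t->set(PREFIX_OBJ, final_key, bl); });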
bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (bid >= 0)
    return bid;
  // Find next unused bid;
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}
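// Note on the allocation strategy above: the common case is a simple
// max+1; only if that overflows the signed bid space does the code fall
// back to a linear probe from a random starting point, wrapping at the
// type maximum back to 0, and aborting only after a full cycle proves
// every id is taken (practically unreachable).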
void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << "   spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << "   shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // we may need to fault in a larger interval later must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
      );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / std::max(1u, extents);
  dout(20) << __func__ << "  extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << "  new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << "  new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << "  new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << "  new " << new_shard_info << dendl;
  dout(20) << __func__ << "  old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    ceph_assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << "  fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);

    bool was_too_many_blobs_check = false;
    auto too_many_blobs_threshold =
      g_conf()->bluestore_debug_too_many_blobs_threshold;
    auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;

    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        ceph_assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << "  shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }

      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning. We prefer to cut the blob if we can. Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << "    splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << "    adding spanning " << *b << dendl;
            if (!was_too_many_blobs_check &&
                too_many_blobs_threshold &&
                spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {

              was_too_many_blobs_check = true;
              for (size_t i = 0; i < dumped_onodes.size(); ++i) {
                if (dumped_onodes[i].first == onode->oid) {
                  oid_slot = &dumped_onodes[i];
                  break;
                }
                if (!oldest_slot || (oldest_slot &&
                    dumped_onodes[i].second < oldest_slot->second)) {
                  oldest_slot = &dumped_onodes[i];
                }
              }
            }
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << "    un-spanning " << *e->blob << dendl;
        }
      }
    }
    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
      (oid_slot &&
        (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
    if (do_dump) {
      dout(0) << __func__
              << " spanning blob count exceeds threshold, "
              << spanning_blob_map.size() << " spanning blobs"
              << dendl;
      _dump_onode<0>(cct, *onode);
      if (oid_slot) {
        oid_slot->second = mono_clock::now();
      } else {
        ceph_assert(oldest_slot);
        oldest_slot->first = onode->oid;
        oldest_slot->second = mono_clock::now();
      }
    }
  }

  clear_needs_reshard();
}
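// Worked example of the shard sizing heuristic above, with illustrative
// numbers only: if target=500, slop=0.2*target=100 and extent_avg=100,
// a new shard boundary is opened once estimate+100 exceeds 500 (or 600
// when the candidate boundary would land inside a blob, i.e. would_span
// is true), so each shard holds roughly five average extents unless
// that would cut a blob.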
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    ceph_assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1;  // so it is always non-zero
        include_blob = true;
        blobid = 0;  // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}
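// Layout refresher for the varint-encoded blobid above: the low bits
// are flags and the rest is the (spanning or per-encode) blob id, i.e.
//   blobid = (id << BLOBID_SHIFT_BITS) | flags
// For example (illustrative values), a spanning blob with id 3 whose
// extent continues at the current pos, has blob_offset 0 and repeats the
// previous length encodes as
//   (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING |
//   BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET |
//   BLOBID_FLAG_SAMELENGTH
// and all three offset/length varints are omitted entirely.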
unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  ceph_assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  ceph_assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << "  getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        ceph_assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  ceph_assert(n == num);
  return num;
}
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::const_iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  ceph_assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}
void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  ceph_assert(last >= start);
  string key;
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            ceph_assert(r >= 0);
          }
        }
      );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset
               << " for range 0x" << offset << "~" << length << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      ceph_assert(p->dirty == false);
      ceph_assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}
void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  if (length == 0) {
    length = 1;
  }
  auto last = seek_shard(offset + length - 1);
  if (start < 0)
    return;

  ceph_assert(last >= start);
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      derr << __func__ << "on write 0x" << std::hex << offset
           << "~" << length << " shard 0x" << p->shard_info->offset
           << std::dec << " is not loaded, can't mark dirty" << dendl;
      ceph_abort_msg("can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}
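// Semantics note: seek_lextent() returns the first lextent whose end is
// past 'offset', so a hit may begin before 'offset'; has_any_lextents()
// therefore only needs to check that the hit starts before
// offset+length. E.g. with a single extent at 0x1000~0x1000 (illustrative
// values), seek_lextent(0x1800) returns it, while seek_lextent(0x2000)
// seeks past it.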
int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  ceph_assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      ceph_assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}
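// The merge above requires all four conditions simultaneously: logical
// contiguity, the same blob, blob-offset contiguity, and that the merge
// stays on one side of the next shard boundary (merging across shards
// would silently move extents between shard keys).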
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        ceph_assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}
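// The loop above handles four overlap cases between the hole
// [offset, end) and each lextent it touches:
//   1. extent spans the whole hole: split, deref the middle, keep
//      head and tail;
//   2. extent straddles only the start: deref the tail, shrink to 'keep';
//   3. extent fully inside the hole: deref the whole lextent and drop it;
//   4. extent straddles only the end: deref the head, re-add the tail.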
BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need to have completely initialized Blob to increment its ref counters.
  ceph_assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent from putting reused blob into
  // old_extents list if we overwrite the blob totally
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << "  split " << *ep << dendl;
      dout(30) << __func__ << "     to " << *ne << dendl;
    } else {
      // switch blob
      ceph_assert(ep->blob_offset >= blob_offset);

      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << "  adjusted " << *ep << dendl;
    }
  }
  return rb;
}
// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

//
// A tricky thing about Onode's ref counter is that we do an additional
// increment when newly pinned instance is detected. And -1 on unpin.
// This prevents from a conflict with a delete call (when nref == 0).
// The latter might happen while the thread is in unpin() function
// (and e.g. waiting for lock acquisition) since nref is already
// decremented. And another 'putting' thread on the instance will release it.
//
void BlueStore::Onode::get() {
  if (++nref >= 2 && !pinned) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool was_pinned = pinned;
    pinned = nref >= 2;
    // additional increment for newly pinned instance
    bool r = !was_pinned && pinned;
    if (r) {
      ++nref;
    }
    if (cached && r) {
      ocs->_pin(this);
    }
    ocs->lock.unlock();
  }
}

void BlueStore::Onode::put() {
  int n = --nref;
  if (n == 2) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    ocs->lock.lock();
    // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
    while (ocs != c->get_onode_cache()) {
      ocs->lock.unlock();
      ocs = c->get_onode_cache();
      ocs->lock.lock();
    }
    bool need_unpin = pinned;
    pinned = pinned && nref > 2; // intentionally use > not >= as we have
                                 // +1 due to pinned state
    need_unpin = need_unpin && !pinned;
    if (cached && need_unpin) {
      if (exists) {
        ocs->_unpin(this);
      } else {
        ocs->_unpin_and_rm(this);
        // remove will also decrement nref and delete Onode
        c->onode_map._remove(oid);
      }
    }
    // additional decrement for newly unpinned instance
    // should be the last action since Onode can be released
    // at any point after this decrement
    if (need_unpin) {
      n = --nref;
    }
    ocs->lock.unlock();
  }
  if (n == 0) {
    delete this;
  }
}
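// Illustrative lifecycle of the pin bias described above (hypothetical
// sequence): a cache lookup takes nref 1->2, which pins and bumps nref
// to 3; a later put() drops it to 2, triggers the unpin path, and the
// compensating decrement takes it down to 1 (the cache's own reference).
// The bias guarantees that a put() racing with an in-flight unpin cannot
// observe nref hit 0 while the unpinning thread still holds the raw
// pointer.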
BlueStore::Onode* BlueStore::Onode::decode(
  CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& v)
{
  Onode* on = new Onode(c.get(), oid, key);
  on->exists = true;
  auto p = v.front().begin_deep();
  on->onode.decode(p);
  for (auto& i : on->onode.attrs) {
    i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }

  // initialize extent_map
  on->extent_map.decode_spanning_blobs(p);
  if (on->onode.extent_map_shards.empty()) {
    denc(on->extent_map.inline_bl, p);
    on->extent_map.decode_some(on->extent_map.inline_bl);
    on->extent_map.inline_bl.reassign_to_mempool(
      mempool::mempool_bluestore_cache_data);
  } else {
    on->extent_map.init_shards(false, false);
  }
  return on;
}
void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    waiting_count++;
    std::unique_lock l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
    waiting_count--;
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}

void BlueStore::Onode::dump(Formatter* f) const
{
  onode.dump(f);
  extent_map.dump(f);
}
const string& BlueStore::Onode::get_omap_prefix()
{
  if (onode.is_pgmeta_omap()) {
    return PREFIX_PGMETA_OMAP;
  }
  if (onode.is_perpool_omap()) {
    return PREFIX_PERPOOL_OMAP;
  }
  return PREFIX_OMAP;
}

// '-' < '.' < '~'

void BlueStore::Onode::get_omap_header(string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('-');
}

void BlueStore::Onode::get_omap_key(const string& key, string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('.');
  out->append(key);
}

void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

void BlueStore::Onode::get_omap_tail(string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('~');
}

void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    *user_key = key.substr(sizeof(uint64_t)*2 + 1);
  } else {
    *user_key = key.substr(sizeof(uint64_t) + 1);
  }
}
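// Key layout produced by the helpers above (per-pool omap):
//   <pool (u64)> <nid (u64)> <sep> [user key]
// where the separator byte orders a whole object's omap rows:
//   '-' (header) < '.' (user keys) < '~' (tail).
// Iterating [get_omap_key(""), get_omap_tail()) therefore yields exactly
// one object's keys; legacy non-per-pool keys simply omit the pool
// prefix, which is why decode_omap_key() strips one or two u64s.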
// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  ceph_assert((loffs % min_alloc_size) == 0);
  ceph_assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
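// The check above is a standard half-open interval overlap test after
// both writes are expanded to min_alloc_size granularity:
// [loffs, loffs_end) and [loffs2, loffs2_end) intersect iff one interval
// starts before the other ends, which is exactly what the two disjuncts
// encode.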
// =======================================================
// DeferredBatch

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
#undef dout_context
#define dout_context cct

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  ceph_assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}
void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << "  keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      ceph_assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << "  keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      ceph_assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    ceph_assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << "  truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << "  drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    ceph_assert(i->second >= 0);
    p = iomap.erase(p);
  }
}
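// Example of the trimming above (illustrative offsets only): given a
// queued io at 0x0~0x3000 and _discard(0x1000, 0x1000), the first branch
// keeps head 0x0~0x1000 in place, re-inserts tail 0x2000~0x1000 under
// the new key 0x2000 with the same seq, and seq_bytes for that seq
// shrinks by 0x1000, which _audit() re-verifies under DEBUG_DEFERRED.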
void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0;  // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    ceph_assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  ceph_assert(sb == seq_bytes);
}
// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}
void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}
void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {

    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}
void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}
uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
        !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  auto *ocache = get_onode_cache();
  auto *ocache_dest = dest->get_onode_cache();

  // lock cache shards
  std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
  std::lock_guard l(ocache->lock, std::adopt_lock);
  std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
  std::lock_guard l3(cache->lock, std::adopt_lock);
  std::lock_guard l4(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensuring that nref is always >= 2 and hence onode is pinned and
      // physically out of cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      if (get_onode_cache() != dest->get_onode_cache()) {
        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      }
      o->c = dest;

      // move over shared blobs and buffers. cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << " moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}

// =======================================================
// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  binned_kv_cache = store->db->get_priority_cache();
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true);
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;

    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval/3;
    }

    // Now Resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do final dump
  store->_record_allocation_stats();
  stop = false;
  pcm = nullptr;
  return NULL;
}
void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}
void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}
void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}
// =====================================
// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    }
  );
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}
// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
#undef dout_context
#define dout_context cct

static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(alloc);
  alloc->release(to_release);
}
BlueStore::BlueStore(CephContext *cct, const string& path)
  : BlueStore(cct, path, 0) {}

BlueStore::BlueStore(CephContext *cct,
  const string& path,
  uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);

  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}

const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}

void BlueStore::handle_conf_change(const ConfigProxy& conf,
				   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap")) {
    _check_no_per_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}

void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str()
	   << " compressor" << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
	   << " alg " << (compressor ? compressor->get_type_name() : "(none)")
	   << " min_blob " << comp_min_blob_size
	   << " max_blob " << comp_max_blob_size
	   << dendl;
}

void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
	   << Checksummer::get_csum_type_string(csum_type)
	   << dendl;
}

void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
	   << dendl;
}

void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
	   << std::dec << dendl;
}

void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");

  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}

int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
	 << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
	 << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
	 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
	 << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
	 << dendl;
    return -EINVAL;
  }

  cache_data_ratio =
    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
	  << " kv " << cache_kv_ratio
	  << " data " << cache_data_ratio
	  << dendl;
  return 0;
}

int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}
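
// Round-trip sketch for the two helpers above (illustrative only): each meta
// key is mirrored into the bdev label when the label is readable, and always
// written through to the regular ObjectStore meta file as a fallback:
//
//   store.write_meta("mkfs_done", "yes");  // label.meta["mkfs_done"] + file
//   std::string v;
//   store.read_meta("mkfs_done", &v);      // served from the label if intact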

void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
			l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
		 "Average kv_thread flush latency",
		 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
		 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
		 "Average kv_sync thread latency",
		 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
		 "Average kv_finalize thread latency",
		 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
		 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
		 "Average aio_wait state latency",
		 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
		 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
		 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
		 "Average kv_commiting state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
		 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
		 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
		 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
		 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
		 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
		 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
		 "Average submit throttle latency",
		 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
		 "Average submit latency",
		 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
		 "Average commit latency",
		 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
		 "Average read latency",
		 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
		 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
		 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
		 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
		 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
		 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
		    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
		    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
		    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
		    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
		    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
		    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
	    "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
	    "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
	    "Sum for stored compressed bytes",
	    "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
	    "Sum for bytes allocated for compressed data",
	    "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
	    "Sum for original bytes that were compressed",
	    "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
	    "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
	    "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
		    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
		    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
		    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
		    "bluestore_onode_shard_misses",
		    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
	    "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
	    "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
	    "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
	    "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
		    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
		    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
		    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
		    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
		    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
		    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
		    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
		    "bluestore_write_small_unused",
		    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_deferred,
		    "bluestore_write_small_deferred",
		    "Small overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
		    "bluestore_write_small_pre_read",
		    "Small writes that required we read some data (possibly "
		    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
		    "Small write into new (sparse) blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
		    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
		    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
		    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
		    "Sum for extents that have been merged due to garbage "
		    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
		    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
		    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
	    "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
		 "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
		 "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
		 "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
		 "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
		 "Average omap get_keys call latency");
  b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
		 "Average omap get_values call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
		 "Average collection listing latency");
  b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
		 "Average removal latency");

  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
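
// Sketch of how these counters are fed elsewhere in this file (illustrative,
// not new functionality): time-average counters take a measured interval via
// tinc(), plain counters are bumped with inc(), and gauges use set():
//
//   auto t0 = mono_clock::now();
//   // ... do the work being measured ...
//   logger->tinc(l_bluestore_commit_lat, mono_clock::now() - t0);
//   logger->inc(l_bluestore_txc);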

int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}

int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
				     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
	 << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}

int BlueStore::_write_bdev_label(CephContext *cct,
				 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
	 << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
	 << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
	 << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}

int BlueStore::_read_bdev_label(CephContext* cct, string path,
				bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
	 << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
	 << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
	    << ": " << e.what() << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
	 << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
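
// Note on the crc handling above: _write_bdev_label() encodes the label,
// appends crc32c over that encoding, and pads to BDEV_LABEL_BLOCK_SIZE; the
// reader therefore recomputes the checksum over exactly the bytes consumed
// by decode() (substr_of(bl, 0, p.get_off())) before comparing it with the
// stored crc that follows.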

int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	       << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	   << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}

void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
	   << std::dec << " order " << (int)min_alloc_size_order
	   << " max_alloc_size 0x" << std::hex << max_alloc_size
	   << " prefer_deferred_size 0x" << prefer_deferred_size
	   << std::dec
	   << " deferred_batch_ops " << deferred_batch_ops
	   << dendl;
}
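
// min_alloc_size_order above is ctz(min_alloc_size): for a power-of-two
// size, count-trailing-zeros equals log2, so offset >> min_alloc_size_order
// maps a byte offset to an allocation-unit index.  E.g. with a 4 KiB unit,
// ctz(0x1000) == 12 and 0x3000 >> 12 == 3.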

int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this),
			     discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
  _set_max_defer_interval();
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0)
    goto fail_close;
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}

void BlueStore::_validate_bdev()
{
  ceph_assert(bdev);
  ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
  uint64_t dev_size = bdev->get_size();
  if (dev_size <
      _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
    dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
            << " is too small, disable bluestore_bluefs_min for now"
            << dendl;
    ceph_assert(dev_size >= _get_ondisk_reserved());

    int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
    ceph_assert(r == 0);
  }
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}

int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
  int r;
  bluestore_bdev_label_t label;

  ceph_assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (t) {
    // create mode. initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
    fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);

    // allocate superblock reserved space. note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs_extents.
    auto reserved = _get_ondisk_reserved();
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_bluefs) {
      ceph_assert(bluefs_extents.num_intervals() == 1);
      interval_set<uint64_t>::iterator p = bluefs_extents.begin();
      reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
      dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
	       << " for bluefs" << dendl;
    }

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
	      << cct->_conf->bluestore_debug_prefill << " with max free extent "
	      << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
	uint64_t l = (rand() % max_b + 1) * min_alloc_size;
	if (start + l > end) {
	  l = end - start;
	  l = p2align(l, min_alloc_size);
	}
	ceph_assert(start + l <= end);

	uint64_t u = 1 + (uint64_t)(r * (double)l);
	u = p2roundup(u, min_alloc_size);
	if (start + l + u > end) {
	  u = end - (start + l);
	  // trim to align so we don't overflow again
	  u = p2align(u, min_alloc_size);
	  stop = true;
	}
	ceph_assert(start + l + u <= end);

	dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
		 << " use 0x" << u << std::dec << dendl;

	if (u == 0) {
	  // break if u has been trimmed to nothing
	  break;
	}

	fm->allocate(start + l, u, t);
	start += l + u;
      }
    }
    r = _write_out_fm_meta(0, false, &label);
    ceph_assert(r == 0);
  } else {
    string p = path + "/block";
    r = _read_bdev_label(cct, p, &label);
    if (r < 0) {
      derr << __func__ << " freelist init failed, error reading bdev label: "
	   << cpp_strerror(r) << dendl;
      delete fm;
      fm = NULL;
      return r;
    }
  }
  r = fm->init(label, db, read_only);
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  // If the space size tracked by the freelist manager is higher than the
  // actual device size, one can hit an out-of-space allocation which will
  // result in data loss and/or assertions.
  // Probably the user altered the device size somehow.
  // The only fix for now is to redeploy the OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << " fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}
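
// The debug prefill loop above alternates a free run of length l with a used
// run u ~= l * p/(1-p) (p = bluestore_debug_prefill), so the used fraction of
// the device converges to p.  For example, p = 0.2 gives u ~= l/4: roughly
// one allocated unit for every four free units.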

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}

int BlueStore::_write_out_fm_meta(uint64_t target_size,
				  bool update_root_size,
				  bluestore_bdev_label_t* res_label)
{
  string p = path + "/block";

  std::vector<std::pair<string, string>> fm_meta;
  fm->get_meta(target_size, &fm_meta);

  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0)
    return r;

  for (auto& m : fm_meta) {
    label.meta[m.first] = m.second;
  }
  if (update_root_size) {
    label.size = target_size;
  }
  r = _write_bdev_label(cct, p, label);
  if (res_label) {
    *res_label = label;
  }
  return r;
}

int BlueStore::_open_alloc()
{
  ceph_assert(alloc == NULL);
  ceph_assert(bdev->get_size());

  if (bluefs) {
    bluefs_extents.clear();
    auto r = bluefs->get_block_extents(bluefs_layout.shared_bdev,
				       &bluefs_extents);
    if (r < 0) {
      lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
		 << cpp_strerror(r) << dendl;
      return r;
    }
    dout(10) << __func__ << " bluefs extents 0x"
	     << std::hex << bluefs_extents << std::dec
	     << dendl;
  }

  alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
			    bdev->get_size(),
			    min_alloc_size, "block");
  if (!alloc) {
    lderr(cct) << __func__ << " Allocator::unknown alloc type "
	       << cct->_conf->bluestore_allocator
	       << dendl;
    return -EINVAL;
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(db, &offset, &length)) {
    alloc->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();

  // also mark bluefs space as allocated
  for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
    alloc->init_rm_free(e.get_start(), e.get_len());
  }

  dout(1) << __func__ << " loaded " << byte_u_t(bytes)
	  << " in " << num << " extents"
	  << " available " << byte_u_t(alloc->get_free())
	  << dendl;
  return 0;
}
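
// Allocator state is never persisted on its own: each open rebuilds it by
// replaying the FreelistManager's free extents (init_add_free) and then
// removing the regions currently owned by BlueFS (init_rm_free), so the
// enumerate loop above is the authoritative source of free space.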

void BlueStore::_close_alloc()
{
  ceph_assert(bdev);
  bdev->discard_drain();

  ceph_assert(alloc);
  alloc->shutdown();
  delete alloc;
  alloc = NULL;
  bluefs_extents.clear();
}

int BlueStore::_open_fsid(bool create)
{
  ceph_assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}

int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}

int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}

int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
	 << " (is another ceph-osd still running?)"
	 << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
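
// This advisory lock is the whole "another ceph-osd running?" check: with
// l_start = l_len = 0 (the memset above) the F_SETLK request covers the
// entire fsid file and fails immediately (EAGAIN/EACCES) instead of blocking
// when another process holds it.  The lock is released automatically when
// fsid_fd is closed.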

bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
            << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}

bool BlueStore::_use_rotational_settings()
{
  if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
    return true;
  }
  if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
    return false;
  }
  return bdev->is_rotational();
}

bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist).  only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}

int BlueStore::_minimal_open_bluefs(bool create)
{
  int r;
  bluefs = new BlueFS(cct);

  string bfn;
  struct stat st;

  bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(
      BlueFS::BDEV_DB, bfn,
      create && cct->_conf->bdev_enable_discard);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
      r = _check_or_set_bdev_label(
	bfn,
	bluefs->get_block_device_size(BlueFS::BDEV_DB),
	"bluefs db", create);
      if (r < 0) {
	derr << __func__
	     << " check block device(" << bfn << ") label returned: "
	     << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }
    if (create) {
      bluefs->add_block_extent(
	BlueFS::BDEV_DB,
	SUPER_RESERVED,
	bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  } else {
    r = -errno;
    if (::lstat(bfn.c_str(), &st) == -1) {
      r = 0;
      bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    } else {
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }

  // shared device
  bfn = path + "/block";
  // never trim here
  r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
			       true /* shared with bluestore */);
  if (r < 0) {
    derr << __func__ << " add block device(" << bfn << ") returned: "
	 << cpp_strerror(r) << dendl;
    goto free_bluefs;
  }
  if (create) {
    // note: we always leave the first SUPER_RESERVED (8k) of the device unused
    uint64_t initial =
      bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
			  cct->_conf->bluestore_bluefs_gift_ratio);
    initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
    uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
    if (alloc_size % min_alloc_size) {
      derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
	   << alloc_size << " is not a multiple of "
	   << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
      r = -EINVAL;
      goto free_bluefs;
    }
    // align to bluefs's alloc_size
    initial = p2roundup(initial, alloc_size);
    // put bluefs in the middle of the device in case it is an HDD
    uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
    // avoiding superblock overwrite
    start = std::max(alloc_size, start);
    ceph_assert(start >= _get_ondisk_reserved());

    bluefs->add_block_extent(bluefs_layout.shared_bdev, start, initial);
    bluefs_extents.insert(start, initial);
  }

  bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
				 create && cct->_conf->bdev_enable_discard);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
      r = _check_or_set_bdev_label(
	bfn,
	bluefs->get_block_device_size(BlueFS::BDEV_WAL),
	"bluefs wal", create);
      if (r < 0) {
	derr << __func__ << " check block device(" << bfn
	     << ") label returned: " << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }

    if (create) {
      bluefs->add_block_extent(
	BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
	bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
	BDEV_LABEL_BLOCK_SIZE);
    }
    bluefs_layout.dedicated_wal = true;
  } else {
    r = 0;
    if (::lstat(bfn.c_str(), &st) != -1) {
      r = -errno;
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }
  return 0;

free_bluefs:
  ceph_assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}

int BlueStore::_open_bluefs(bool create)
{
  int r = _minimal_open_bluefs(create);
  if (r < 0) {
    return r;
  }
  RocksDBBlueFSVolumeSelector* vselector = nullptr;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {

    string options = cct->_conf->bluestore_rocksdb_options;
    string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
	  *options.rbegin() != ',') {
	options += ',';
      }
      options += options_annex;
    }

    rocksdb::Options rocks_opts;
    int r = RocksDBStore::ParseOptionsFromStringStatic(
      cct,
      options,
      rocks_opts,
      nullptr);
    if (r < 0) {
      return r;
    }

    double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
    vselector =
      new RocksDBBlueFSVolumeSelector(
	bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
	bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
	bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
	1024 * 1024 * 1024, //FIXME: set expected l0 size here
	rocks_opts.max_bytes_for_level_base,
	rocks_opts.max_bytes_for_level_multiplier,
	reserved_factor,
	cct->_conf->bluestore_volume_selection_reserved,
	cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
  }
  if (create) {
    bluefs->mkfs(fsid, bluefs_layout);
  }
  bluefs->set_volume_selector(vselector);
  r = bluefs->mount();
  if (r < 0) {
    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
  }
  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
  return r;
}

void BlueStore::_close_bluefs(bool cold_close)
{
  bluefs->umount(cold_close);
  _minimal_close_bluefs();
}

void BlueStore::_minimal_close_bluefs()
{
  delete bluefs;
  bluefs = NULL;
}

int BlueStore::_is_bluefs(bool create, bool* ret)
{
  if (create) {
    *ret = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    int r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      *ret = true;
    } else if (s == "0") {
      *ret = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
	   << dendl;
      return -EIO;
    }
  }
  return 0;
}

/*
 * opens both DB and dependent super_meta, FreelistManager and allocator
 * in the proper order
 */
int BlueStore::_open_db_and_around(bool read_only)
{
  int r;
  bool do_bluefs = false;
  _is_bluefs(false, &do_bluefs); // ignore err code
  if (do_bluefs) {
    // open in read-only first to read FM list and init allocator
    // as they might be needed for some BlueFS procedures
    r = _open_db(false, false, true);
    if (r < 0)
      return r;

    r = _open_super_meta();
    if (r < 0)
      goto out_db;

    r = _open_fm(nullptr, true);
    if (r < 0)
      goto out_db;

    r = _open_alloc();
    if (r < 0)
      goto out_fm;

    // now open in R/W mode
    if (!read_only) {
      _close_db(true);

      r = _open_db(false, false, false);
      if (r < 0) {
	_close_alloc();
	_close_fm();
	return r;
      }
    }
  } else {
    r = _open_db(false, false);
    if (r < 0)
      return r;

    r = _open_super_meta();
    if (r < 0)
      goto out_db;

    r = _open_fm(nullptr, false);
    if (r < 0)
      goto out_db;

    r = _open_alloc();
    if (r < 0)
      goto out_fm;
  }
  return 0;

out_fm:
  _close_fm();
out_db:
  _close_db(read_only);
  return r;
}

void BlueStore::_close_db_and_around(bool read_only)
{
  if (bluefs) {
    if (!read_only && out_of_sync_fm.fetch_and(0)) {
      _sync_bluefs_and_fm();
    }
    _close_db(read_only);
    while (!read_only && out_of_sync_fm.fetch_and(0)) {
      // if seen some allocations during close - repeat open_db, sync fm, close
      dout(0) << __func__ << " syncing FreelistManager" << dendl;
      int r = _open_db(false, false, false);
      if (r < 0) {
	derr << __func__
	     << " unable to open db, FreelistManager is probably out of sync"
	     << dendl;
	break;
      }
      _sync_bluefs_and_fm();
      _close_db(false);
    }
  } else {
    _close_db(read_only);
  }
}

// updates legacy bluefs related recs in DB to a state valid for
// downgrades from nautilus.
void BlueStore::_sync_bluefs_and_fm()
{
  if (cct->_conf->bluestore_bluefs_db_compatibility) {
    bufferlist bl;
    encode(bluefs_extents, bl);
    dout(20) << __func__ << " bluefs_extents at KV is now 0x"
	     << std::hex << bluefs_extents << std::dec
	     << dendl;
    KeyValueDB::Transaction synct = db->get_transaction();
    synct->set(PREFIX_SUPER, "bluefs_extents", bl);
    synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);

    // Nice thing is that we don't need to update FreelistManager here.
    // It always has corresponding bits set to 'Free' for both Nautilus+ and
    // pre-Nautilus releases.
    // So once we get an extent to bluefs_extents this means it's
    // been free in allocator and hence it's free in FM too.

    db->submit_transaction_sync(synct);
  }
}

int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
{
  int r;
  ceph_assert(!db);
  ceph_assert(!(create && read_only));
  string fn = path + "/db";
  string options;
  string options_annex;
  stringstream err;
  std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  string kv_backend;
  std::vector<KeyValueDB::ColumnFamily> cfs;

  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  r = _is_bluefs(create, &do_bluefs);
  if (r < 0) {
    return r;
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;

  map<string,string> kv_options;
  // force separate wal dir for all new deployments.
  kv_options["separate_wal_dir"] = 1;
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }

    r = _open_bluefs(create);
    if (r < 0) {
      return r;
    }

    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env* a = new BlueRocksEnv(bluefs);
      rocksdb::Env* b = rocksdb::Env::Default();
      if (create) {
	string cmd = "rm -rf " + path + "/db " +
	  path + "/db.slow " +
	  path + "/db.wal";
	int r = system(cmd.c_str());
	(void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }
    bluefs->set_slow_device_expander(this);
    BlueFSVolumeSelector::paths paths;
    bluefs->get_vselector_paths(fn, paths);

    if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
      // we have both block.db and block; tell rocksdb!
      // note: the second (last) size value doesn't really matter
      ostringstream db_paths;
      bool first = true;
      for (auto& p : paths) {
	if (!first) {
	  db_paths << " ";
	}
	first = false;
	db_paths << p.first << "," << p.second;
      }
      kv_options["db_paths"] = db_paths.str();
      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
    }

    if (create) {
      for (auto& p : paths) {
	env->CreateDir(p.first);
      }
      // Selectors don't provide wal path so far hence create explicitly
      env->CreateDir(fn + ".wal");
    } else {
      std::vector<std::string> res;
      // check for dir presence
      auto r = env->GetChildren(fn + ".wal", &res);
      if (r.IsNotFound()) {
	kv_options.erase("separate_wal_dir");
      }
    }
  } else {
    string walfn = path + "/db.wal";

    if (create) {
      int r = ::mkdir(fn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }

      // wal_dir, too!
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << walfn
	     << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }
    } else {
      struct stat st;
      r = ::stat(walfn.c_str(), &st);
      if (r < 0 && errno == ENOENT) {
	kv_options.erase("separate_wal_dir");
      }
    }
  }

  db = KeyValueDB::create(cct,
			  kv_backend,
			  fn,
			  kv_options,
			  static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      _close_bluefs(read_only);
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);

  if (kv_backend == "rocksdb") {
    options = cct->_conf->bluestore_rocksdb_options;
    options_annex = cct->_conf->bluestore_rocksdb_options_annex;
    if (!options_annex.empty()) {
      if (!options.empty() &&
	  *options.rbegin() != ',') {
	options += ',';
      }
      options += options_annex;
    }

    map<string,string> cf_map;
    cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
				get_str_map,
				&cf_map,
				" \t");
    for (auto& i : cf_map) {
      dout(10) << "column family " << i.first << ": " << i.second << dendl;
      cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
    }
  }

  db->init(options);
  if (to_repair_db)
    return 0;
  if (create) {
    if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
      r = db->create_and_open(err, cfs);
    } else {
      r = db->create_and_open(err);
    }
  } else {
    // we pass in cf list here, but it is only used if the db already has
    // column families created.
    r = read_only ?
      db->open_read_only(err, cfs) :
      db->open(err, cfs);
  }
  if (r) {
    derr << __func__ << " error opening db: " << err.str() << dendl;
    _close_db(read_only);
    return -EIO;
  }
  dout(1) << __func__ << " opened " << kv_backend
	  << " path " << fn << " options " << options << dendl;
  return 0;
}

void BlueStore::_close_db(bool cold_close)
{
  ceph_assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    _close_bluefs(cold_close);
  }
}

void BlueStore::_dump_alloc_on_failure()
{
  auto dump_interval =
    cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
  if (dump_interval > 0 &&
      next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
    alloc->dump();
    next_dump_on_bluefs_alloc_failure = ceph_clock_now();
    next_dump_on_bluefs_alloc_failure += dump_interval;
  }
}

int BlueStore::allocate_bluefs_freespace(
  uint64_t min_size,
  uint64_t size,
  PExtentVector* extents_out)
{
  ceph_assert(min_size <= size);
  if (size) {
    // round up to alloc size
    uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
    min_size = p2roundup(min_size, alloc_size);
    size = p2roundup(size, alloc_size);

    PExtentVector extents_local;
    PExtentVector* extents = extents_out ? extents_out : &extents_local;

    uint64_t gift;
    uint64_t allocated = 0;
    int64_t alloc_len = 0;
    auto need = size;
    auto extent_count0 = extents->size();
    do {
      // hard cap to fit into 32 bits
      gift = std::min<uint64_t>(size, 1ull << 30);
      dout(10) << __func__ << " gifting " << gift
	       << " (" << byte_u_t(gift) << ")" << dendl;

      alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
      if (alloc_len > 0) {
	allocated += alloc_len;
	size -= alloc_len;
      }

      if (alloc_len < 0 ||
	  (alloc_len < (int64_t)gift && (min_size > allocated))) {
	derr << __func__
	     << " failed to allocate on 0x" << std::hex << gift
	     << " min_size 0x" << min_size
	     << " > allocated total 0x" << allocated
	     << " bluefs_shared_alloc_size 0x" << alloc_size
	     << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
	     << " available 0x " << alloc->get_free()
	     << std::dec << dendl;

	_dump_alloc_on_failure();
	alloc->release(*extents);
	extents->clear();
	return -ENOSPC;
      }
    } while (size && alloc_len > 0);
    _collect_allocation_stats(need, alloc_size, extents->size() - extent_count0);

    for (auto& e : *extents) {
      dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
      bluefs_extents.insert(e.offset, e.length);
      // apply to bluefs if not requested from outside
      if (!extents_out) {
	bluefs->add_block_extent(bluefs_layout.shared_bdev, e.offset, e.length);
      }
    }
  }
  return 0;
}

uint64_t BlueStore::available_freespace(uint64_t alloc_size) {
  uint64_t total = 0;
  auto iterated_allocation = [&](uint64_t off, uint64_t len) {
    // only count in size that is alloc_size aligned
    uint64_t dist_to_alignment;
    uint64_t offset_in_block = off & (alloc_size - 1);
    if (offset_in_block == 0)
      dist_to_alignment = 0;
    else
      dist_to_alignment = alloc_size - offset_in_block;
    if (dist_to_alignment >= len)
      return;
    len -= dist_to_alignment;
    total += p2align(len, alloc_size);
  };
  alloc->dump(iterated_allocation);
  return total;
}
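
// Worked example for the alignment math above, assuming alloc_size = 64 KiB:
// a free run at off = 0x1f000 of len = 0x23000 first skips
// dist_to_alignment = 0x10000 - 0xf000 = 0x1000 bytes to reach 0x20000,
// leaving 0x22000, of which p2align(0x22000, 0x10000) = 0x20000 is counted;
// the unaligned head and tail are ignored.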

int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
{
  float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;

  uint64_t my_free = alloc->get_free();
  uint64_t total = bdev->get_size();
  float my_free_ratio = (float)my_free / (float)total;

  uint64_t total_free = bluefs_free + my_free;

  float bluefs_ratio = (float)bluefs_free / (float)total_free;

  dout(10) << __func__
	   << " bluefs " << byte_u_t(bluefs_free)
	   << " free (" << bluefs_free_ratio
	   << ") bluestore " << byte_u_t(my_free)
	   << " free (" << my_free_ratio
	   << "), bluefs_ratio " << bluefs_ratio
	   << dendl;

  uint64_t gift = 0;
  uint64_t reclaim = 0;
  if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
    gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
    if (gift >= my_free)
      gift = my_free / 2;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
	     << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
	     << ", should gift " << byte_u_t(gift) << dendl;
  } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
    reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
    if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
      reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
    if (reclaim >= bluefs_free)
      reclaim = bluefs_free / 2;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
	     << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
	     << ", should reclaim " << byte_u_t(reclaim) << dendl;
  }

  // don't take over too much of the freespace
  uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
  if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
      cct->_conf->bluestore_bluefs_min < free_cap) {
    uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
    dout(10) << __func__ << " bluefs_total " << bluefs_total
	     << " < min " << cct->_conf->bluestore_bluefs_min
	     << ", should gift " << byte_u_t(g) << dendl;
    if (g > gift)
      gift = g;
    reclaim = 0;
  }
  uint64_t min_free =
    cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
  if (bluefs_free < min_free &&
      min_free < free_cap) {
    uint64_t g = min_free - bluefs_free;
    dout(10) << __func__ << " bluefs_free " << bluefs_free
	     << " < min " << min_free
	     << ", should gift " << byte_u_t(g) << dendl;
    if (g > gift)
      gift = g;
    reclaim = 0;
  }
  uint64_t max_free =
    cct->_conf.get_val<Option::size_t>("bluestore_bluefs_max_free");
  if (bluefs_free > max_free) {
    dout(10) << __func__ << " bluefs_free " << bluefs_free
	     << " > max " << max_free
	     << ", stop gifting for now" << dendl;
    gift = 0;
  }
  ceph_assert((int64_t)gift >= 0);
  ceph_assert((int64_t)reclaim >= 0);
  return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
}
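
// Numeric sketch of the decision above: with bluestore_bluefs_min_ratio =
// 0.02, gift_ratio = 0.02, bluefs_free = 1 GiB and my_free = 99 GiB,
// bluefs_ratio = 1/100 = 0.01 < 0.02, so the function proposes gifting
// 0.02 * 100 GiB = 2 GiB (a positive return).  A negative return is a
// reclaim proposal, acted on by _balance_bluefs_freespace() below.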

int BlueStore::_balance_bluefs_freespace()
{
  int ret = 0;
  ceph_assert(bluefs);

  vector<pair<uint64_t,uint64_t>> bluefs_usage;  // <free, total> ...
  bluefs->get_usage(&bluefs_usage);
  ceph_assert(bluefs_usage.size() > bluefs_layout.shared_bdev);

  bool clear_alert = true;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
    auto& p = bluefs_usage[bluefs_layout.shared_bdev];
    if (p.first != p.second) {
      auto& db = bluefs_usage[BlueFS::BDEV_DB];
      ostringstream ss;
      ss << "spilled over " << byte_u_t(p.second - p.first)
	 << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
	 << " used of " << byte_u_t(db.second) << ") to slow device";
      _set_spillover_alert(ss.str());
      clear_alert = false;
    }
  }
  if (clear_alert) {
    _clear_spillover_alert();
  }

  // fixme: look at primary bdev only for now
  int64_t delta = _get_bluefs_size_delta(
    bluefs_usage[bluefs_layout.shared_bdev].first,
    bluefs_usage[bluefs_layout.shared_bdev].second);

  // reclaim from bluefs?
  if (delta < 0) {
    // round up to alloc size
    uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
    auto reclaim = p2roundup(uint64_t(-delta), alloc_size);

    // hard cap to fit into 32 bits
    reclaim = std::min<uint64_t>(reclaim, 1ull << 30);
    dout(10) << __func__ << " reclaiming " << reclaim
	     << " (" << byte_u_t(reclaim) << ")" << dendl;

    while (reclaim > 0) {
      // NOTE: this will block and do IO.
      PExtentVector extents;
      int r = bluefs->reclaim_blocks(bluefs_layout.shared_bdev, reclaim,
				     &extents);
      if (r < 0) {
	derr << __func__ << " failed to reclaim space from bluefs"
	     << dendl;
	break;
      }
      for (auto e : extents) {
	bluefs_extents.erase(e.offset, e.length);
	bluefs_extents_reclaiming.insert(e.offset, e.length);
	reclaim -= e.length;
      }
    }

    ret = 1;
  }

  return ret;
}

int BlueStore::_open_collections()
{
  dout(10) << __func__ << dendl;
  collections_had_errors = false;
  ceph_assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      auto c = ceph::make_ref<Collection>(
	  this,
	  onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
	  buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
	  cid);
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      try {
	decode(c->cnode, p);
      } catch (buffer::error& e) {
	derr << __func__ << " failed to decode cnode, key:"
	     << pretty_binary_string(it->key()) << dendl;
	return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c
	       << " " << c->cnode << dendl;
      _osr_attach(c.get());
      coll_map[cid] = c;
    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      collections_had_errors = true;
    }
  }
  return 0;
}

void BlueStore::_fsck_collections(int64_t* errors)
{
  if (collections_had_errors) {
    dout(10) << __func__ << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
    for (it->upper_bound(string());
	 it->valid();
	 it->next()) {
      coll_t cid;
      if (!cid.parse(it->key())) {
	derr << __func__ << " unrecognized collection " << it->key() << dendl;
	if (errors) {
	  (*errors)++;
	}
      }
    }
  }
}

void BlueStore::_set_per_pool_omap()
{
  per_pool_omap = false;
  bufferlist bl;
  db->get(PREFIX_SUPER, "per_pool_omap", &bl);
  if (bl.length()) {
    per_pool_omap = true;
    dout(10) << __func__ << " per_pool_omap=1" << dendl;
  } else {
    dout(10) << __func__ << " per_pool_omap not present" << dendl;
  }
  _check_no_per_pool_omap_alert();
}

void BlueStore::_open_statfs()
{
  osd_pools.clear();
  vstatfs.reset();

  bufferlist bl;
  int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
  if (r >= 0) {
    per_pool_stat_collection = false;
    if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
      auto it = bl.cbegin();
      vstatfs.decode(it);
      dout(10) << __func__ << " store_statfs is found" << dendl;
    } else {
      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
    }
    _check_legacy_statfs_alert();
  } else {
    per_pool_stat_collection = true;
    dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
    for (it->upper_bound(string());
	 it->valid();
	 it->next()) {
      uint64_t pool_id;
      int r = get_key_pool_stat(it->key(), &pool_id);
      ceph_assert(r == 0);

      bufferlist bl = it->value();
      auto p = bl.cbegin();
      auto& st = osd_pools[pool_id];
      try {
	st.decode(p);
	vstatfs += st;

	dout(30) << __func__ << " pool " << pool_id
		 << " statfs " << st << dendl;
      } catch (buffer::error& e) {
	derr << __func__ << " failed to decode pool stats, key:"
	     << pretty_binary_string(it->key()) << dendl;
      }
    }
  }
  dout(30) << __func__ << " statfs " << vstatfs << dendl;
}
int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
           << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
           << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
        r = -errno;
        derr << __func__ << " failed to open " << epath << " file: "
             << cpp_strerror(r) << dendl;
        return r;
      }
      // write the Transport ID of the NVMe device
      // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
      // where "0000:02:00.0" is the selector of a PCI device, see
      // the first column of "lspci -mm -n -D"
      string trid{"trtype:PCIe "};
      trid += "traddr:";
      trid += epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, trid.c_str(), trid.size());
      ceph_assert(r == static_cast<int>(trid.size()));
      dout(1) << __func__ << " created " << name << " symlink to "
              << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (create &&
      (!epath.length() ||
       !epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX))) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
          S_ISREG(st.st_mode) &&   // if it is a regular file
          st.st_size == 0) {       // and is 0 bytes
        r = ::ftruncate(fd, size);
        if (r < 0) {
          derr << __func__ << " failed to resize " << name << " file to "
               << size << ": " << cpp_strerror(r) << dendl;
          VOID_TEMP_FAILURE_RETRY(::close(fd));
          return r;
        }

        if (cct->_conf->bluestore_block_preallocate_file) {
          r = ::ceph_posix_fallocate(fd, 0, size);
          if (r > 0) {
            derr << __func__ << " failed to prefallocate " << name << " file to "
                 << size << ": " << cpp_strerror(r) << dendl;
            VOID_TEMP_FAILURE_RETRY(::close(fd));
            return -r;
          }
        }
        dout(1) << __func__ << " resized " << name << " file to "
                << byte_u_t(size) << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
        derr << __func__ << " failed to open " << name << " file: "
             << cpp_strerror(r) << dendl;
        return r;
      }
    }
  }
  return 0;
}
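// Illustrative sketch (not BlueStore code): how an SPDK transport id is
// derived from a prefixed device path, as done above.  The prefix value
// below is a hypothetical example; BlueStore uses the SPDK_PREFIX constant.
#if 0
#include <string>

static std::string make_pcie_trid(const std::string& epath,
                                  const std::string& prefix /* e.g. "spdk:" */)
{
  // "spdk:0000:02:00.0" -> "trtype:PCIe traddr:0000:02:00.0"
  std::string trid{"trtype:PCIe "};
  trid += "traddr:";
  trid += epath.substr(prefix.size());
  return trid;
}
#endif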
int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
         << cct->_conf->osd_max_object_size << " > bluestore max "
         << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
        r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
        if (r < 0) {
          derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
               << dendl;
          return r;
        }
        if (r > 0) {
          derr << __func__ << " fsck found " << r << " errors" << dendl;
          r = -EIO;
        }
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
        derr << __func__ << " expected bluestore, but type is " << type << dendl;
        return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
        return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
           << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
                                   cct->_conf->bluestore_block_size,
                                   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
                                     cct->_conf->bluestore_block_wal_size,
                                     cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
                                     cct->_conf->bluestore_block_db_size,
                                     cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  // choose min_alloc_size
  if (cct->_conf->bluestore_min_alloc_size) {
    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
  } else {
    if (bdev->is_rotational()) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
    } else {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
    }
  }

  // make sure min_alloc_size is power of 2 aligned.
  if (!isp2(min_alloc_size)) {
    derr << __func__ << " min_alloc_size 0x"
         << std::hex << min_alloc_size << std::dec
         << " is not power of 2 aligned!"
         << dendl;
    r = -EINVAL;
    goto out_close_bdev;
  }

  r = _open_db(true);
  if (r < 0)
    goto out_close_bdev;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    r = _open_fm(t, true);
    if (r < 0)
      goto out_close_db;
    {
      bufferlist bl;
      encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }
    {
      bufferlist bl;
      encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }
    {
      bufferlist bl;
      bl.append("1");
      t->set(PREFIX_SUPER, "per_pool_omap", bl);
    }
    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_fm;

  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
  if (r < 0)
    goto out_close_fm;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_fm;
    }
  }

  if (out_of_sync_fm.fetch_and(0)) {
    _sync_bluefs_and_fm();
  }

 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db(false);
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }

  if (r == 0) {
    // indicate success by writing the 'mkfs_done' file
    r = write_meta("mkfs_done", "yes");
  }

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }
  return r;
}
int BlueStore::_mount_for_bluefs()
{
  int r = _open_path();
  ceph_assert(r == 0);
  r = _open_fsid(false);
  ceph_assert(r == 0);
  r = _read_fsid(&fsid);
  ceph_assert(r == 0);
  r = _lock_fsid();
  ceph_assert(r == 0);
  r = _open_bluefs(false);
  ceph_assert(r == 0);
  return r;
}
void BlueStore::_umount_for_bluefs()
{
  _close_bluefs(false);
}
int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _mount_for_bluefs();

  uint64_t reserved = 0;
  if (id == BlueFS::BDEV_NEWWAL) {
    string p = path + "/block.wal";
    r = _setup_block_symlink_or_file("block.wal", dev_path,
                                     cct->_conf->bluestore_block_wal_size,
                                     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
                                 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
        p,
        bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
        "bluefs wal",
        true);
      ceph_assert(r == 0);
    }

    reserved = BDEV_LABEL_BLOCK_SIZE;
    bluefs_layout.dedicated_wal = true;
  } else if (id == BlueFS::BDEV_NEWDB) {
    string p = path + "/block.db";
    r = _setup_block_symlink_or_file("block.db", dev_path,
                                     cct->_conf->bluestore_block_db_size,
                                     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
                                 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
        p,
        bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
        "bluefs db",
        true);
      ceph_assert(r == 0);
    }
    reserved = SUPER_RESERVED;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  }

  bluefs->add_block_extent(
    id,
    reserved,
    bluefs->get_block_device_size(id) - reserved, true);

  r = bluefs->prepare_new_device(id, bluefs_layout);
  ceph_assert(r == 0);

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }

  _umount_for_bluefs();
  return r;
}
int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
  int id)
{
  dout(10) << __func__ << " id:" << id << dendl;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  int r = _mount_for_bluefs();

  // require bluestore_bluefs_min_free to be free at target device!
  uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
  for (auto src_id : devs_source) {
    used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
  }
  uint64_t target_free = bluefs->get_free(id);
  if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
    // will need to remount full BlueStore instance to allocate more space
    _umount_for_bluefs();

    r = mount();
    ceph_assert(r == 0);
    dout(1) << __func__
            << " Allocating more space at slow device for BlueFS: +"
            << used_space - target_free << " bytes" << dendl;
    r = allocate_bluefs_freespace(
      used_space - target_free,
      used_space - target_free,
      nullptr);

    umount();
    if (r != 0) {
      derr << __func__
           << " can't migrate, unable to allocate extra space: "
           << used_space - target_free << " at target:" << id
           << dendl;
      return -ENOSPC;
    }

    r = _mount_for_bluefs();
    ceph_assert(r == 0);
  } else if (target_free < used_space) {
    derr << __func__
         << " can't migrate, free space at target: " << target_free
         << " is less than required space: " << used_space
         << dendl;
    return -ENOSPC;
  }
  if (devs_source.count(BlueFS::BDEV_DB)) {
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    bluefs_layout.dedicated_wal = false;
  }
  r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
  } else {
    if (devs_source.count(BlueFS::BDEV_DB)) {
      r = unlink(string(path + "/block.db").c_str());
      ceph_assert(r == 0);
    }
    if (devs_source.count(BlueFS::BDEV_WAL)) {
      r = unlink(string(path + "/block.wal").c_str());
      ceph_assert(r == 0);
    }
  }

  _umount_for_bluefs();
  return r;
}
int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
  int id,
  const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  int r = _mount_for_bluefs();

  string link_db;
  string link_wal;
  if (devs_source.count(BlueFS::BDEV_DB) &&
      bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
    link_db = path + "/block.db";
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    link_wal = path + "/block.wal";
    bluefs_layout.dedicated_wal = false;
  }

  uint64_t reserved = 0;
  string target_name;
  uint64_t target_size = 0;
  if (id == BlueFS::BDEV_NEWWAL) {
    target_name = "block.wal";
    target_size = cct->_conf->bluestore_block_wal_size;
    bluefs_layout.dedicated_wal = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
                                 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
        dev_path,
        bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
        "bluefs wal",
        true);
      ceph_assert(r == 0);
    }
    reserved = BDEV_LABEL_BLOCK_SIZE;
  } else if (id == BlueFS::BDEV_NEWDB) {
    target_name = "block.db";
    target_size = cct->_conf->bluestore_block_db_size;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
                                 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
        dev_path,
        bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
        "bluefs db",
        true);
      ceph_assert(r == 0);
    }
    reserved = SUPER_RESERVED;
  }

  bluefs->add_block_extent(
    id, reserved, bluefs->get_block_device_size(id) - reserved);

  r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
  } else {
    if (!link_db.empty()) {
      r = unlink(link_db.c_str());
      ceph_assert(r == 0);
    }
    if (!link_wal.empty()) {
      r = unlink(link_wal.c_str());
      ceph_assert(r == 0);
    }
    r = _setup_block_symlink_or_file(
      target_name,
      dev_path,
      target_size,
      true);
    ceph_assert(r == 0);
    dout(0) << __func__ << " success" << dendl;
  }

  _umount_for_bluefs();
  return r;
}
string BlueStore::get_device_path(unsigned id)
{
  string res;
  if (id < BlueFS::MAX_BDEV) {
    switch (id) {
    case BlueFS::BDEV_WAL:
      res = path + "/block.wal";
      break;
    case BlueFS::BDEV_DB:
      if (id == bluefs_layout.shared_bdev) {
        res = path + "/block";
      } else {
        res = path + "/block.db";
      }
      break;
    case BlueFS::BDEV_SLOW:
      res = path + "/block";
      break;
    }
  }
  return res;
}
int BlueStore::expand_devices(ostream& out)
{
  int r = cold_open();
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  out << "Expanding DB/WAL..." << std::endl;
  for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
    if (devid == bluefs_layout.shared_bdev) {
      continue;
    }
    uint64_t size = bluefs->get_block_device_size(devid);
    if (size == 0) {
      // no bdev
      continue;
    }

    interval_set<uint64_t> before;
    bluefs->get_block_extents(devid, &before);
    ceph_assert(!before.empty());
    uint64_t end = before.range_end();
    if (end < size) {
      out << devid
          << " : expanding " << " from 0x" << std::hex
          << end << " to 0x" << size << std::dec << std::endl;
      bluefs->add_block_extent(devid, end, size - end);
      string p = get_device_path(devid);
      const char* path = p.c_str();
      if (path == nullptr) {
        derr << devid
             << ": can't find device path " << dendl;
        continue;
      }
      bluestore_bdev_label_t label;
      int r = _read_bdev_label(cct, path, &label);
      if (r < 0) {
        derr << "unable to read label for " << path << ": "
             << cpp_strerror(r) << dendl;
        continue;
      }
      label.size = size;
      r = _write_bdev_label(cct, path, label);
      if (r < 0) {
        derr << "unable to write label for " << path << ": "
             << cpp_strerror(r) << dendl;
        continue;
      }
      out << devid
          << " : size label updated to " << size
          << std::endl;
    }
  }
  uint64_t size0 = fm->get_size();
  uint64_t size = bdev->get_size();
  if (size0 < size) {
    out << bluefs_layout.shared_bdev
        << " : expanding " << " from 0x" << std::hex
        << size0 << " to 0x" << size << std::dec << std::endl;
    _write_out_fm_meta(size, true);
    cold_close();

    // mount in read/write to sync expansion changes
    r = _mount(false);
    ceph_assert(r == 0);
    umount();
  } else {
    cold_close();
  }
  return r;
}
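// Illustrative sketch (not BlueStore code): expansion above boils down to
// "new extent = [current end of known extents, new device size)".  A
// standalone analogue using a sorted offset->length map; the container
// choice is hypothetical, BlueFS uses interval_set<uint64_t>.
#if 0
#include <cstdint>
#include <iterator>
#include <map>
#include <utility>

// returns {offset, length} of the extent to add, or length 0 if none
static std::pair<uint64_t, uint64_t> expansion_extent(
  const std::map<uint64_t, uint64_t>& extents,  // offset -> length
  uint64_t new_size)
{
  uint64_t end = 0;
  if (!extents.empty()) {
    auto last = std::prev(extents.end());
    end = last->first + last->second;           // like interval_set::range_end()
  }
  return end < new_size ? std::make_pair(end, new_size - end)
                        : std::make_pair(end, uint64_t(0));
}
#endif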
int BlueStore::dump_bluefs_sizes(ostream& out)
{
  int r = cold_open();
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  cold_close();
  return r;
}
void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t oold = onode_cache_shards.size();
  size_t bold = buffer_cache_shards.size();
  ceph_assert(num >= oold && num >= bold);
  onode_cache_shards.resize(num);
  buffer_cache_shards.resize(num);
  for (unsigned i = oold; i < num; ++i) {
    onode_cache_shards[i] =
      OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
                              logger);
  }
  for (unsigned i = bold; i < num; ++i) {
    buffer_cache_shards[i] =
      BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
                               logger);
  }
}
int BlueStore::_mount(bool kv_only, bool open_db)
{
  dout(1) << __func__ << " path " << path << dendl;

  _kv_only = kv_only;

  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
           << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
         << cct->_conf->osd_max_object_size << " > bluestore max "
         << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  if (open_db) {
    r = _open_db_and_around(false);
  } else {
    // we can bypass db open exclusively in case of kv_only mode
    ceph_assert(kv_only);
    r = _open_db(false, true);
  }
  if (r < 0)
    goto out_bdev;

  if (kv_only)
    return 0;

  r = _upgrade_super();
  if (r < 0)
    goto out_db;

  r = _open_collections();
  if (r < 0)
    goto out_db;

  r = _reload_logger();
  if (r < 0)
    goto out_db;

  r = _deferred_replay();
  if (r < 0)
    goto out_db;

  mempool_thread.init();

  if ((!per_pool_stat_collection || !per_pool_omap) &&
      cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {

    bool was_per_pool_omap = per_pool_omap;

    dout(1) << __func__ << " quick-fix on mount" << dendl;
    _fsck_on_open(FSCK_SHALLOW, true);

    //reread statfs
    //FIXME minor: replace with actual open/close?
    _open_statfs();
    _check_legacy_statfs_alert();

    //set again as hopefully it has been fixed
    if (!was_per_pool_omap) {
      _set_per_pool_omap();
    }
  }

  mounted = true;
  return 0;

 out_db:
  _close_db_and_around(false);
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
int BlueStore::umount()
{
  ceph_assert(_kv_only || mounted);
  dout(1) << __func__ << dendl;

  mounted = false;
  if (!_kv_only) {
    mempool_thread.shutdown();
    dout(20) << __func__ << " stopping kv thread" << dendl;
    _kv_stop();
    dout(20) << __func__ << " closing" << dendl;
  }
  _close_db_and_around(false);

  if (cct->_conf->bluestore_fsck_on_umount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }
  return 0;
}
int BlueStore::cold_open()
{
  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    return r;
  r = _read_fsid(&fsid);
  if (r < 0)
    return r;
  r = _open_bdev(false);
  if (r < 0)
    return r;
  r = _open_db_and_around(true);
  return r;
}

int BlueStore::cold_close()
{
  _close_db_and_around(true);
  return 0;
}
// derr wrapper to limit enormous output and avoid log flooding.
// Of limited use for now, where such output is expected.
#define fsck_derr(err_cnt, threshold) \
  if (err_cnt <= threshold) {         \
    bool need_skip_print = err_cnt == threshold; \
    derr

#define fsck_dendl \
    dendl;          \
    if (need_skip_print) \
      derr << "more error lines skipped..." << dendl; \
  }
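// Illustrative usage (hypothetical counts): fsck_derr/fsck_dendl bracket a
// derr line so that once err_cnt reaches the threshold the line is emitted
// one last time followed by a single "more error lines skipped..." marker,
// and further lines are suppressed:
//
//   fsck_derr(errors, MAX_FSCK_ERROR_LINES)
//     << "fsck error: something is wrong" << fsck_dendl;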
int _fsck_sum_extents(
  const PExtentVector& extents,
  bool compressed,
  store_statfs_t& expected_statfs)
{
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
  }
  return 0;
}
int BlueStore::_fsck_check_extents(
  const coll_t& cid,
  const ghobject_t& oid,
  const PExtentVector& extents,
  bool compressed,
  mempool_dynamic_bitset &used_blocks,
  uint64_t granularity,
  BlueStoreRepairer* repairer,
  store_statfs_t& expected_statfs,
  FSCKDepth depth)
{
  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
  int errors = 0;
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
    if (depth != FSCK_SHALLOW) {
      bool already = false;
      apply_for_bitset_range(
        e.offset, e.length, granularity, used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          if (bs.test(pos)) {
            if (repairer) {
              repairer->note_misreference(
                pos * min_alloc_size, min_alloc_size, !already);
            }
            if (!already) {
              derr << "fsck error: " << oid << " extent " << e
                   << " or a subset is already allocated (misreferenced)" << dendl;
              ++errors;
              already = true;
            }
          } else {
            bs.set(pos);
          }
        });
      if (repairer) {
        repairer->get_space_usage_tracker().set_used(e.offset, e.length, cid, oid);
      }

      if (e.end() > bdev->get_size()) {
        derr << "fsck error: " << oid << " extent " << e
             << " past end of block device" << dendl;
        ++errors;
      }
    }
  }
  return errors;
}
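// Illustrative sketch (not BlueStore code): the essence of the used_blocks
// check above -- mark each allocation unit in a bitmap and flag any extent
// whose units are already set (a misreference).  Standalone, using a plain
// std::vector<bool> sized to cover the device instead of
// mempool_dynamic_bitset; all names are hypothetical.
#if 0
#include <cstdint>
#include <vector>

struct Extent { uint64_t offset; uint64_t length; };

static int count_misreferenced(const std::vector<Extent>& extents,
                               std::vector<bool>& used, uint64_t au_size)
{
  int errors = 0;
  for (const auto& e : extents) {
    bool already = false;
    uint64_t first = e.offset / au_size;
    uint64_t last = (e.offset + e.length + au_size - 1) / au_size;
    for (uint64_t pos = first; pos < last; ++pos) {
      if (used[pos]) {
        if (!already) {  // report each extent at most once, like the lambda above
          ++errors;
          already = true;
        }
      } else {
        used[pos] = true;
      }
    }
  }
  return errors;
}
#endif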
void BlueStore::_fsck_check_pool_statfs(
  BlueStore::per_pool_statfs& expected_pool_statfs,
  int64_t& errors,
  int64_t& warnings,
  BlueStoreRepairer* repairer)
{
  auto it = db->get_iterator(PREFIX_STAT);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
        if (repairer) {
          repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
          derr << "fsck error: " << "legacy statfs record found, removing"
               << dendl;
        }
        ++errors;
        continue;
      }
      int64_t pool_id;
      if (get_key_pool_stat(key, &pool_id) < 0) {
        derr << "fsck error: bad key " << key
             << " in statfs namespace" << dendl;
        if (repairer) {
          repairer->remove_key(db, PREFIX_STAT, key);
        }
        ++errors;
        continue;
      }

      volatile_statfs vstatfs;
      bufferlist bl = it->value();
      auto blp = bl.cbegin();
      try {
        vstatfs.decode(blp);
      } catch (buffer::error& e) {
        derr << "fsck error: failed to decode Pool StatFS record"
             << pretty_binary_string(key) << dendl;
        if (repairer) {
          dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
                   << pretty_binary_string(key)
                   << "', removing" << dendl;
          repairer->remove_key(db, PREFIX_STAT, key);
        }
        ++errors;
        vstatfs.reset();
      }
      auto stat_it = expected_pool_statfs.find(pool_id);
      if (stat_it == expected_pool_statfs.end()) {
        if (vstatfs.is_empty()) {
          // we don't consider that as an error since empty pool statfs
          // are left in DB for now
          dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
                   << std::hex << pool_id << std::dec << dendl;
          if (repairer) {
            // but we need to increment error count in case of repair
            // to have proper counters at the end
            // (as repairer increments recovery counter anyway).
            ++errors;
          }
        } else {
          derr << "fsck error: found stray Pool StatFS record for pool id 0x"
               << std::hex << pool_id << std::dec << dendl;
          ++errors;
        }
        if (repairer) {
          repairer->remove_key(db, PREFIX_STAT, key);
        }
        continue;
      }
      store_statfs_t statfs;
      vstatfs.publish(&statfs);
      if (!(stat_it->second == statfs)) {
        derr << "fsck error: actual " << statfs
             << " != expected " << stat_it->second
             << " for pool 0x"
             << std::hex << pool_id << std::dec << dendl;
        if (repairer) {
          repairer->fix_statfs(db, key, stat_it->second);
        }
        ++errors;
      }
      expected_pool_statfs.erase(stat_it);
    }
  } // if (it)
  for (auto& s : expected_pool_statfs) {
    if (s.second.is_zero()) {
      // we might lack empty statfs recs in DB
      continue;
    }
    derr << "fsck error: missing Pool StatFS record for pool "
         << std::hex << s.first << std::dec << dendl;
    if (repairer) {
      string key;
      get_pool_stat_key(s.first, &key);
      repairer->fix_statfs(db, key, s.second);
    }
    ++errors;
  }
  if (!per_pool_stat_collection &&
      repairer) {
    // by virtue of running this method, we correct the top-level
    // error of having global stats
    repairer->inc_repaired();
  }
}
BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
  BlueStore::FSCKDepth depth,
  int64_t pool_id,
  BlueStore::CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& value,
  mempool::bluestore_fsck::list<string>* expecting_shards,
  map<BlobRef, bluestore_blob_t::unused_t>* referenced,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& num_objects = ctx.num_objects;
  auto& num_extents = ctx.num_extents;
  auto& num_blobs = ctx.num_blobs;
  auto& num_sharded_objects = ctx.num_sharded_objects;
  auto& num_spanning_blobs = ctx.num_spanning_blobs;
  auto used_blocks = ctx.used_blocks;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
    &ctx.expected_pool_statfs[pool_id] :
    &ctx.expected_store_statfs;

  dout(10) << __func__ << " " << oid << dendl;
  OnodeRef o;
  o.reset(Onode::decode(c, oid, key, value));
  ++num_objects;

  num_spanning_blobs += o->extent_map.spanning_blob_map.size();

  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  _dump_onode<30>(cct, *o);
  // shards
  if (!o->extent_map.shards.empty()) {
    ++num_sharded_objects;
    if (depth != FSCK_SHALLOW) {
      ceph_assert(expecting_shards);
      for (auto& s : o->extent_map.shards) {
        dout(20) << __func__ << " shard " << *s.shard_info << dendl;
        expecting_shards->push_back(string());
        get_extent_shard_key(o->key, s.shard_info->offset,
                             &expecting_shards->back());
        if (s.shard_info->offset >= o->onode.size) {
          derr << "fsck error: " << oid << " shard 0x" << std::hex
               << s.shard_info->offset << " past EOF at 0x" << o->onode.size
               << std::dec << dendl;
          ++errors;
        }
      }
    }
  }

  // lextents
  uint64_t pos = 0;
  mempool::bluestore_fsck::map<BlobRef,
    bluestore_blob_use_tracker_t> ref_map;
  for (auto& l : o->extent_map.extent_map) {
    dout(20) << __func__ << " " << l << dendl;
    if (l.logical_offset < pos) {
      derr << "fsck error: " << oid << " lextent at 0x"
           << std::hex << l.logical_offset
           << " overlaps with the previous, which ends at 0x" << pos
           << std::dec << dendl;
      ++errors;
    }
    if (depth != FSCK_SHALLOW &&
        o->extent_map.spans_shard(l.logical_offset, l.length)) {
      derr << "fsck error: " << oid << " lextent at 0x"
           << std::hex << l.logical_offset << "~" << l.length
           << " spans a shard boundary"
           << std::dec << dendl;
      ++errors;
    }
    pos = l.logical_offset + l.length;
    res_statfs->data_stored += l.length;
    ceph_assert(l.blob);
    const bluestore_blob_t& blob = l.blob->get_blob();

    auto& ref = ref_map[l.blob];
    if (ref.is_empty()) {
      uint32_t min_release_size = blob.get_release_size(min_alloc_size);
      uint32_t l = blob.get_logical_length();
      ref.init(l, min_release_size);
    }
    ref.get(l.blob_offset, l.length);
    ++num_extents;
    if (depth != FSCK_SHALLOW &&
        blob.has_unused()) {
      ceph_assert(referenced);
      auto p = referenced->find(l.blob);
      bluestore_blob_t::unused_t* pu;
      if (p == referenced->end()) {
        pu = &(*referenced)[l.blob];
      } else {
        pu = &p->second;
      }
      uint64_t blob_len = blob.get_logical_length();
      ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
      ceph_assert(l.blob_offset + l.length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
      uint64_t start = l.blob_offset / chunk_size;
      uint64_t end =
        round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
        (*pu) |= (1u << i);
      }
    }
  } //for (auto& l : o->extent_map.extent_map)

  for (auto& i : ref_map) {
    ++num_blobs;
    const bluestore_blob_t& blob = i.first->get_blob();
    bool equal =
      depth == FSCK_SHALLOW ? true :
      i.first->get_blob_use_tracker().equal(i.second);
    if (!equal) {
      derr << "fsck error: " << oid << " blob " << *i.first
           << " doesn't match expected ref_map " << i.second << dendl;
      ++errors;
    }
    if (blob.is_compressed()) {
      res_statfs->data_compressed += blob.get_compressed_payload_length();
      res_statfs->data_compressed_original +=
        i.first->get_referenced_bytes();
    }
    if (blob.is_shared()) {
      if (i.first->shared_blob->get_sbid() > blobid_max) {
        derr << "fsck error: " << oid << " blob " << blob
             << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
             << blobid_max << dendl;
        ++errors;
      } else if (i.first->shared_blob->get_sbid() == 0) {
        derr << "fsck error: " << oid << " blob " << blob
             << " marked as shared but has uninitialized sbid"
             << dendl;
        ++errors;
      }
      // the below lock is optional and provided in multithreading mode only
      if (sb_info_lock) {
        sb_info_lock->lock();
      }
      sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
      ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
      ceph_assert(sbi.pool_id == INT64_MIN ||
                  sbi.pool_id == oid.hobj.get_logical_pool());
      sbi.cid = c->cid;
      sbi.pool_id = oid.hobj.get_logical_pool();
      sbi.sb = i.first->shared_blob;
      sbi.oids.push_back(oid);
      sbi.compressed = blob.is_compressed();
      for (auto e : blob.get_extents()) {
        if (e.is_valid()) {
          sbi.ref_map.get(e.offset, e.length);
        }
      }
      if (sb_info_lock) {
        sb_info_lock->unlock();
      }
    } else if (depth != FSCK_SHALLOW) {
      ceph_assert(used_blocks);
      errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
                                    blob.is_compressed(),
                                    *used_blocks,
                                    fm->get_alloc_size(),
                                    repairer,
                                    *res_statfs,
                                    depth);
    } else {
      errors += _fsck_sum_extents(
        blob.get_extents(),
        blob.is_compressed(),
        *res_statfs);
    }
  } // for (auto& i : ref_map)

  {
    auto &sbm = o->extent_map.spanning_blob_map;
    size_t broken = 0;
    BlobRef first_broken;
    for (auto it = sbm.begin(); it != sbm.end();) {
      auto it1 = it++;
      if (ref_map.count(it1->second) == 0) {
        if (!broken) {
          first_broken = it1->second;
        }
        ++broken;
        sbm.erase(it1);
      }
    }
    if (broken) {
      derr << "fsck error: " << oid << " - " << broken
           << " zombie spanning blob(s) found, the first one: "
           << *first_broken << dendl;
      ++errors;
      if (repairer) {
        auto txn = repairer->fix_spanning_blobs(db);
        _record_onode(o, txn);
      }
    }
  }

  if (o->onode.has_omap()) {
    _fsck_check_object_omap(depth, o, ctx);
  }

  return o;
}
#include "common/WorkQueue.h"

class ShallowFSCKThreadPool : public ThreadPool
{
public:
  ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
    ThreadPool(cct_, nm, tn, n) {
  }
  void worker(ThreadPool::WorkThread* wt) override {
    int next_wq = 0;
    while (!_stop) {
      next_wq %= work_queues.size();
      WorkQueue_ *wq = work_queues[next_wq++];
      void* item = wq->_void_dequeue();
      if (item) {
        TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
        wq->_void_process(item, tp_handle);
      }
    }
  }
  template <size_t BatchLen>
  struct FSCKWorkQueue : public ThreadPool::WorkQueue_
  {
    struct Entry {
      int64_t pool_id;
      BlueStore::CollectionRef c;
      ghobject_t oid;
      string key;
      bufferlist value;
    };
    struct Batch {
      std::atomic<size_t> running = { 0 };
      size_t entry_count = 0;
      std::array<Entry, BatchLen> entries;

      int64_t errors = 0;
      int64_t warnings = 0;
      uint64_t num_objects = 0;
      uint64_t num_extents = 0;
      uint64_t num_blobs = 0;
      uint64_t num_sharded_objects = 0;
      uint64_t num_spanning_blobs = 0;
      store_statfs_t expected_store_statfs;
      BlueStore::per_pool_statfs expected_pool_statfs;
    };

    size_t batchCount = 0;
    BlueStore* store = nullptr;

    ceph::mutex* sb_info_lock = nullptr;
    BlueStore::sb_info_map_t* sb_info = nullptr;
    BlueStoreRepairer* repairer = nullptr;

    Batch* batches = nullptr;
    size_t last_batch_pos = 0;
    bool batch_acquired = false;

    FSCKWorkQueue(std::string n,
                  size_t _batchCount,
                  BlueStore* _store,
                  ceph::mutex* _sb_info_lock,
                  BlueStore::sb_info_map_t& _sb_info,
                  BlueStoreRepairer* _repairer) :
      WorkQueue_(n, time_t(), time_t()),
      batchCount(_batchCount),
      store(_store),
      sb_info_lock(_sb_info_lock),
      sb_info(&_sb_info),
      repairer(_repairer)
    {
      batches = new Batch[batchCount];
    }
    ~FSCKWorkQueue() {
      delete[] batches;
    }

    /// Remove all work items from the queue.
    void _clear() override {
    }

    /// Check whether there is anything to do.
    bool _empty() override {
      return true;
    }

    /// Get the next work item to process.
    void* _void_dequeue() override {
      size_t pos = rand() % batchCount;
      size_t pos0 = pos;
      do {
        auto& batch = batches[pos];
        if (batch.running.fetch_add(1) == 0) {
          if (batch.entry_count) {
            return &batch;
          }
        }
        batch.running.fetch_sub(1);
        pos = (pos + 1) % batchCount;
      } while (pos != pos0);
      return nullptr;
    }
    /** @brief Process the work item.
     * This function will be called several times in parallel
     * and must therefore be thread-safe. */
    void _void_process(void* item, TPHandle& handle) override {
      Batch* batch = (Batch*)item;

      BlueStore::FSCK_ObjectCtx ctx(
        batch->errors,
        batch->warnings,
        batch->num_objects,
        batch->num_extents,
        batch->num_blobs,
        batch->num_sharded_objects,
        batch->num_spanning_blobs,
        nullptr, // used_blocks
        nullptr, //used_omap_head
        sb_info_lock,
        *sb_info,
        batch->expected_store_statfs,
        batch->expected_pool_statfs,
        repairer);

      for (size_t i = 0; i < batch->entry_count; i++) {
        auto& entry = batch->entries[i];

        store->fsck_check_objects_shallow(
          BlueStore::FSCK_SHALLOW,
          entry.pool_id,
          entry.c,
          entry.oid,
          entry.key,
          entry.value,
          nullptr, // expecting_shards - this will need a protection if passed
          nullptr, // referenced
          ctx);
      }
      //std::cout << "processed " << batch << std::endl;
      batch->entry_count = 0;
      batch->running.fetch_sub(1);
    }
    /** @brief Synchronously finish processing a work item.
     * This function is called after _void_process with the global thread pool lock held,
     * so at most one copy will execute simultaneously for a given thread pool.
     * It can be used for non-thread-safe finalization. */
    void _void_process_finish(void*) override {
      ceph_assert(false);
    }

    bool queue(
      int64_t pool_id,
      BlueStore::CollectionRef c,
      const ghobject_t& oid,
      const string& key,
      const bufferlist& value) {
      bool res = false;
      size_t pos0 = last_batch_pos;
      if (!batch_acquired) {
        do {
          auto& batch = batches[last_batch_pos];
          if (batch.running.fetch_add(1) == 0) {
            if (batch.entry_count < BatchLen) {
              batch_acquired = true;
              break;
            }
          }
          batch.running.fetch_sub(1);
          last_batch_pos++;
          last_batch_pos %= batchCount;
        } while (last_batch_pos != pos0);
      }
      if (batch_acquired) {
        auto& batch = batches[last_batch_pos];
        ceph_assert(batch.running);
        ceph_assert(batch.entry_count < BatchLen);

        auto& entry = batch.entries[batch.entry_count];
        entry.pool_id = pool_id;
        entry.c = c;
        entry.oid = oid;
        entry.key = key;
        entry.value = value;

        ++batch.entry_count;
        if (batch.entry_count == BatchLen) {
          batch_acquired = false;
          batch.running.fetch_sub(1);
          last_batch_pos++;
          last_batch_pos %= batchCount;
        }
        res = true;
      }
      return res;
    }

    void finalize(ThreadPool& tp,
                  BlueStore::FSCK_ObjectCtx& ctx) {
      if (batch_acquired) {
        auto& batch = batches[last_batch_pos];
        ceph_assert(batch.running);
        batch.running.fetch_sub(1);
      }
      tp.stop();

      for (size_t i = 0; i < batchCount; i++) {
        auto& batch = batches[i];

        //process leftovers if any
        if (batch.entry_count) {
          TPHandle tp_handle(store->cct,
                             nullptr,
                             timeout_interval,
                             suicide_interval);
          ceph_assert(batch.running == 0);

          batch.running++; // just to be on-par with the regular call
          _void_process(&batch, tp_handle);
        }
        ceph_assert(batch.entry_count == 0);

        ctx.errors += batch.errors;
        ctx.warnings += batch.warnings;
        ctx.num_objects += batch.num_objects;
        ctx.num_extents += batch.num_extents;
        ctx.num_blobs += batch.num_blobs;
        ctx.num_sharded_objects += batch.num_sharded_objects;
        ctx.num_spanning_blobs += batch.num_spanning_blobs;

        ctx.expected_store_statfs.add(batch.expected_store_statfs);

        for (auto it = batch.expected_pool_statfs.begin();
             it != batch.expected_pool_statfs.end();
             it++) {
          ctx.expected_pool_statfs[it->first].add(it->second);
        }
      }
    }
  };
};
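// Illustrative sketch (not BlueStore code): the batch-claiming trick used by
// FSCKWorkQueue above.  A batch is "claimed" when fetch_add(1) on its running
// counter returns 0; any other value means someone else holds it, so the
// claim is undone.  Standalone and simplified; all names are hypothetical.
#if 0
#include <atomic>
#include <cstddef>
#include <vector>

struct Batch {
  std::atomic<size_t> running{0};
  size_t entry_count = 0;
};

// try to claim a non-empty batch, scanning at most one full round
static Batch* try_claim(std::vector<Batch>& batches, size_t start)
{
  size_t pos = start;
  do {
    Batch& b = batches[pos];
    if (b.running.fetch_add(1) == 0) {
      if (b.entry_count)
        return &b;          // claimed: caller must decrement running when done
    }
    b.running.fetch_sub(1); // not ours (or empty): undo the claim
    pos = (pos + 1) % batches.size();
  } while (pos != start);
  return nullptr;
}
#endif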
void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
  OnodeRef& o,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& warnings = ctx.warnings;
  auto repairer = ctx.repairer;

  ceph_assert(o->onode.has_omap());
  if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
    if (per_pool_omap) {
      fsck_derr(errors, MAX_FSCK_ERROR_LINES)
        << "fsck error: " << o->oid
        << " has omap that is not per-pool or pgmeta"
        << fsck_dendl;
      ++errors;
    } else {
      const char* w;
      int64_t num;
      if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
        ++errors;
        num = errors;
        w = "error";
      } else {
        ++warnings;
        num = warnings;
        w = "warning";
      }
      fsck_derr(num, MAX_FSCK_ERROR_LINES)
        << "fsck " << w << ": " << o->oid
        << " has omap that is not per-pool or pgmeta"
        << fsck_dendl;
    }
  }
  if (repairer &&
      !o->onode.is_perpool_omap() &&
      !o->onode.is_pgmeta_omap()) {
    dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
    bufferlist h;
    map<string, bufferlist> kv;
    int r = _onode_omap_get(o, &h, &kv);
    if (r < 0) {
      derr << " got " << r << " " << cpp_strerror(r) << dendl;
    } else {
      KeyValueDB::Transaction txn = db->get_transaction();
      // remove old keys
      const string& old_omap_prefix = o->get_omap_prefix();
      string old_head, old_tail;
      o->get_omap_header(&old_head);
      o->get_omap_tail(&old_tail);
      txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
      txn->rmkey(old_omap_prefix, old_tail);
      // set flag
      o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
      _record_onode(o, txn);
      const string& new_omap_prefix = o->get_omap_prefix();
      // head
      if (h.length()) {
        string new_head;
        o->get_omap_header(&new_head);
        txn->set(new_omap_prefix, new_head, h);
      }
      // tail
      {
        string new_tail;
        o->get_omap_tail(&new_tail);
        bufferlist empty;
        txn->set(new_omap_prefix, new_tail, empty);
      }
      // values
      string final_key;
      o->get_omap_key(string(), &final_key);
      size_t base_key_len = final_key.size();
      for (auto& i : kv) {
        final_key.resize(base_key_len);
        final_key += i.first;
        txn->set(new_omap_prefix, final_key, i.second);
      }
      db->submit_transaction_sync(txn);
      repairer->inc_repaired();
    }
  }
}
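// Illustrative sketch (not BlueStore code): the per-pool omap conversion
// above is essentially a key rewrite -- read all (k, v) pairs under the old
// prefix, then write them back under the new prefix with the object's base
// key prepended.  Standalone analogue over std::map "column families"; all
// names are hypothetical.
#if 0
#include <map>
#include <string>

using KV = std::map<std::string, std::string>;

static void convert_omap(KV& old_cf, KV& new_cf, const std::string& base_key)
{
  for (const auto& [k, v] : old_cf) {
    new_cf[base_key + k] = v;  // like txn->set(new_omap_prefix, final_key, ...)
  }
  old_cf.clear();              // like txn->rm_range_keys(old_omap_prefix, ...)
}
#endif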
void BlueStore::_fsck_check_objects(FSCKDepth depth,
  BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  uint64_t_btree_t used_nids;

  size_t processed_myself = 0;

  auto it = db->get_iterator(PREFIX_OBJ);
  mempool::bluestore_fsck::list<string> expecting_shards;
  if (it) {
    const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
    typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
    std::unique_ptr<WQ> wq(
      new WQ(
        "FSCKWorkQueue",
        (thread_count ? : 1) * 32,
        this,
        sb_info_lock,
        sb_info,
        repairer));

    ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);

    thread_pool.add_work_queue(wq.get());
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      //not the best place but let's check anyway
      ceph_assert(sb_info_lock);
      thread_pool.start();
    }

    //fill global if not overriden below
    CollectionRef c;
    int64_t pool_id = -1;
    spg_t pgid;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      dout(30) << __func__ << " key "
               << pretty_binary_string(it->key()) << dendl;
      if (is_extent_shard_key(it->key())) {
        if (depth == FSCK_SHALLOW) {
          continue;
        }
        while (!expecting_shards.empty() &&
               expecting_shards.front() < it->key()) {
          derr << "fsck error: missing shard key "
               << pretty_binary_string(expecting_shards.front())
               << dendl;
          ++errors;
          expecting_shards.pop_front();
        }
        if (!expecting_shards.empty() &&
            expecting_shards.front() == it->key()) {
          // all good
          expecting_shards.pop_front();
          continue;
        }

        uint32_t offset;
        string okey;
        get_key_extent_shard(it->key(), &okey, &offset);
        derr << "fsck error: stray shard 0x" << std::hex << offset
             << std::dec << dendl;
        if (expecting_shards.empty()) {
          derr << "fsck error: " << pretty_binary_string(it->key())
               << " is unexpected" << dendl;
          ++errors;
          continue;
        }
        while (expecting_shards.front() > it->key()) {
          derr << "fsck error: saw " << pretty_binary_string(it->key())
               << dendl;
          derr << "fsck error: exp "
               << pretty_binary_string(expecting_shards.front()) << dendl;
          ++errors;
          expecting_shards.pop_front();
          if (expecting_shards.empty()) {
            break;
          }
        }
        continue;
      }

      ghobject_t oid;
      int r = get_key_object(it->key(), &oid);
      if (r < 0) {
        derr << "fsck error: bad object key "
             << pretty_binary_string(it->key()) << dendl;
        ++errors;
        continue;
      }
      if (!c ||
          oid.shard_id != pgid.shard ||
          oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
          !c->contains(oid)) {
        c = nullptr;
        for (auto& p : coll_map) {
          if (p.second->contains(oid)) {
            c = p.second;
            break;
          }
        }
        if (!c) {
          derr << "fsck error: stray object " << oid
               << " not owned by any collection" << dendl;
          ++errors;
          continue;
        }
        pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
        dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
                 << dendl;
      }

      if (depth != FSCK_SHALLOW &&
          !expecting_shards.empty()) {
        for (auto& k : expecting_shards) {
          derr << "fsck error: missing shard key "
               << pretty_binary_string(k) << dendl;
        }
        errors += expecting_shards.size();
        expecting_shards.clear();
      }

      bool queued = false;
      if (depth == FSCK_SHALLOW && thread_count > 0) {
        queued = wq->queue(
          pool_id,
          c,
          oid,
          it->key(),
          it->value());
      }
      OnodeRef o;
      map<BlobRef, bluestore_blob_t::unused_t> referenced;

      if (!queued) {
        ++processed_myself;
        o = fsck_check_objects_shallow(
          depth,
          pool_id,
          c,
          oid,
          it->key(),
          it->value(),
          &expecting_shards,
          &referenced,
          ctx);
      }

      if (depth != FSCK_SHALLOW) {
        ceph_assert(o != nullptr);
        if (o->onode.nid) {
          if (o->onode.nid > nid_max) {
            derr << "fsck error: " << oid << " nid " << o->onode.nid
                 << " > nid_max " << nid_max << dendl;
            ++errors;
          }
          if (used_nids.count(o->onode.nid)) {
            derr << "fsck error: " << oid << " nid " << o->onode.nid
                 << " already in use" << dendl;
            ++errors;
            continue; // go for next object
          }
          used_nids.insert(o->onode.nid);
        }
        for (auto& i : referenced) {
          dout(20) << __func__ << " referenced 0x" << std::hex << i.second
                   << std::dec << " for " << *i.first << dendl;
          const bluestore_blob_t& blob = i.first->get_blob();
          if (i.second & blob.unused) {
            derr << "fsck error: " << oid << " blob claims unused 0x"
                 << std::hex << blob.unused
                 << " but extents reference 0x" << i.second << std::dec
                 << " on blob " << *i.first << dendl;
            ++errors;
          }
          if (blob.has_csum()) {
            uint64_t blob_len = blob.get_logical_length();
            uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
            unsigned csum_count = blob.get_csum_count();
            unsigned csum_chunk_size = blob.get_csum_chunk_size();
            for (unsigned p = 0; p < csum_count; ++p) {
              unsigned pos = p * csum_chunk_size;
              unsigned firstbit = pos / unused_chunk_size;    // [firstbit,lastbit]
              unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
              unsigned mask = 1u << firstbit;
              for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
                mask |= 1u << b;
              }
              if ((blob.unused & mask) == mask) {
                // this csum chunk region is marked unused
                if (blob.get_csum_item(p) != 0) {
                  derr << "fsck error: " << oid
                       << " blob claims csum chunk 0x" << std::hex << pos
                       << "~" << csum_chunk_size
                       << " is unused (mask 0x" << mask << " of unused 0x"
                       << blob.unused << ") but csum is non-zero 0x"
                       << blob.get_csum_item(p) << std::dec << " on blob "
                       << *i.first << dendl;
                  ++errors;
                }
              }
            }
          }
        }
        if (o->onode.has_omap()) {
          ceph_assert(ctx.used_omap_head);
          if (ctx.used_omap_head->count(o->onode.nid)) {
            derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
                 << " already in use" << dendl;
            ++errors;
          } else {
            ctx.used_omap_head->insert(o->onode.nid);
          }
        } // if (o->onode.has_omap())
        if (depth == FSCK_DEEP) {
          bufferlist bl;
          uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
          uint64_t offset = 0;
          do {
            uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
            int r = _do_read(c.get(), o, offset, l, bl,
                             CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
            if (r < 0) {
              ++errors;
              derr << "fsck error: " << oid << std::hex
                   << " error during read: "
                   << " " << offset << "~" << l
                   << " " << cpp_strerror(r) << std::dec
                   << dendl;
              break;
            }
            offset += l;
          } while (offset < o->onode.size);
        } // deep
      } //if (depth != FSCK_SHALLOW)
    } // for (it->lower_bound(string()); it->valid(); it->next())
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      wq->finalize(thread_pool, ctx);
      if (processed_myself) {
        // maybe it needs more threads?
        dout(0) << __func__ << " partial offload"
                << ", done myself " << processed_myself
                << " of " << ctx.num_objects
                << " objects, threads " << thread_count
                << dendl;
      }
    }
  } // if (it)
}
/**
An overview of the currently implemented repair logic,
performed in fsck in two stages: detection(+preparation) and commit.
Detection stage (in processing order):
  (Issue -> Repair action to schedule)
  - Detect undecodable keys for Shared Blobs -> Remove
  - Detect undecodable records for Shared Blobs -> Remove
    (might trigger missed Shared Blob detection below)
  - Detect stray records for Shared Blobs -> Remove
  - Detect misreferenced pextents -> Fix
    Prepare Bloom-like filter to track cid/oid -> pextent
    Prepare list of extents that are improperly referenced
    Enumerate Onode records that might use 'misreferenced' pextents
    (Bloom-like filter applied to reduce computation)
      Per each questionable Onode enumerate all blobs and identify broken ones
      (i.e. blobs having 'misreferences')
      Rewrite each broken blob's data by allocating other extents and
      copying data there
      If blob is shared - unshare it and mark corresponding Shared Blob
      Release previously allocated space
  - Detect missed Shared Blobs -> Recreate
  - Detect undecodable deferred transaction -> Remove
  - Detect Freelist Manager's 'false free' entries -> Mark as used
  - Detect Freelist Manager's leaked entries -> Mark as free
  - Detect statfs inconsistency -> Update
Commit stage (separate DB commit per each step):
  - Apply leaked FM entries fix
  - Apply 'false free' FM entries fix
  - Apply 'Remove' actions
  - Apply fix for misreferenced pextents
  - Apply Shared Blob recreate
    (can be merged with the step above if misreferences were detected)
  - Apply StatFS update
*/
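// Illustrative sketch (not BlueStore code): the detect-then-commit shape
// described above.  Detection only records intended repairs; the commit
// stage applies them step by step, each step standing in for a separate DB
// transaction.  Types and steps below are hypothetical simplifications of
// BlueStoreRepairer.
#if 0
#include <functional>
#include <vector>

struct SketchRepairer {
  // each scheduled fix is just a deferred action in this sketch
  std::vector<std::function<void()>> remove_actions;
  std::vector<std::function<void()>> statfs_fixes;

  // detection stage calls these to schedule work, never mutating the store
  void schedule_remove(std::function<void()> fix) {
    remove_actions.push_back(std::move(fix));
  }
  void schedule_statfs_fix(std::function<void()> fix) {
    statfs_fixes.push_back(std::move(fix));
  }

  // commit stage: one "DB commit" per step, mirroring the list above
  void commit() {
    for (auto& fix : remove_actions) fix();
    for (auto& fix : statfs_fixes) fix();
  }
};
#endif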
int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
{
  dout(1) << __func__
          << (repair ? " repair" : " check")
          << (depth == FSCK_DEEP ? " (deep)" :
              depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
          << dendl;

  // in deep mode we need R/W write access to be able to replay deferred ops
  bool read_only = !(repair || depth == FSCK_DEEP);

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    return r;
  r = _read_fsid(&fsid);
  if (r < 0)
    return r;
  r = _open_bdev(false);
  if (r < 0)
    return r;
  r = _open_db_and_around(read_only);
  if (r < 0)
    return r;

  r = _upgrade_super();
  if (r < 0)
    goto out_close;

  r = _open_collections();
  if (r < 0)
    goto out_close;

  mempool_thread.init();

  // we need finisher and kv_{sync,finalize}_thread *just* for replay
  // enable in repair or deep modes only
  r = _deferred_replay();
  if (r < 0)
    goto out_shutdown;

  r = _fsck_on_open(depth, repair);

 out_shutdown:
  mempool_thread.shutdown();

 out_close:
  _close_db_and_around(false);
  return r;
}
int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
{
  dout(1) << __func__
          << (repair ? " repair" : " check")
          << (depth == FSCK_DEEP ? " (deep)" :
              depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
          << " start" << dendl;
  int64_t errors = 0;
  int64_t warnings = 0;
  unsigned repaired = 0;

  uint64_t_btree_t used_omap_head;
  uint64_t_btree_t used_sbids;

  mempool_dynamic_bitset used_blocks;
  KeyValueDB::Iterator it;
  store_statfs_t expected_store_statfs, actual_statfs;
  per_pool_statfs expected_pool_statfs;

  sb_info_map_t sb_info;

  uint64_t num_objects = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_spanning_blobs = 0;
  uint64_t num_shared_blobs = 0;
  uint64_t num_sharded_objects = 0;
  BlueStoreRepairer repairer;

  utime_t start = ceph_clock_now();

  _fsck_collections(&errors);
  used_blocks.resize(fm->get_alloc_units());
  apply_for_bitset_range(
    0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
      bs.set(pos);
    });
  if (repair) {
    repairer.get_space_usage_tracker().init(
      bdev->get_size(),
      min_alloc_size);
  }
  if (cct->_conf->bluestore_bluefs_db_compatibility) {
    interval_set<uint64_t> bluefs_extents_db;
    bufferlist bl;
    db->get(PREFIX_SUPER, "bluefs_extents", &bl);
    auto p = bl.cbegin();
    auto prev_errors = errors;
    try {
      decode(bluefs_extents_db, p);
      bluefs_extents_db.union_of(bluefs_extents);
      bluefs_extents_db.subtract(bluefs_extents);
      if (!bluefs_extents_db.empty()) {
        derr << "fsck error: bluefs_extents inconsistency, "
             << "downgrade to previous releases might be broken."
             << dendl;
        ++errors;
      }
    }
    catch (buffer::error& e) {
      derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
      ++errors;
    }
    if (errors != prev_errors && repair) {
      repairer.fix_bluefs_extents(out_of_sync_fm);
    }
  }

  if (bluefs) {
    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply_for_bitset_range(
        e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          bs.set(pos);
        });
    }
    int r = bluefs->fsck();
    if (r < 0) {
      return r;
    }
    if (r > 0)
      errors += r;
  }
  if (!per_pool_stat_collection) {
    const char *w;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
      w = "error";
      ++errors;
    } else {
      w = "warning";
      ++warnings;
    }
    derr << "fsck " << w << ": store not yet converted to per-pool stats"
         << dendl;
  }
  if (!per_pool_omap) {
    const char *w;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
      w = "error";
      ++errors;
    } else {
      w = "warning";
      ++warnings;
    }
    derr << "fsck " << w << ": store not yet converted to per-pool omap"
         << dendl;
  }

  // get expected statfs; reset unaffected fields to be able to compare
  statfs(&actual_statfs);
  actual_statfs.total = 0;
  actual_statfs.internally_reserved = 0;
  actual_statfs.available = 0;
  actual_statfs.internal_metadata = 0;
  actual_statfs.omap_allocated = 0;

  if (g_conf()->bluestore_debug_fsck_abort) {
    dout(1) << __func__ << " debug abort" << dendl;
    goto out_scan;
  }
  dout(1) << __func__ << " walking object keyspace" << dendl;
  {
    ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
    BlueStore::FSCK_ObjectCtx ctx(
      errors,
      warnings,
      num_objects,
      num_extents,
      num_blobs,
      num_sharded_objects,
      num_spanning_blobs,
      &used_blocks,
      &used_omap_head,
      //no need for the below lock when in non-shallow mode as
      // there is no multithreading in this case
      depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
      sb_info,
      expected_store_statfs,
      expected_pool_statfs,
      repair ? &repairer : nullptr);

    _fsck_check_objects(depth, ctx);
  }
  dout(1) << __func__ << " checking shared_blobs" << dendl;
  it = db->get_iterator(PREFIX_SHARED_BLOB);
  if (it) {
    // FIXME minor: perhaps simplify for shallow mode?
    // fill global if not overriden below
    auto expected_statfs = &expected_store_statfs;

    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      uint64_t sbid;
      if (get_key_shared_blob(key, &sbid)) {
        derr << "fsck error: bad key '" << key
             << "' in shared blob namespace" << dendl;
        if (repair) {
          repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
        }
        ++errors;
        continue;
      }
      auto p = sb_info.find(sbid);
      if (p == sb_info.end()) {
        derr << "fsck error: found stray shared blob data for sbid 0x"
             << std::hex << sbid << std::dec << dendl;
        if (repair) {
          repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
        }
        ++errors;
      } else {
        ++num_shared_blobs;
        sb_info_t& sbi = p->second;
        bluestore_shared_blob_t shared_blob(sbid);
        bufferlist bl = it->value();
        auto blp = bl.cbegin();
        try {
          decode(shared_blob, blp);
        } catch (buffer::error& e) {
          ++errors;
          // Force update and don't report as missing
          sbi.updated = sbi.passed = true;

          derr << "fsck error: failed to decode Shared Blob"
               << pretty_binary_string(it->key()) << dendl;
          if (repair) {
            dout(20) << __func__ << " undecodable Shared Blob, key:'"
                     << pretty_binary_string(it->key())
                     << "', removing" << dendl;
            repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
          }
          continue;
        }
        dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
        if (shared_blob.ref_map != sbi.ref_map) {
          derr << "fsck error: shared blob 0x" << std::hex << sbid
               << std::dec << " ref_map " << shared_blob.ref_map
               << " != expected " << sbi.ref_map << dendl;
          sbi.updated = true; // will update later in repair mode only!
          ++errors;
        }
        PExtentVector extents;
        for (auto &r : shared_blob.ref_map.ref_map) {
          extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
        }
        if (per_pool_stat_collection || repair) {
          expected_statfs = &expected_pool_statfs[sbi.pool_id];
        }
        errors += _fsck_check_extents(sbi.cid,
                                      p->second.oids.front(),
                                      extents,
                                      p->second.compressed,
                                      used_blocks,
                                      fm->get_alloc_size(),
                                      repair ? &repairer : nullptr,
                                      *expected_statfs,
                                      depth);
        sbi.passed = true;
      }
    }
  } // if (it)
  if (repair && repairer.preprocess_misreference(db)) {

    dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
    auto& space_tracker = repairer.get_space_usage_tracker();
    auto& misref_extents = repairer.get_misreferences();
    interval_set<uint64_t> to_release;
    it = db->get_iterator(PREFIX_OBJ);
    if (it) {
      // fill global if not overriden below
      auto expected_statfs = &expected_store_statfs;

      CollectionRef c;
      spg_t pgid;
      KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
      bool bypass_rest = false;
      for (it->lower_bound(string()); it->valid() && !bypass_rest;
           it->next()) {
        dout(30) << __func__ << " key "
                 << pretty_binary_string(it->key()) << dendl;
        if (is_extent_shard_key(it->key())) {
          continue;
        }

        ghobject_t oid;
        int r = get_key_object(it->key(), &oid);
        if (r < 0 || !space_tracker.is_used(oid)) {
          continue;
        }

        if (!c ||
            oid.shard_id != pgid.shard ||
            oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
            !c->contains(oid)) {
          c = nullptr;
          for (auto& p : coll_map) {
            if (p.second->contains(oid)) {
              c = p.second;
              break;
            }
          }
          if (!c) {
            continue;
          }
          if (per_pool_stat_collection || repair) {
            auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
            expected_statfs = &expected_pool_statfs[pool_id];
          }
        }
        if (!space_tracker.is_used(c->cid)) {
          continue;
        }

        dout(20) << __func__ << " check misreference for col:" << c->cid
                 << " obj:" << oid << dendl;

        OnodeRef o;
        o.reset(Onode::decode(c, oid, it->key(), it->value()));
        o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
        mempool::bluestore_fsck::set<BlobRef> blobs;

        for (auto& e : o->extent_map.extent_map) {
          blobs.insert(e.blob);
        }
        bool need_onode_update = false;
        bool first_dump = true;
        for(auto b : blobs) {
          bool broken_blob = false;
          auto& pextents = b->dirty_blob().dirty_extents();
          for (auto& e : pextents) {
            if (!e.is_valid()) {
              continue;
            }
            // for the sake of simplicity and proper shared blob handling
            // always rewrite the whole blob even when it's partially
            // misreferenced.
            if (misref_extents.intersects(e.offset, e.length)) {
              if (first_dump) {
                first_dump = false;
                _dump_onode<10>(cct, *o);
              }
              broken_blob = true;
              break;
            }
          }
          if (!broken_blob)
            continue;
          bool compressed = b->get_blob().is_compressed();
          need_onode_update = true;
          dout(10) << __func__
                   << " fix misreferences in oid:" << oid
                   << " " << *b << dendl;
          uint64_t b_off = 0;
          PExtentVector pext_to_release;
          pext_to_release.reserve(pextents.size());
          // rewriting all valid pextents
          for (auto e = pextents.begin(); e != pextents.end();
               b_off += e->length, e++) {
            if (!e->is_valid()) {
              continue;
            }
            PExtentVector exts;
            int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
                                                0, 0, &exts);
            if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
              derr << __func__
                   << " failed to allocate 0x" << std::hex << e->length
                   << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
                   << " min_alloc_size 0x" << min_alloc_size
                   << " available 0x " << alloc->get_free()
                   << std::dec << dendl;
              if (alloc_len > 0) {
                alloc->release(exts);
              }
              bypass_rest = true;
              break;
            }
            expected_statfs->allocated += e->length;
            if (compressed) {
              expected_statfs->data_compressed_allocated += e->length;
            }

            bufferlist bl;
            IOContext ioc(cct, NULL, true); // allow EIO
            r = bdev->read(e->offset, e->length, &bl, &ioc, false);
            if (r < 0) {
              derr << __func__ << " failed to read from 0x" << std::hex << e->offset
                   <<"~" << e->length << std::dec << dendl;
              ceph_abort_msg("read failed, wtf");
            }
            pext_to_release.push_back(*e);
            e = pextents.erase(e);
            e = pextents.insert(e, exts.begin(), exts.end());
            b->get_blob().map_bl(
              b_off, bl,
              [&](uint64_t offset, bufferlist& t) {
                int r = bdev->write(offset, t, false);
                ceph_assert(r == 0);
              });
            e += exts.size() - 1;
            for (auto& p : exts) {
              fm->allocate(p.offset, p.length, txn);
            }
          } // for (auto e = pextents.begin(); e != pextents.end(); e++) {

          if (b->get_blob().is_shared()) {
            b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);

            auto sb_it = sb_info.find(b->shared_blob->get_sbid());
            ceph_assert(sb_it != sb_info.end());
            sb_info_t& sbi = sb_it->second;

            for (auto& r : sbi.ref_map.ref_map) {
              expected_statfs->allocated -= r.second.length;
              if (sbi.compressed) {
                // NB: it's crucial to use compressed flag from sb_info_t
                // as we originally used that value while accumulating
                // expected_statfs
                expected_statfs->data_compressed_allocated -= r.second.length;
              }
            }
            sbi.updated = sbi.passed = true;
            sbi.ref_map.clear();

            // relying on blob's pextents to decide what to release.
            for (auto& p : pext_to_release) {
              to_release.union_insert(p.offset, p.length);
            }
          } else {
            for (auto& p : pext_to_release) {
              expected_statfs->allocated -= p.length;
              if (compressed) {
                expected_statfs->data_compressed_allocated -= p.length;
              }
              to_release.union_insert(p.offset, p.length);
            }
          }
        } // for(auto b : blobs)
        if (need_onode_update) {
          o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
          _record_onode(o, txn);
        }
      } // for (it->lower_bound(string()); it->valid(); it->next())

      for (auto it = to_release.begin(); it != to_release.end(); ++it) {
        dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
                 << "~" << it.get_len() << std::dec << dendl;
        fm->release(it.get_start(), it.get_len(), txn);
      }
      alloc->release(to_release);
      to_release.clear();
    } // if (it)
  } //if (repair && repairer.preprocess_misreference()) {
9008 if (depth
!= FSCK_SHALLOW
) {
9009 for (auto &p
: sb_info
) {
9010 sb_info_t
& sbi
= p
.second
;
9012 derr
<< "fsck error: missing " << *sbi
.sb
<< dendl
;
9015 if (repair
&& (!sbi
.passed
|| sbi
.updated
)) {
9016 auto sbid
= p
.first
;
9017 if (sbi
.ref_map
.empty()) {
9018 ceph_assert(sbi
.passed
);
9019 dout(20) << __func__
<< " " << *sbi
.sb
9020 << " is empty, removing" << dendl
;
9021 repairer
.fix_shared_blob(db
, sbid
, nullptr);
9024 bluestore_shared_blob_t
persistent(sbid
, std::move(sbi
.ref_map
));
9025 encode(persistent
, bl
);
9026 dout(20) << __func__
<< " " << *sbi
.sb
9027 << " is " << bl
.length() << " bytes, updating" << dendl
;
9029 repairer
.fix_shared_blob(db
, sbid
, &bl
);
9036 // check global stats only if fscking (not repairing) w/o per-pool stats
9037 if (!per_pool_stat_collection
&&
9039 !(actual_statfs
== expected_store_statfs
)) {
9040 derr
<< "fsck error: actual " << actual_statfs
9041 << " != expected " << expected_store_statfs
<< dendl
;
9043 repairer
.fix_statfs(db
, BLUESTORE_GLOBAL_STATFS_KEY
,
9044 expected_store_statfs
);
9049 dout(1) << __func__
<< " checking pool_statfs" << dendl
;
9050 _fsck_check_pool_statfs(expected_pool_statfs
,
9051 errors
, warnings
, repair
? &repairer
: nullptr);
9053 if (depth
!= FSCK_SHALLOW
) {
9054 dout(1) << __func__
<< " checking for stray omap data " << dendl
;
9055 it
= db
->get_iterator(PREFIX_OMAP
);
9057 uint64_t last_omap_head
= 0;
9058 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
9060 _key_decode_u64(it
->key().c_str(), &omap_head
);
9061 if (used_omap_head
.count(omap_head
) == 0 &&
9062 omap_head
!= last_omap_head
) {
9063 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
9064 << "fsck error: found stray omap data on omap_head "
9065 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
)<< fsck_dendl
;
9067 last_omap_head
= omap_head
;
9071 it
= db
->get_iterator(PREFIX_PGMETA_OMAP
);
9073 uint64_t last_omap_head
= 0;
9074 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
9076 _key_decode_u64(it
->key().c_str(), &omap_head
);
9077 if (used_omap_head
.count(omap_head
) == 0 &&
9078 omap_head
!= last_omap_head
) {
9079 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
9080 << "fsck error: found stray (pgmeta) omap data on omap_head "
9081 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
) << fsck_dendl
;
9082 last_omap_head
= omap_head
;
  it = db->get_iterator(PREFIX_PERPOOL_OMAP);
  if (it) {
    uint64_t last_omap_head = 0;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      uint64_t pool;
      uint64_t omap_head;
      string k = it->key();
      const char *c = k.c_str();
      c = _key_decode_u64(c, &pool);
      c = _key_decode_u64(c, &omap_head);
      if (used_omap_head.count(omap_head) == 0 &&
	  omap_head != last_omap_head) {
	fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	  << "fsck error: found stray (per-pool) omap data on omap_head "
	  << omap_head << " " << last_omap_head << " "
	  << used_omap_head.count(omap_head) << fsck_dendl;
	last_omap_head = omap_head;
      }
    }
  }
  dout(1) << __func__ << " checking deferred events" << dendl;
  it = db->get_iterator(PREFIX_DEFERRED);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      bluestore_deferred_transaction_t wt;
      try {
	decode(wt, p);
      } catch (buffer::error& e) {
	derr << "fsck error: failed to decode deferred txn "
	     << pretty_binary_string(it->key()) << dendl;
	if (repair) {
	  dout(20) << __func__ << " undecodable deferred TXN record, key: '"
		   << pretty_binary_string(it->key())
		   << "', removing" << dendl;
	  repairer.remove_key(db, PREFIX_DEFERRED, it->key());
	}
	continue;
      }
      dout(20) << __func__ << " deferred " << wt.seq
	       << " ops " << wt.ops.size()
	       << " released 0x" << std::hex << wt.released << std::dec
	       << dendl;
      for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
	apply_for_bitset_range(
	  e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
	  [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	    bs.set(pos);
	  });
      }
    }
  }
  dout(1) << __func__ << " checking freelist vs allocated" << dendl;
  {
    // remove bluefs_extents from used set since the freelist doesn't
    // know they are allocated.
    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply_for_bitset_range(
	e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  bs.reset(pos);
	});
    }
    fm->enumerate_reset();
    uint64_t offset, length;
    while (fm->enumerate_next(db, &offset, &length)) {
      bool intersects = false;
      apply_for_bitset_range(
	offset, length, fm->get_alloc_size(), used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  if (bs.test(pos)) {
	    if (offset == SUPER_RESERVED &&
		length == min_alloc_size - SUPER_RESERVED) {
	      // this is due to the change just after luminous to min_alloc_size
	      // granularity allocations, and our baked in assumption at the top
	      // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
	      // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
	      // since we will never allocate this region below min_alloc_size.
	      dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
		       << " and min_alloc_size, 0x" << std::hex << offset << "~"
		       << length << std::dec << dendl;
	    } else {
	      intersects = true;
	      if (repair) {
		repairer.fix_false_free(db, fm,
					pos * min_alloc_size,
					min_alloc_size);
	      }
	    }
	  } else {
	    bs.set(pos);
	  }
	});
      if (intersects) {
	derr << "fsck error: free extent 0x" << std::hex << offset
	     << "~" << length << std::dec
	     << " intersects allocated blocks" << dendl;
	++errors;
      }
    }
    fm->enumerate_reset();
    size_t count = used_blocks.count();
    if (used_blocks.size() != count) {
      ceph_assert(used_blocks.size() > count);
      used_blocks.flip();
      size_t start = used_blocks.find_first();
      while (start != decltype(used_blocks)::npos) {
	size_t cur = start;
	while (true) {
	  size_t next = used_blocks.find_next(cur);
	  if (next != cur + 1) {
	    ++errors;
	    derr << "fsck error: leaked extent 0x" << std::hex
		 << ((uint64_t)start * fm->get_alloc_size()) << "~"
		 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
		 << dendl;
	    if (repair) {
	      repairer.fix_leaked(db, fm,
				  start * min_alloc_size,
				  (cur + 1 - start) * min_alloc_size);
	    }
	    start = next;
	    break;
	  }
	  cur = next;
	}
      }
      used_blocks.flip();
    }
  }
}
if (repair) {
  if (!per_pool_omap) {
    dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
    repairer.fix_per_pool_omap(db);
  }
  dout(5) << __func__ << " applying repair results" << dendl;
  repaired = repairer.apply(db);
  dout(5) << __func__ << " repair applied" << dendl;
}
dout(2) << __func__ << " " << num_objects << " objects, "
	<< num_sharded_objects << " of them sharded. " << dendl;
dout(2) << __func__ << " " << num_extents << " extents to "
	<< num_blobs << " blobs, "
	<< num_spanning_blobs << " spanning, "
	<< num_shared_blobs << " shared." << dendl;

utime_t duration = ceph_clock_now() - start;
dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
	<< warnings << " warnings, "
	<< repaired << " repaired, "
	<< (errors + warnings - (int)repaired) << " remaining in "
	<< duration << " seconds" << dendl;

// In non-repair mode we should return the error count only, as it
// indicates whether the store status is OK.
// In repair mode both errors and warnings are taken into account,
// since the repaired counter relates to them both.
return repair ? errors + warnings - (int)repaired : errors;
}
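// Editor's worked example (illustration only, not part of the original
// source): a repair run that found errors=3 and warnings=2 and managed to
// repair 4 of them returns 3 + 2 - 4 = 1 (one issue still outstanding),
// while a plain fsck with the same findings returns just errors=3.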
/// methods to inject various errors fsck can repair
void BlueStore::inject_broken_shared_blob_key(const string& key,
					      const bufferlist& bl)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->set(PREFIX_SHARED_BLOB, key, bl);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_leaked(uint64_t len)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  PExtentVector exts;
  int64_t alloc_len = alloc->allocate(len, min_alloc_size,
				      min_alloc_size * 256, 0, &exts);
  ceph_assert(alloc_len >= (int64_t)len);
  for (auto& p : exts) {
    fm->allocate(p.offset, p.length, txn);
  }
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
{
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{c->lock}; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  }

  bool injected = false;
  txn = db->get_transaction();
  auto& em = o->extent_map.extent_map;
  std::vector<const PExtentVector*> v;
  if (em.size()) {
    v.push_back(&em.begin()->blob->get_blob().get_extents());
  }
  if (em.size() > 1) {
    auto it = em.end();
    --it;
    v.push_back(&(it->blob->get_blob().get_extents()));
  }
  for (auto pext : v) {
    if (pext->size()) {
      auto p = pext->begin();
      while (p != pext->end()) {
	if (p->is_valid()) {
	  dout(20) << __func__ << " release 0x" << std::hex << p->offset
		   << "~" << p->length << std::dec << dendl;
	  fm->release(p->offset, p->length, txn);
	  injected = true;
	  break;
	}
	++p;
      }
    }
  }
  ceph_assert(injected);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_legacy_omap()
{
  dout(1) << __func__ << dendl;
  per_pool_omap = false;
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->rmkey(PREFIX_SUPER, "per_pool_omap");
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
{
  dout(1) << __func__ << " "
	  << cid << " " << oid << dendl;
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{ c->lock }; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
  }
  o->onode.clear_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP |
		      bluestore_onode_t::FLAG_PGMETA_OMAP);
  txn = db->get_transaction();
  _record_onode(o, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
{
  BlueStoreRepairer repairer;
  repairer.fix_statfs(db, key, new_statfs);
  repairer.apply(db);
}
void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
{
  KeyValueDB::Transaction t = db->get_transaction();
  volatile_statfs v;
  v = new_statfs;
  bufferlist bl;
  v.encode(bl);
  t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
  db->submit_transaction_sync(t);
}
void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
				    coll_t cid2, ghobject_t oid2,
				    uint64_t offset)
{
  OnodeRef o1;
  CollectionRef c1 = _get_collection(cid1);
  ceph_assert(c1);
  {
    std::unique_lock l{c1->lock}; // just to avoid internal asserts
    o1 = c1->get_onode(oid1, false);
    ceph_assert(o1);
    o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
  }
  OnodeRef o2;
  CollectionRef c2 = _get_collection(cid2);
  ceph_assert(c2);
  {
    std::unique_lock l{c2->lock}; // just to avoid internal asserts
    o2 = c2->get_onode(oid2, false);
    ceph_assert(o2);
    o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
  }
  Extent& e1 = *(o1->extent_map.seek_lextent(offset));
  Extent& e2 = *(o2->extent_map.seek_lextent(offset));

  // require onode/extent layout to be the same (and simple)
  // to make things easier
  ceph_assert(o1->onode.extent_map_shards.empty());
  ceph_assert(o2->onode.extent_map_shards.empty());
  ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
  ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
  ceph_assert(e1.logical_offset == e2.logical_offset);
  ceph_assert(e1.length == e2.length);
  ceph_assert(e1.blob_offset == e2.blob_offset);

  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  // along with misreference error this will create space leaks errors
  e2.blob->dirty_blob() = e1.blob->get_blob();
  o2->extent_map.dirty_range(offset, e2.length);
  o2->extent_map.update(txn, false);

  _record_onode(o2, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
					    int16_t blob_id)
{
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{ c->lock }; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  }

  BlobRef b = c->new_blob();
  b->id = blob_id;
  o->extent_map.spanning_blob_map[blob_id] = b;

  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  _record_onode(o, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::collect_metadata(map<string,string> *pm)
{
  dout(10) << __func__ << dendl;
  bdev->collect_metadata("bluestore_bdev_", pm);
  if (bluefs) {
    (*pm)["bluefs"] = "1";
    // this value is for backward compatibility only
    (*pm)["bluefs_single_shared_device"] =
      stringify((int)bluefs_layout.single_shared_device());
    (*pm)["bluefs_dedicated_db"] =
      stringify((int)bluefs_layout.dedicated_db);
    (*pm)["bluefs_dedicated_wal"] =
      stringify((int)bluefs_layout.dedicated_wal);
    bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
  } else {
    (*pm)["bluefs"] = "0";
  }

  // report numa mapping for underlying devices
  int node = -1;
  set<int> nodes;
  set<string> failed;
  int r = get_numa_node(&node, &nodes, &failed);
  if (r >= 0) {
    if (!failed.empty()) {
      (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
    }
    if (!nodes.empty()) {
      dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
      (*pm)["objectstore_numa_nodes"] = stringify(nodes);
    }
    if (node >= 0) {
      (*pm)["objectstore_numa_node"] = stringify(node);
    }
  }
}
int BlueStore::get_numa_node(
  int *final_node,
  set<int> *out_nodes,
  set<string> *out_failed)
{
  int node = -1;
  set<string> devices;
  get_devices(&devices);
  set<int> nodes;
  set<string> failed;
  for (auto& devname : devices) {
    int n;
    BlkDev bdev(devname);
    int r = bdev.get_numa_node(&n);
    if (r < 0) {
      dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
	       << dendl;
      failed.insert(devname);
      continue;
    }
    dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
	     << dendl;
    nodes.insert(n);
    if (node < 0) {
      node = n;
    }
  }
  if (node >= 0 && nodes.size() == 1 && failed.empty()) {
    *final_node = node;
  }
  if (out_nodes) {
    *out_nodes = nodes;
  }
  if (out_failed) {
    *out_failed = failed;
  }
  return 0;
}
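// Editor's note on the semantics above (illustration, hypothetical devices):
// *final_node is only populated when every device sits on one and the same
// NUMA node and none failed detection. Two NVMe drives both on node 0 yield
// final_node = 0; drives spread over nodes {0,1} leave final_node untouched
// and only report the full set via *out_nodes.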
int BlueStore::get_devices(set<string> *ls)
{
  if (bdev) {
    bdev->get_devices(ls);
    if (bluefs) {
      bluefs->get_devices(ls);
    }
    return 0;
  }

  // grumble, we haven't started up yet.
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  r = _minimal_open_bluefs(false);
  if (r < 0)
    goto out_bdev;
  bdev->get_devices(ls);
  if (bluefs) {
    bluefs->get_devices(ls);
  }
  _minimal_close_bluefs();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return 0;
}
void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
{
  buf->reset();

  buf->omap_allocated =
    db->estimate_prefix_size(PREFIX_OMAP, string()) +
    db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());

  uint64_t bfree = alloc->get_free();

  if (bluefs) {
    int64_t bluefs_total = bluefs->get_total(bluefs_layout.shared_bdev);
    int64_t bluefs_free = bluefs->get_free(bluefs_layout.shared_bdev);
    // part of our shared device is "free" according to BlueFS, but we
    // can't touch bluestore_bluefs_min of it.
    int64_t shared_available = std::min(
      bluefs_free,
      int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
    buf->internally_reserved = bluefs_total - shared_available;
    if (shared_available > 0) {
      bfree += shared_available;
    }
    // include dedicated db, too, if that isn't the shared device.
    if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
      buf->total += bluefs->get_total(BlueFS::BDEV_DB);
    }
    // call any non-omap bluefs space "internal metadata"
    buf->internal_metadata =
      std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
      - buf->omap_allocated;
  }

  uint64_t thin_total, thin_avail;
  if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
    buf->total += thin_total;

    // we are limited by both the size of the virtual device and the
    // underlying physical device.
    bfree = std::min(bfree, thin_avail);

    buf->allocated = thin_total - thin_avail;
  } else {
    buf->total += bdev->get_size();
  }
  buf->available = bfree;
}
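// Editor's accounting sketch (illustration with hypothetical numbers): on a
// single shared device, total comes from the bdev size, available is the
// allocator's free space plus any BlueFS free space above the
// bluestore_bluefs_min floor, and that withheld floor surfaces as
// internally_reserved rather than as available capacity.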
int BlueStore::statfs(struct store_statfs_t *buf,
		      osd_alert_list_t* alerts)
{
  if (alerts) {
    alerts->clear();
    _log_alerts(*alerts);
  }
  _get_statfs_overall(buf);
  {
    std::lock_guard l(vstatfs_lock);
    buf->allocated = vstatfs.allocated();
    buf->data_stored = vstatfs.stored();
    buf->data_compressed = vstatfs.compressed();
    buf->data_compressed_original = vstatfs.compressed_original();
    buf->data_compressed_allocated = vstatfs.compressed_allocated();
  }

  dout(20) << __func__ << " " << *buf << dendl;
  return 0;
}
int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
			   bool *out_per_pool_omap)
{
  dout(20) << __func__ << " pool " << pool_id << dendl;

  if (!per_pool_stat_collection) {
    dout(20) << __func__ << " not supported in legacy mode " << dendl;
    return -ENOTSUP;
  }
  buf->reset();

  {
    std::lock_guard l(vstatfs_lock);
    osd_pools[pool_id].publish(buf);
  }

  string key_prefix;
  _key_encode_u64(pool_id, &key_prefix);
  buf->omap_allocated = db->estimate_prefix_size(PREFIX_PERPOOL_OMAP,
						 key_prefix);
  *out_per_pool_omap = per_pool_omap;

  dout(10) << __func__ << *buf << dendl;
  return 0;
}
void BlueStore::_check_legacy_statfs_alert()
{
  string s;
  if (!per_pool_stat_collection &&
      cct->_conf->bluestore_warn_on_legacy_statfs) {
    s = "legacy statfs reporting detected, "
        "suggest to run store repair to get consistent statistic reports";
  }
  std::lock_guard l(qlock);
  legacy_statfs_alert = s;
}
void BlueStore::_check_no_per_pool_omap_alert()
{
  string s;
  if (!per_pool_omap &&
      cct->_conf->bluestore_warn_on_no_per_pool_omap) {
    s = "legacy (not per-pool) omap detected, "
        "suggest to run store repair to measure per-pool omap usage";
  }
  std::lock_guard l(qlock);
  no_per_pool_omap_alert = s;
}
BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
{
  std::shared_lock l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}
void BlueStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
  // _reap_collections and this run in the same thread,
  // so no lock is needed.
  removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
  list<CollectionRef> removed_colls;
  {
    // _queue_reap_collection and this run in the same thread,
    // so no lock is needed.
    if (!removed_collections.empty())
      removed_colls.swap(removed_collections);
    else
      return;
  }

  list<CollectionRef>::iterator p = removed_colls.begin();
  while (p != removed_colls.end()) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
    if (c->onode_map.map_any([&](Onode* o) {
	  ceph_assert(!o->exists);
	  if (o->flushing_count.load()) {
	    dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
		     << " flush_txns " << o->flushing_count << dendl;
	    return true;
	  }
	  return false;
	})) {
      ++p;
      continue;
    }
    c->onode_map.clear();
    p = removed_colls.erase(p);
    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
  }
  if (removed_colls.empty()) {
    dout(10) << __func__ << " all reaped" << dendl;
  } else {
    removed_collections.splice(removed_collections.begin(), removed_colls);
  }
}
void BlueStore::_update_cache_logger()
{
  uint64_t num_onodes = 0;
  uint64_t num_pinned_onodes = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_buffers = 0;
  uint64_t num_buffer_bytes = 0;
  for (auto c : onode_cache_shards) {
    c->add_stats(&num_onodes, &num_pinned_onodes);
  }
  for (auto c : buffer_cache_shards) {
    c->add_stats(&num_extents, &num_blobs,
		 &num_buffers, &num_buffer_bytes);
  }
  logger->set(l_bluestore_onodes, num_onodes);
  logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
  logger->set(l_bluestore_extents, num_extents);
  logger->set(l_bluestore_blobs, num_blobs);
  logger->set(l_bluestore_buffers, num_buffers);
  logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
}
ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
{
  return _get_collection(cid);
}
ObjectStore::CollectionHandle BlueStore::create_new_collection(
  const coll_t& cid)
{
  std::unique_lock l{coll_lock};
  auto c = ceph::make_ref<Collection>(
    this,
    onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
    buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
    cid);
  new_coll_map[cid] = c;
  _osr_attach(c.get());
  return c;
}
void BlueStore::set_collection_commit_queue(
  const coll_t& cid,
  ContextQueue *commit_queue)
{
  if (commit_queue) {
    std::shared_lock l(coll_lock);
    if (coll_map.count(cid)) {
      coll_map[cid]->commit_queue = commit_queue;
    } else if (new_coll_map.count(cid)) {
      new_coll_map[cid]->commit_queue = commit_queue;
    }
  }
}
bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return false;

  bool r = true;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      r = false;
  }

  return r;
}
int BlueStore::stat(
  CollectionHandle &c_,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      return -ENOENT;
    st->st_size = o->onode.size;
    st->st_blksize = 4096;
    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
    st->st_nlink = 1;
  }

  int r = 0;
  if (_debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  return r;
}
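// Editor's worked example of the st_blocks rounding above (illustration
// only): a 10000-byte object with st_blksize 4096 reports
// (10000 + 4095) / 4096 = 3 blocks.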
int BlueStore::set_collection_opts(
  CollectionHandle& ch,
  const pool_opts_t& opts)
{
  Collection *c = static_cast<Collection *>(ch.get());
  dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
  if (!c->exists)
    return -ENOENT;
  std::unique_lock l{c->lock};
  c->pool_opts = opts;
  return 0;
}
int BlueStore::read(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (offset == length && offset == 0)
      length = o->onode.size;

    r = _do_read(c, o, offset, length, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
	     cct->_conf->bluestore_debug_random_read_err &&
	     (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
			     100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
void BlueStore::_read_cache(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  int read_cache_policy,
  ready_regions_t& ready_regions,
  blobs2read_t& blobs2read)
{
  // build a blob-wise list of the stuff to read (that isn't cached)
  unsigned left = length;
  uint64_t pos = offset;
  auto lp = o->extent_map.seek_lextent(offset);
  while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
	break;
      }
      dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
	       << std::dec << dendl;
      pos += hole;
      left -= hole;
    }
    BlobRef& bptr = lp->blob;
    unsigned l_off = pos - lp->logical_offset;
    unsigned b_off = l_off + lp->blob_offset;
    unsigned b_len = std::min(left, lp->length - l_off);

    ready_regions_t cache_res;
    interval_set<uint32_t> cache_interval;
    bptr->shared_blob->bc.read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
      read_cache_policy);
    dout(20) << __func__ << " blob " << *bptr << std::hex
	     << " need 0x" << b_off << "~" << b_len
	     << " cache has 0x" << cache_interval
	     << std::dec << dendl;

    auto pc = cache_res.begin();
    uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
	  pc->first == b_off) {
	l = pc->second.length();
	ready_regions[pos].claim(pc->second);
	dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
		 << b_off << "~" << l << std::dec << dendl;
	++pc;
      } else {
	l = b_len;
	if (pc != cache_res.end()) {
	  ceph_assert(pc->first > b_off);
	  l = pc->first - b_off;
	}
	dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
		 << b_off << "~" << l << std::dec << dendl;
	// align the miss to chunk boundaries and merge with the previous
	// request when they touch
	{
	  uint64_t r_off = b_off;
	  uint64_t r_len = l;
	  uint64_t front = r_off % chunk_size;
	  if (front) {
	    r_off -= front;
	    r_len += front;
	  }
	  unsigned tail = r_len % chunk_size;
	  if (tail) {
	    r_len += chunk_size - tail;
	  }
	  bool merged = false;
	  regions2read_t& r2r = blobs2read[bptr];
	  if (r2r.size()) {
	    read_req_t& pre = r2r.back();
	    if (r_off <= (pre.r_off + pre.r_len)) {
	      front += (r_off - pre.r_off);
	      pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
	      pre.regs.emplace_back(region_t(pos, b_off, l, front));
	      merged = true;
	    }
	  }
	  if (!merged) {
	    read_req_t req(r_off, r_len);
	    req.regs.emplace_back(region_t(pos, b_off, l, front));
	    r2r.emplace_back(std::move(req));
	  }
	}
      }
      pos += l;
      b_off += l;
      left -= l;
      b_len -= l;
    }
    ++lp;
  }
}
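// Editor's worked example of the alignment above (illustration, hypothetical
// numbers): with chunk_size = 4096, a cache miss at b_off 5000 of length 1000
// becomes r_off 4096 (front = 904) and r_len is padded from 1904 up to 4096,
// so a single aligned 4096-byte disk read covers the requested region.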
int BlueStore::_prepare_read_ioc(
  blobs2read_t& blobs2read,
  vector<bufferlist>* compressed_blob_bls,
  IOContext* ioc)
{
  for (auto& p : blobs2read) {
    const BlobRef& bptr = p.first;
    regions2read_t& r2r = p.second;
    dout(20) << __func__ << " blob " << *bptr << std::hex
	     << " need " << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
      if (compressed_blob_bls->empty()) {
	// ensure we avoid any reallocation on subsequent blobs
	compressed_blob_bls->reserve(blobs2read.size());
      }
      compressed_blob_bls->push_back(bufferlist());
      bufferlist& bl = compressed_blob_bls->back();
      auto r = bptr->get_blob().map(
	0, bptr->get_blob().get_ondisk_length(),
	[&](uint64_t offset, uint64_t length) {
	  int r = bdev->aio_read(offset, length, &bl, ioc);
	  if (r < 0)
	    return r;
	  return 0;
	});
      if (r < 0) {
	derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
	if (r == -EIO) {
	  // propagate EIO to caller
	  return r;
	}
	ceph_assert(r == 0);
      }
    } else {
      // read the pieces
      for (auto& req : r2r) {
	dout(20) << __func__ << " region 0x" << std::hex
		 << req.regs.front().logical_offset
		 << ": 0x" << req.regs.front().blob_xoffset
		 << " reading 0x" << req.r_off
		 << "~" << req.r_len << std::dec << dendl;

	auto r = bptr->get_blob().map(
	  req.r_off, req.r_len,
	  [&](uint64_t offset, uint64_t length) {
	    int r = bdev->aio_read(offset, length, &req.bl, ioc);
	    if (r < 0)
	      return r;
	    return 0;
	  });
	if (r < 0) {
	  derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
	       << dendl;
	  if (r == -EIO) {
	    // propagate EIO to caller
	    return r;
	  }
	  ceph_assert(r == 0);
	}
	ceph_assert(req.bl.length() == req.r_len);
      }
    }
  }
  return 0;
}
int BlueStore::_generate_read_result_bl(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  ready_regions_t& ready_regions,
  vector<bufferlist>& compressed_blob_bls,
  blobs2read_t& blobs2read,
  bool buffered,
  bool* csum_error,
  bufferlist& bl)
{
  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    const BlobRef& bptr = b2r_it->first;
    regions2read_t& r2r = b2r_it->second;
    dout(20) << __func__ << " blob " << *bptr << std::hex
	     << " need 0x" << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      ceph_assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
		       r2r.front().regs.front().logical_offset) < 0) {
	*csum_error = true;
	return -EIO;
      }
      bufferlist raw_bl;
      auto r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
	return r;
      if (buffered) {
	bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
				       raw_bl);
      }
      for (auto& req : r2r) {
	for (auto& r : req.regs) {
	  ready_regions[r.logical_offset].substr_of(
	    raw_bl, r.blob_xoffset, r.length);
	}
      }
    } else {
      for (auto& req : r2r) {
	if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
			 req.regs.front().logical_offset) < 0) {
	  *csum_error = true;
	  return -EIO;
	}
	if (buffered) {
	  bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
					 req.r_off, req.bl);
	}

	// prune and keep result
	for (const auto& r : req.regs) {
	  ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
	}
      }
    }
    ++b2r_it;
  }

  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
  uint64_t pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
	       << ": data from 0x" << pr->first << "~" << pr->second.length()
	       << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
    } else {
      uint64_t l = length - pos;
      if (pr != pr_end) {
	ceph_assert(pr->first > pos + offset);
	l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
	       << ": zeros for 0x" << (pos + offset) << "~" << l
	       << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
  }
  ceph_assert(bl.length() == length);
  ceph_assert(pos == length);
  ceph_assert(pr == pr_end);
  return 0;
}
int BlueStore::_do_read(
  Collection *c,
  OnodeRef o,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << " size 0x" << o->onode.size << " (" << std::dec
	   << o->onode.size << ")" << dendl;
  bl.clear();

  if (offset >= o->onode.size) {
    return r;
  }

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
	     (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
			  CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  auto start = mono_clock::now();
  o->extent_map.fault_range(db, offset, length);
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  // for deep-scrub, we only read dirty cache and bypass clean cache in
  // order to read underlying block device in case there are silent disk errors.
  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
  }

  // build a blob-wise list of the stuff to read (that isn't cached)
  ready_regions_t ready_regions;
  blobs2read_t blobs2read;
  _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);

  // read raw blob data.
  start = mono_clock::now(); // for the sake of simplicity
			     // measure the whole block below.
			     // The error isn't that much...
  vector<bufferlist> compressed_blob_bls;
  IOContext ioc(cct, NULL, true); // allow EIO
  r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
  // we always issue aio for reading, so errors other than EIO are not allowed
  if (r < 0)
    return r;

  int64_t num_ios = length;
  if (ioc.has_pending_aios()) {
    num_ios = -ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  bool csum_error = false;
  r = _generate_read_result_bl(o, offset, length, ready_regions,
			       compressed_blob_bls, blobs2read,
			       buffered, &csum_error, bl);
  if (csum_error) {
    // Handles spurious read errors caused by a kernel bug.
    // We sometimes get all-zero pages as a result of the read under
    // high memory pressure. Retrying the failing read succeeds in most
    // cases.
    // See also: http://tracker.ceph.com/issues/22464
    if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
      return -EIO;
    }
    return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
  }
  r = bl.length();
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
	    << " failed " << std::dec << retry_count
	    << " times before succeeding" << dendl;
  }
  return r;
}
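// Editor's note on the retry path above: bluestore_retry_disk_reads (default
// 3 at the time of writing -- an assumption worth checking against your
// build's config) bounds the recursion; once retry_count reaches it, the
// checksum failure is surfaced to the caller as -EIO instead of re-reading.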
int BlueStore::_verify_csum(OnodeRef& o,
			    const bluestore_blob_t* blob, uint64_t blob_xoffset,
			    const bufferlist& bl,
			    uint64_t logical_offset) const
{
  int bad;
  uint64_t bad_csum;
  auto start = mono_clock::now();
  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
  if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
      (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
    derr << __func__ << " injecting bluestore checksum verification error" << dendl;
    bad = blob_xoffset;
    r = -1;
    bad_csum = 0xDEADBEEF;
  }
  if (r < 0) {
    if (r == -1) {
      PExtentVector pex;
      blob->map(
	bad,
	blob->get_csum_chunk_size(),
	[&](uint64_t offset, uint64_t length) {
	  pex.emplace_back(bluestore_pextent_t(offset, length));
	  return 0;
	});
      derr << __func__ << " bad "
	   << Checksummer::get_csum_type_string(blob->csum_type)
	   << "/0x" << std::hex << blob->get_csum_chunk_size()
	   << " checksum at blob offset 0x" << bad
	   << ", got 0x" << bad_csum << ", expected 0x"
	   << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
	   << ", device location " << pex
	   << ", logical extent 0x" << std::hex
	   << (logical_offset + bad - blob_xoffset) << "~"
	   << blob->get_csum_chunk_size() << std::dec
	   << ", object " << o->oid
	   << dendl;
    } else {
      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
    }
  }
  log_latency(__func__,
    l_bluestore_csum_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  if (cct->_conf->bluestore_ignore_data_csum) {
    return 0;
  }
  return r;
}
int BlueStore::_decompress(bufferlist& source, bufferlist* result)
{
  int r = 0;
  auto start = mono_clock::now();
  auto i = source.cbegin();
  bluestore_compression_header_t chdr;
  decode(chdr, i);
  int alg = int(chdr.type);
  CompressorRef cp = compressor;
  if (!cp || (int)cp->get_type() != alg) {
    cp = Compressor::create(cct, alg);
  }

  if (!cp.get()) {
    // if compressor isn't available - error, because cannot return
    // decompressed data?

    const char* alg_name = Compressor::get_comp_alg_name(alg);
    derr << __func__ << " can't load decompressor " << alg_name << dendl;
    _set_compression_alert(false, alg_name);
    r = -EIO;
  } else {
    r = cp->decompress(i, chdr.length, *result);
    if (r < 0) {
      derr << __func__ << " decompression failed with exit code " << r << dendl;
      r = -EIO;
    }
  }
  log_latency(__func__,
    l_bluestore_decompress_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
// this stores fiemap into interval_set, other variations
// use it internally
int BlueStore::_fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  interval_set<uint64_t>& destset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      return -ENOENT;
    }
    _dump_onode<30>(cct, *o);

    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
	     << " size 0x" << o->onode.size << std::dec << dendl;

    boost::intrusive::set<Extent>::iterator ep, eend;
    if (offset >= o->onode.size)
      goto out;

    if (offset + length > o->onode.size) {
      length = o->onode.size - offset;
    }

    o->extent_map.fault_range(db, offset, length);
    eend = o->extent_map.extent_map.end();
    ep = o->extent_map.seek_lextent(offset);
    while (length > 0) {
      dout(20) << __func__ << " offset " << offset << dendl;
      if (ep != eend && ep->logical_offset + ep->length <= offset) {
	++ep;
	continue;
      }

      uint64_t x_len = length;
      if (ep != eend && ep->logical_offset <= offset) {
	uint64_t x_off = offset - ep->logical_offset;
	x_len = std::min(x_len, ep->length - x_off);
	dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
		 << x_len << std::dec << " blob " << ep->blob << dendl;
	destset.insert(offset, x_len);
	length -= x_len;
	offset += x_len;
	if (x_off + x_len == ep->length)
	  ++ep;
	continue;
      }
      if (ep != eend &&
	  ep->logical_offset > offset &&
	  ep->logical_offset - offset < x_len) {
	x_len = ep->logical_offset - offset;
      }
      offset += x_len;
      length -= x_len;
    }
  }

 out:
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << " size = 0x(" << destset << ")" << std::dec << dendl;
  return 0;
}
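// Editor's illustrative result (hypothetical layout): for an object with
// data at 0x0~0x1000 and 0x3000~0x1000, _fiemap(0, 0x4000) fills destset
// with the intervals {0x0~0x1000, 0x3000~0x1000}; the hole between them is
// skipped rather than reported.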
int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    encode(m, bl);
  }
  return r;
}
int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  map<uint64_t, uint64_t>& destmap)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    destmap = std::move(m).detach();
  }
  return r;
}
int BlueStore::readv(
  CollectionHandle &c_,
  const ghobject_t& oid,
  interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
	   << " fiemap " << m
	   << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (m.empty()) {
      r = 0;
      goto out;
    }

    r = _do_readv(c, o, m, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
	     cct->_conf->bluestore_debug_random_read_err &&
	     (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
			     100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
	   << " fiemap " << m << std::dec
	   << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
int BlueStore::_do_readv(
  Collection *c,
  OnodeRef o,
  const interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " fiemap " << m << std::hex
	   << " size 0x" << o->onode.size << " (" << std::dec
	   << o->onode.size << ")" << dendl;

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
	     (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
			  CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }
  // this method must be idempotent since we may call it several times
  // before we finally read the expected result.
  bl.clear();

  // call fiemap first!
  ceph_assert(m.range_start() <= o->onode.size);
  ceph_assert(m.range_end() <= o->onode.size);
  auto start = mono_clock::now();
  o->extent_map.fault_range(db, m.range_start(),
			    m.range_end() - m.range_start());
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  IOContext ioc(cct, NULL, true); // allow EIO
  vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
  raw_results.reserve(m.num_intervals());
  int i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    raw_results.push_back({});
    _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
		std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
    r = _prepare_read_ioc(std::get<2>(raw_results[i]),
			  &std::get<1>(raw_results[i]), &ioc);
    // we always issue aio for reading, so errors other than EIO are not allowed
    if (r < 0)
      return r;
  }

  auto num_ios = m.size();
  if (ioc.has_pending_aios()) {
    num_ios = ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  ceph_assert(raw_results.size() == (size_t)m.num_intervals());
  i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    bool csum_error = false;
    bufferlist t;
    r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
				 std::get<0>(raw_results[i]),
				 std::get<1>(raw_results[i]),
				 std::get<2>(raw_results[i]),
				 buffered, &csum_error, t);
    if (csum_error) {
      // Handles spurious read errors caused by a kernel bug.
      // We sometimes get all-zero pages as a result of the read under
      // high memory pressure. Retrying the failing read succeeds in most
      // cases.
      // See also: http://tracker.ceph.com/issues/22464
      if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
	return -EIO;
      }
      return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
    }
    bl.claim_append(t);
  }
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read fiemap " << m
	    << " failed " << retry_count << " times before succeeding"
	    << dendl;
  }
  return bl.length();
}
int BlueStore::dump_onode(CollectionHandle &c_,
			  const ghobject_t& oid,
			  const string& section_name,
			  Formatter *f)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    // FIXME minor: actually the next line isn't enough to
    // load shared blobs. Leaving as is for now..
    //
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);

    _dump_onode<0>(cct, *o);
    f->open_object_section(section_name.c_str());
    o->dump(f);
    f->close_section();
    r = 0;
  }
 out:
  dout(10) << __func__ << " " << c->cid << " " << oid
	   << " = " << r << dendl;
  return r;
}
int BlueStore::getattr(
  CollectionHandle &c_,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);
    mempool::bluestore_cache_meta::string k(name);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (!o->onode.attrs.count(k)) {
      r = -ENODATA;
      goto out;
    }
    value = o->onode.attrs[k];
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
	   << " = " << r << dendl;
  return r;
}
int BlueStore::getattrs(
  CollectionHandle &c_,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    for (auto& i : o->onode.attrs) {
      aset.emplace(i.first.c_str(), i.second);
    }
    r = 0;
  }

 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid
	   << " = " << r << dendl;
  return r;
}
int BlueStore::list_collections(vector<coll_t>& ls)
{
  std::shared_lock l(coll_lock);
  ls.reserve(coll_map.size());
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}
bool BlueStore::collection_exists(const coll_t& c)
{
  std::shared_lock l(coll_lock);
  return coll_map.count(c);
}
int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
			  &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
	 << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
  return 0;
}
int BlueStore::collection_bits(CollectionHandle& ch)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  Collection *c = static_cast<Collection*>(ch.get());
  std::shared_lock l(c->lock);
  dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}
int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
	   << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, false, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
	   << " start " << start << " end " << end << " max " << max
	   << " = " << r << ", ls.size() = " << ls->size()
	   << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
int BlueStore::collection_list_legacy(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
	   << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, true, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
	   << " start " << start << " end " << end << " max " << max
	   << " = " << r << ", ls.size() = " << ls->size()
	   << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
{
  if (!c->exists)
    return -ENOENT;

  auto start_time = mono_clock::now();
  int r = 0;
  ghobject_t static_next;
  std::unique_ptr<CollectionListIterator> it;
  ghobject_t coll_range_temp_start, coll_range_temp_end;
  ghobject_t coll_range_start, coll_range_end;
  bool set_next = false;
  ghobject_t pend;
  bool temp;

  if (!pnext)
    pnext = &static_next;

  if (start.is_max() || start.hobj.is_max()) {
    goto out;
  }
  get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
		 &coll_range_temp_end, &coll_range_start, &coll_range_end);
  dout(20) << __func__
	   << " range " << coll_range_temp_start
	   << " to " << coll_range_temp_end
	   << " and " << coll_range_start
	   << " to " << coll_range_end
	   << " start " << start << dendl;
  if (legacy) {
    it = std::make_unique<SimpleCollectionListIterator>(
      cct, db->get_iterator(PREFIX_OBJ));
  } else {
    it = std::make_unique<SortedCollectionListIterator>(
      db->get_iterator(PREFIX_OBJ));
  }
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(coll_range_temp_start);
    temp = true;
  } else {
    if (start.hobj.is_temp()) {
      temp = true;
      ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
    } else {
      temp = false;
      ceph_assert(start >= coll_range_start && start < coll_range_end);
    }
    dout(20) << __func__ << " temp=" << (int)temp << dendl;
    it->lower_bound(start);
  }
  if (end.hobj.is_max()) {
    pend = temp ? coll_range_temp_end : coll_range_end;
  } else {
    if (end.hobj.is_temp()) {
      if (temp)
	pend = end;
      else
	goto out;
    } else {
      pend = temp ? coll_range_temp_end : end;
    }
  }
  dout(20) << __func__ << " pend " << pend << dendl;
  while (true) {
    if (!it->valid() || it->is_ge(pend)) {
      if (!it->valid())
	dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
	dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
      if (temp) {
	if (end.hobj.is_temp()) {
	  if (it->valid() && it->is_lt(coll_range_temp_end)) {
	    *pnext = it->oid();
	    set_next = true;
	  }
	  break;
	}
	dout(30) << __func__ << " switch to non-temp namespace" << dendl;
	temp = false;
	it->upper_bound(coll_range_start);
	if (end.hobj.is_max())
	  pend = coll_range_end;
	else
	  pend = end;
	dout(30) << __func__ << " pend " << pend << dendl;
	continue;
      }
      if (it->valid() && it->is_lt(coll_range_end)) {
	*pnext = it->oid();
	set_next = true;
      }
      break;
    }
    dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = it->oid();
      set_next = true;
      break;
    }
    ls->push_back(it->oid());
    it->next();
  }
 out:
  if (!set_next) {
    *pnext = ghobject_t::get_max();
  }
  log_latency_fn(
    __func__,
    l_bluestore_clist_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_collection_list_age,
    [&] (const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
	   << " cid =" << c->cid
	   << " start " << start << " end " << end
	   << " max " << max;
      return ostr.str();
    });
  return r;
}
int BlueStore::omap_get(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  return _omap_get(c, oid, header, out);
}
int BlueStore::_omap_get(
  Collection *c,               ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  r = _onode_omap_get(o, header, out);
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
	   << dendl;
  return r;
}
int BlueStore::_onode_omap_get(
  const OnodeRef &o,           ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
)
{
  int r = 0;
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_header(&head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
	dout(30) << __func__ << " got header" << dendl;
	*header = it->value();
      } else if (it->key() >= tail) {
	dout(30) << __func__ << " reached tail" << dendl;
	break;
      } else {
	string user_key;
	o->decode_omap_key(it->key(), &user_key);
	dout(20) << __func__ << " got " << pretty_binary_string(it->key())
		 << " -> " << user_key << dendl;
	(*out)[user_key] = it->value();
      }
      it->next();
    }
  }
 out:
  return r;
}
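// Editor's key-layout sketch (illustration only): all omap rows for an onode
// live under its omap prefix, bracketed by a per-object head and tail key.
// The row equal to head is the header; rows strictly between head and tail
// decode to user keys, which is why the scan above can stop as soon as
// it->key() >= tail.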
int BlueStore::omap_get_header(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  bufferlist *header,      ///< [out] omap header
  bool allow_eio           ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    string head;
    o->get_omap_header(&head);
    if (db->get(o->get_omap_prefix(), head, header) >= 0) {
      dout(30) << __func__ << " got header" << dendl;
    } else {
      dout(30) << __func__ << " no header" << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
	   << dendl;
  return r;
}
int BlueStore::omap_get_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  set<string> *keys        ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  auto start1 = mono_clock::now();
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
	dout(30) << __func__ << " reached tail" << dendl;
	break;
      }
      string user_key;
      o->decode_omap_key(it->key(), &user_key);
      dout(20) << __func__ << " got " << pretty_binary_string(it->key())
	       << " -> " << user_key << dendl;
      keys->insert(user_key);
      it->next();
    }
  }
 out:
  c->store->log_latency(
    __func__,
    l_bluestore_omap_get_keys_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
	   << dendl;
  return r;
}
int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const set<string> &keys,     ///< [in] Keys to get
  map<string, bufferlist> *out ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
	dout(30) << __func__ << " got " << pretty_binary_string(final_key)
		 << " -> " << *p << dendl;
	out->insert(make_pair(*p, val));
      }
    }
  }
 out:
  c->store->log_latency(
    __func__,
    l_bluestore_omap_get_values_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
	   << dendl;
  return r;
}
#ifdef WITH_SEASTAR
int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const std::optional<string> &start_after, ///< [in] Keys to get
  map<string, bufferlist> *output ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
    if (!iter) {
      r = -ENOENT;
      goto out;
    }
    iter->upper_bound(*start_after);
    for (; iter->valid(); iter->next()) {
      output->insert(make_pair(iter->key(), iter->value()));
    }
  }

 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
	   << dendl;
  return r;
}
#endif
int BlueStore::omap_check_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  const set<string> &keys, ///< [in] Keys to check
  set<string> *out         ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
	dout(30) << __func__ << " have " << pretty_binary_string(final_key)
		 << " -> " << *p << dendl;
	out->insert(*p);
      } else {
	dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
		 << " -> " << *p << dendl;
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
	   << dendl;
  return r;
}
ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,   ///< [in] collection
  const ghobject_t &oid   ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  std::shared_lock l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  o->flush();
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}
// -----------------
// write helpers

uint64_t BlueStore::_get_ondisk_reserved() const {
  return round_up_to(
    std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
}
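// e.g. with SUPER_RESERVED = 8192: a 4 KiB min_alloc_size gives
// max(8192, 4096) rounded up to a 4096 multiple = 8192, while a 64 KiB
// min_alloc_size gives 65536 -- the reserve never shrinks below the
// label + bluefs superblock area and always stays allocation-aligned.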
void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  ceph_assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}
int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      nid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(1) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      blobid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
      dout(1) << __func__ << " freelist_type " << freelist_type << dendl;
    } else {
      ceph_abort_msg("extent freelist manager is not supported");
    }
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.cbegin();
      try {
        decode(ondisk_format, p);
      } catch (buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        ceph_assert(!r);
        auto p = bl.cbegin();
        try {
          decode(compat_ondisk_format, p);
        } catch (buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(1) << __func__ << " ondisk_format " << ondisk_format
            << " compat_ondisk_format " << compat_ondisk_format
            << dendl;
  }

  if (latest_ondisk_format < compat_ondisk_format) {
    derr << __func__ << " compat_ondisk_format is "
         << compat_ondisk_format << " but we only understand version "
         << latest_ondisk_format << dendl;
    return -EPERM;
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t val;
      decode(val, p);
      min_alloc_size = val;
      min_alloc_size_order = ctz(val);
      ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
    dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
            << std::dec << dendl;
  }

  _set_per_pool_omap();

  _open_statfs();
  _set_alloc_sizes();
  _set_throttle_params();

  _set_compression();
  _set_blob_size();

  return 0;
}
int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  if (ondisk_format < latest_ondisk_format) {
    ceph_assert(ondisk_format > 0);
    ceph_assert(ondisk_format < latest_ondisk_format);

    KeyValueDB::Transaction t = db->get_transaction();
    if (ondisk_format == 1) {
      // changes:
      // - super: added ondisk_format
      // - super: added min_readable_ondisk_format
      // - super: added min_compat_ondisk_format
      // - super: added min_alloc_size
      // - super: removed min_min_alloc_size
      {
        bufferlist bl;
        db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
        auto p = bl.cbegin();
        try {
          uint64_t val;
          decode(val, p);
          min_alloc_size = val;
        } catch (buffer::error& e) {
          derr << __func__ << " failed to read min_min_alloc_size" << dendl;
          return -EIO;
        }
        t->set(PREFIX_SUPER, "min_alloc_size", bl);
        t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
      }
      ondisk_format = 2;
    }
    if (ondisk_format == 2) {
      // changes:
      // - onode has FLAG_PER_POOL_OMAP.  Note that we do not know that *all*
      //   onodes are using the per-pool prefix until a repair is run; at that
      //   point the per_pool_omap=1 key will be set.
      // - super: added per_pool_omap key, which indicates that *all* objects
      //   are using the new prefix and key format
      ondisk_format = 3;
    }
    if (ondisk_format == 3) {
      // changes:
      // - FreelistManager keeps meta within bdev label
      int r = _write_out_fm_meta(0);
      ceph_assert(r == 0);
      ondisk_format = 4;
    }
    // This must be the last operation
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    ceph_assert(r == 0);
  }
  dout(1) << __func__ << " done" << dendl;
  return 0;
}
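// the ladder above deliberately advances one version at a time, so an
// osd that is several formats behind applies each step's key migrations
// in order; the new ondisk_format is persisted in the same transaction
// as the last migration step, so a crash mid-upgrade simply re-runs the
// ladder from the old version on the next mount.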
void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid) {
    ceph_assert(o->exists);
    return;
  }
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
  o->exists = true;
}

uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}
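// nids and blobids are handed out from preallocated ranges: the atomics
// above may run ahead of the persisted nid_max/blobid_max, and the kv
// sync thread tops the max up by bluestore_{nid,blobid}_prealloc once
// the last-used id crosses the halfway point of the remaining range.
// e.g. with nid_max = 2048 and a prealloc of 1024, the first txc past
// nid_last = 1536 persists a new nid_max of nid_last + 1024; a crash
// merely wastes the unused tail of the old range.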
void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}
BlueStore::TransContext *BlueStore::_txc_create(
  Collection *c, OpSequencer *osr,
  list<Context*> *on_commits)
{
  TransContext *txc = new TransContext(cct, c, osr, on_commits);
  txc->t = db->get_transaction();
  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}
void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // one "io" for the kv commit
  auto ios = 1 + txc->ioc.get_num_ios();
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  txc->ios = ios;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}
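// illustration: a txc carrying two data aios and 4096 bytes of payload,
// with throttle_cost_per_io at (say) 670000, costs
//   (1 + 2) * 670000 + 4096 = 2014096
// throttle units -- the +1 accounts for the kv commit itself.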
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated,
              txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original,
              txc->statfs_delta.compressed_original());

  bufferlist bl;
  txc->statfs_delta.encode(bl);
  if (per_pool_stat_collection) {
    string key;
    get_pool_stat_key(txc->osd_pool_id, &key);
    txc->t->merge(PREFIX_STAT, key, bl);

    std::lock_guard l(vstatfs_lock);
    auto& stats = osd_pools[txc->osd_pool_id];
    stats += txc->statfs_delta;

    vstatfs += txc->statfs_delta; //non-persistent in this mode
  } else {
    txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);

    std::lock_guard l(vstatfs_lock);
    vstatfs += txc->statfs_delta;
  }
  txc->statfs_delta.reset();
}
void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
             << " " << txc->get_state_name() << dendl;
    switch (txc->state) {
    case TransContext::STATE_PREPARE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
        txc->state = TransContext::STATE_AIO_WAIT;
        txc->had_ios = true;
        _txc_aio_submit(txc);
        return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      {
        mono_clock::duration lat = throttle.log_state_latency(
          *txc, logger, l_bluestore_state_aio_wait_lat);
        if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
          dout(0) << __func__ << " slow aio_wait, txc = " << txc
                  << ", latency = " << lat
                  << dendl;
        }
      }
      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      ceph_assert(ceph_mutex_is_locked(txc->osr->qlock));  // see _txc_finish_io
      if (txc->had_ios) {
        ++txc->osr->txc_with_unstable_io;
      }
      throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
      txc->state = TransContext::STATE_KV_QUEUED;
      if (cct->_conf->bluestore_sync_submit_transaction) {
        if (txc->last_nid >= nid_max ||
            txc->last_blobid >= blobid_max) {
          dout(20) << __func__
                   << " last_{nid,blobid} exceeds max, submit via kv thread"
                   << dendl;
        } else if (txc->osr->kv_committing_serially) {
          dout(20) << __func__ << " prior txc submitted via kv thread, us too"
                   << dendl;
          // note: this is starvation-prone.  once we have a txc in a busy
          // sequencer that is committing serially it is possible to keep
          // submitting new transactions fast enough that we get stuck doing
          // so.  the alternative is to block here... fixme?
        } else if (txc->osr->txc_with_unstable_io) {
          dout(20) << __func__ << " prior txc(s) with unstable ios "
                   << txc->osr->txc_with_unstable_io.load() << dendl;
        } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
                   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
                   == 0) {
          dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
                   << dendl;
        } else {
          _txc_apply_kv(txc, true);
        }
      }
      {
        std::lock_guard l(kv_lock);
        kv_queue.push_back(txc);
        if (!kv_sync_in_progress) {
          kv_sync_in_progress = true;
          kv_cond.notify_one();
        }
        if (txc->state != TransContext::STATE_KV_SUBMITTED) {
          kv_queue_unsubmitted.push_back(txc);
          ++txc->osr->kv_committing_serially;
        }
        if (txc->had_ios) {
          kv_ios++;
        }
        kv_throttle_costs += txc->cost;
      }
      return;

    case TransContext::STATE_KV_SUBMITTED:
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
        txc->state = TransContext::STATE_DEFERRED_QUEUED;
        _deferred_queue(txc);
        return;
      }
      txc->state = TransContext::STATE_FINISHING;
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
      txc->state = TransContext::STATE_FINISHING;
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
           << " state " << txc->get_state_name() << dendl;
      ceph_abort_msg("unexpected txc state");
      return;
    }
  }
}
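// the txc state machine driven above, happy path first:
//
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED
//     -> KV_DONE -> FINISHING -> DONE
//
// txcs that carry a deferred_txn take a detour after KV_DONE:
//
//   KV_DONE -> DEFERRED_QUEUED -> (deferred aio lands) ->
//     DEFERRED_CLEANUP -> FINISHING -> DONE
//
// a txc with no pending aios falls through the AIO_WAIT case
// immediately (there is nothing to wait for), and DEFERRED_CLEANUP is
// only reached once a later kv commit has made the deferred io stable.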
void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard l(osr->qlock);
  txc->state = TransContext::STATE_IO_DONE;
  txc->ioc.release_running_aios();
  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->state < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
               << p->get_state_name() << dendl;
      return;
    }
    if (p->state > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
           p->state == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters) {
    osr->qcond.notify_all();
  }
}
void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
           << " onodes " << txc->onodes
           << " shared_blobs " << txc->shared_blobs
           << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    _record_onode(o, t);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      encode(*(sb->persistent), bl);
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is " << bl.length() << " " << *sb << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}
void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
  os_apply_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
}
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
           << " allocated 0x" << txc->allocated
           << " released 0x" << txc->released
           << std::dec << dendl;

  // We have to handle the case where we allocate *and* deallocate the
  // same region in this transaction.  The freelist doesn't like that.
  // (Actually, the only thing that cares is the BitmapFreelistManager
  // debug check. But that's important.)
  interval_set<uint64_t> tmp_allocated, tmp_released;
  interval_set<uint64_t> *pallocated = &txc->allocated;
  interval_set<uint64_t> *preleased = &txc->released;
  if (!txc->allocated.empty() && !txc->released.empty()) {
    interval_set<uint64_t> overlap;
    overlap.intersection_of(txc->allocated, txc->released);
    if (!overlap.empty()) {
      tmp_allocated = txc->allocated;
      tmp_allocated.subtract(overlap);
      tmp_released = txc->released;
      tmp_released.subtract(overlap);
      dout(20) << __func__ << " overlap 0x" << std::hex << overlap
               << ", new allocated 0x" << tmp_allocated
               << " released 0x" << tmp_released << std::dec
               << dendl;
      pallocated = &tmp_allocated;
      preleased = &tmp_released;
    }
  }

  // update freelist with non-overlap sets
  for (interval_set<uint64_t>::iterator p = pallocated->begin();
       p != pallocated->end();
       ++p) {
    fm->allocate(p.get_start(), p.get_len(), t);
  }
  for (interval_set<uint64_t>::iterator p = preleased->begin();
       p != preleased->end();
       ++p) {
    dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
             << "~" << p.get_len() << std::dec << dendl;
    fm->release(p.get_start(), p.get_len(), t);
  }

  _txc_update_store_statfs(txc);
}
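// worked example of the overlap dance: if this txc allocated
// 0x10000~0x8000 and released 0x14000~0x8000, the common 0x14000~0x4000
// is subtracted from both sides, and the freelist only sees
// allocate(0x10000, 0x4000) and release(0x18000, 0x4000).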
void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
{
  ceph_assert(txc->state == TransContext::STATE_KV_QUEUED);
  {
#if defined(WITH_LTTNG)
    auto start = mono_clock::now();
#endif

    int r = cct->_conf->bluestore_debug_omit_kv_commit
      ? 0
      : db->submit_transaction(txc->t);
    ceph_assert(r == 0);
    txc->state = TransContext::STATE_KV_SUBMITTED;
    if (txc->osr->kv_submitted_waiters) {
      std::lock_guard l(txc->osr->qlock);
      txc->osr->qcond.notify_all();
    }

#if defined(WITH_LTTNG)
    if (txc->tracing) {
      tracepoint(
        bluestore,
        transaction_kv_submit_latency,
        txc->osr->get_sequencer_id(),
        txc->seq,
        sync_submit_transaction,
        ceph::to_seconds<double>(mono_clock::now() - start));
    }
#endif
  }

  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
               << dendl;
      if (--o->flushing_count == 0 && o->waiting_count.load()) {
        std::lock_guard l(o->flush_lock);
        o->flush_cond.notify_all();
      }
    }
  }
}
void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;
  throttle.complete_kv(*txc);
  {
    std::lock_guard l(txc->osr->qlock);
    txc->state = TransContext::STATE_KV_DONE;
    if (txc->ch->commit_queue) {
      txc->ch->commit_queue->queue(txc->oncommits);
    } else {
      finisher.queue(txc->oncommits);
    }
  }
  throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
  log_latency_fn(
    __func__,
    l_bluestore_commit_lat,
    mono_clock::now() - txc->start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) {
      return ", txc = " + stringify(txc);
    }
  );
}
void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  ceph_assert(txc->state == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->finish_write(txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  bool empty = false;
  bool submit_deferred = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard l(osr->qlock);
    txc->state = TransContext::STATE_DONE;
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
               << dendl;
      if (txc->state != TransContext::STATE_DONE) {
        if (txc->state == TransContext::STATE_PREPARE &&
            deferred_aggressive) {
          // for _osr_drain_preceding()
          notify = true;
        }
        if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
            osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
          submit_deferred = true;
        }
        break;
      }

      osr->q.pop_front();
      releasing_txc.push_back(*txc);
    }

    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }

    // only drain()/drain_preceding() need wakeup,
    // other cases use kv_submitted_waiters
    if (notify || empty) {
      osr->qcond.notify_all();
    }
  }

  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
    throttle.complete(*txc);
    delete txc;
  }

  if (submit_deferred) {
    // we're pinning memory; flush!  we could be more fine-grained here but
    // i'm not sure it's worth the bother.
    deferred_try_submit();
  }

  if (empty && osr->zombie) {
    std::lock_guard l(zombie_osr_lock);
    if (zombie_osr_set.erase(osr->cid)) {
      dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    } else {
      dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
               << dendl;
    }
  }
}
void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // it's expected we're called with lazy_release_lock already taken!
  if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
    int r = 0;
    if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
      r = bdev->queue_discard(txc->released);
      if (r == 0) {
        dout(10) << __func__ << "(queued) " << txc << " " << std::hex
                 << txc->released << std::dec << dendl;
        goto out;
      }
    } else if (cct->_conf->bdev_enable_discard) {
      for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
        bdev->discard(p.get_start(), p.get_len());
      }
    }
    dout(10) << __func__ << "(sync) " << txc << " " << std::hex
             << txc->released << std::dec << dendl;
    alloc->release(txc->released);
  }

out:
  txc->allocated.clear();
  txc->released.clear();
}
void BlueStore::_osr_attach(Collection *c)
{
  // note: caller has RWLock on coll_map
  auto q = coll_map.find(c->cid);
  if (q != coll_map.end()) {
    c->osr = q->second->osr;
    ldout(cct, 10) << __func__ << " " << c->cid
                   << " reusing osr " << c->osr << " from existing coll "
                   << q->second << dendl;
  } else {
    std::lock_guard l(zombie_osr_lock);
    auto p = zombie_osr_set.find(c->cid);
    if (p == zombie_osr_set.end()) {
      c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " fresh osr " << c->osr << dendl;
    } else {
      c->osr = p->second;
      zombie_osr_set.erase(p);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " resurrecting zombie osr " << c->osr << dendl;
      c->osr->zombie = false;
    }
  }
}
void BlueStore::_osr_register_zombie(OpSequencer *osr)
{
  std::lock_guard l(zombie_osr_lock);
  dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
  osr->zombie = true;
  auto i = zombie_osr_set.emplace(osr->cid, osr);
  // this is either a new insertion or the same osr is already there
  ceph_assert(i.second || i.first->second == osr);
}
void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}
void BlueStore::_osr_drain(OpSequencer *osr)
{
  dout(10) << __func__ << " " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain();
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}
void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  vector<OpSequencerRef> zombies;
  {
    std::shared_lock l(coll_lock);
    for (auto& i : coll_map) {
      s.insert(i.second->osr);
    }
  }
  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& i : zombie_osr_set) {
      s.insert(i.second);
      zombies.push_back(i.second);
    }
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& osr : zombies) {
      if (zombie_osr_set.erase(osr->cid)) {
        dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
        ceph_assert(osr->q.empty());
      } else if (osr->zombie) {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " already reaped" << dendl;
        ceph_assert(osr->q.empty());
      } else {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " resurrected" << dendl;
      }
    }
  }

  dout(10) << __func__ << " done" << dendl;
}
void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  finisher.start();
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}
void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{kv_lock};
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock l{kv_finalize_lock};
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  ceph_assert(removed_collections.empty());
  {
    std::lock_guard l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  finisher.wait_for_empty();
  finisher.stop();
  dout(10) << __func__ << " stopped" << dendl;
}
void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
  std::unique_lock l{kv_lock};
  ceph_assert(!kv_sync_started);
  kv_sync_started = true;
  kv_cond.notify_all();

  auto t0 = mono_clock::now();
  timespan twait = ceph::make_timespan(0);
  size_t kv_submitted = 0;

  while (true) {
    auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
    auto observation_period =
      ceph::make_timespan(period);
    auto elapsed = mono_clock::now() - t0;
    if (period && elapsed >= observation_period) {
      dout(5) << __func__ << " utilization: idle "
              << twait << " of " << elapsed
              << ", submitted: " << kv_submitted
              << dendl;
      t0 = mono_clock::now();
      twait = ceph::make_timespan(0);
      kv_submitted = 0;
    }
    ceph_assert(kv_committing.empty());
    if (kv_queue.empty() &&
        ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
         !deferred_aggressive)) {
      if (kv_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      auto t = mono_clock::now();
      kv_sync_in_progress = false;
      kv_cond.wait(l);
      twait += mono_clock::now() - t;

      dout(20) << __func__ << " wake" << dendl;
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      uint64_t aios = 0, costs = 0;

      dout(20) << __func__ << " committing " << kv_queue.size()
               << " submitting " << kv_queue_unsubmitted.size()
               << " deferred done " << deferred_done_queue.size()
               << " stable " << deferred_stable_queue.size()
               << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      aios = kv_ios;
      costs = kv_throttle_costs;
      kv_ios = 0;
      kv_throttle_costs = 0;
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable.  that means that if we have completed
      // deferred txcs AND we are not on a single shared device, we need
      // to force a flush.
      if (bluefs && bluefs_layout.single_shared_device()) {
        if (aios) {
          force_flush = true;
        } else if (kv_committing.empty() && deferred_stable.empty()) {
          force_flush = true;  // there's nothing else to commit!
        } else if (deferred_aggressive) {
          force_flush = true;
        }
      } else {
        if (aios || !deferred_done.empty()) {
          force_flush = true;
        } else {
          dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
        }
      }

      if (force_flush) {
        dout(20) << __func__ << " num_aios=" << aios
                 << " force_flush=" << (int)force_flush
                 << ", flushing, deferred done->stable" << dendl;
        // flush/barrier on block device
        bdev->flush();

        // if we flush then deferred done are now deferred stable
        deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
                               deferred_done.end());
        deferred_done.clear();
      }
      auto after_flush = mono_clock::now();

      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max?  note that this covers both the
      // case where we are approaching the max and the case we passed
      // it.  in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
        bufferlist bl;
        encode(new_nid_max, bl);
        t->set(PREFIX_SUPER, "nid_max", bl);
        dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
        bufferlist bl;
        encode(new_blobid_max, bl);
        t->set(PREFIX_SUPER, "blobid_max", bl);
        dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }

      for (auto txc : kv_committing) {
        throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
        if (txc->state == TransContext::STATE_KV_QUEUED) {
          ++kv_submitted;
          _txc_apply_kv(txc, false);
          --txc->osr->kv_committing_serially;
        } else {
          ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
        }
        if (txc->had_ios) {
          --txc->osr->txc_with_unstable_io;
        }
      }

      // release throttle *before* we commit.  this allows new ops
      // to be prepared and enter pipeline while we are waiting on
      // the kv commit sync/flush.  then hopefully on the next
      // iteration there will already be ops awake.  otherwise, we
      // end up going to sleep, and then wake up when the very first
      // transaction is ready for commit.
      throttle.release_kv_throttle(costs);

      if (bluefs &&
          after_flush - bluefs_last_balance >
          ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
        bluefs_last_balance = after_flush;
        int r = _balance_bluefs_freespace();
        ceph_assert(r >= 0);
      }

      // cleanup sync deferred keys
      for (auto b : deferred_stable) {
        for (auto& txc : b->txcs) {
          bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
          ceph_assert(wt.released.empty()); // only kraken did this
          string key;
          get_deferred_key(wt.seq, &key);
          synct->rm_single_key(PREFIX_DEFERRED, key);
        }
      }

#if defined(WITH_LTTNG)
      auto sync_start = mono_clock::now();
#endif
      // submit synct synchronously (block and wait for it to commit)
      int r = cct->_conf->bluestore_debug_omit_kv_commit
        ? 0
        : db->submit_transaction_sync(synct);
      ceph_assert(r == 0);

      int committing_size = kv_committing.size();
      int deferred_size = deferred_stable.size();

#if defined(WITH_LTTNG)
      double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
      for (auto txc : kv_committing) {
        if (txc->tracing) {
          tracepoint(
            bluestore,
            transaction_kv_sync_latency,
            txc->osr->get_sequencer_id(),
            txc->seq,
            kv_committing.size(),
            deferred_done.size(),
            deferred_stable.size(),
            sync_latency);
        }
      }
#endif

      {
        std::unique_lock m{kv_finalize_lock};
        if (kv_committing_to_finalize.empty()) {
          kv_committing_to_finalize.swap(kv_committing);
        } else {
          kv_committing_to_finalize.insert(
            kv_committing_to_finalize.end(),
            kv_committing.begin(),
            kv_committing.end());
          kv_committing.clear();
        }
        if (deferred_stable_to_finalize.empty()) {
          deferred_stable_to_finalize.swap(deferred_stable);
        } else {
          deferred_stable_to_finalize.insert(
            deferred_stable_to_finalize.end(),
            deferred_stable.begin(),
            deferred_stable.end());
          deferred_stable.clear();
        }
        if (!kv_finalize_in_progress) {
          kv_finalize_in_progress = true;
          kv_finalize_cond.notify_one();
        }
      }

      if (new_nid_max) {
        nid_max = new_nid_max;
        dout(10) << __func__ << " nid_max now " << nid_max << dendl;
      }
      if (new_blobid_max) {
        blobid_max = new_blobid_max;
        dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
      }

      {
        auto finish = mono_clock::now();
        ceph::timespan dur_flush = after_flush - start;
        ceph::timespan dur_kv = finish - after_flush;
        ceph::timespan dur = finish - start;
        dout(20) << __func__ << " committed " << committing_size
                 << " cleaned " << deferred_size
                 << " in " << dur
                 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
                 << dendl;
        log_latency("kv_flush",
          l_bluestore_kv_flush_lat,
          dur_flush,
          cct->_conf->bluestore_log_op_age);
        log_latency("kv_commit",
          l_bluestore_kv_commit_lat,
          dur_kv,
          cct->_conf->bluestore_log_op_age);
        log_latency("kv_sync",
          l_bluestore_kv_sync_lat,
          dur,
          cct->_conf->bluestore_log_op_age);
      }

      if (bluefs) {
        if (!bluefs_extents_reclaiming.empty()) {
          dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
                  << bluefs_extents_reclaiming << std::dec << dendl;
          int r = 0;
          if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
            r = bdev->queue_discard(bluefs_extents_reclaiming);
            if (r == 0) {
              goto clear;
            }
          } else if (cct->_conf->bdev_enable_discard) {
            for (auto p = bluefs_extents_reclaiming.begin();
                 p != bluefs_extents_reclaiming.end();
                 ++p) {
              bdev->discard(p.get_start(), p.get_len());
            }
          }

          alloc->release(bluefs_extents_reclaiming);
        clear:
          bluefs_extents_reclaiming.clear();
        }
      }

      l.lock();
      // previously deferred "done" are now "stable" by virtue of this
      // commit cycle.
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_sync_started = false;
}
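// one sync cycle, in short: swap in everything queued, flush the device
// if needed (that flush is what promotes deferred "done" batches to
// "stable"), apply any still-unsubmitted txcs, then submit one final
// synchronous kv transaction that also tops up {nid,blobid}_max and
// drops the keys of stable deferred writes.  the flush is skipped only
// when nothing depends on it -- no new aios and no freshly completed
// deferred batches -- or when a single shared device lets the bluefs
// commit do the flushing for us.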
void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
        deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_in_progress = false;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
        TransContext *txc = kv_committed.front();
        ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle.should_submit_deferred()) {
          deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
                  (uint64_t)(alloc->get_fragmentation() * 1000));

      log_latency("kv_final",
        l_bluestore_kv_final_lat,
        mono_clock::now() - start,
        cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}
bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
  deferred_lock.lock();
  if (!txc->osr->deferred_pending &&
      !txc->osr->deferred_running) {
    deferred_queue.push_back(*txc->osr);
  }
  if (!txc->osr->deferred_pending) {
    txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
  }
  ++deferred_queue_size;
  txc->osr->deferred_pending->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      txc->osr->deferred_pending->prepare_write(
        cct, wt.seq, e.offset, e.length, p);
    }
  }
  if (deferred_aggressive &&
      !txc->osr->deferred_running) {
    _deferred_submit_unlock(txc->osr.get());
  } else {
    deferred_lock.unlock();
  }
}
void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  std::lock_guard l(deferred_lock);
  vector<OpSequencerRef> osrs;
  osrs.reserve(deferred_queue.size());
  for (auto& osr : deferred_queue) {
    osrs.push_back(&osr);
  }
  for (auto& osr : osrs) {
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
        _deferred_submit_unlock(osr.get());
        deferred_lock.lock();
      } else {
        dout(20) << __func__ << " osr " << osr << " already has running"
                 << dendl;
      }
    } else {
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }

  deferred_last_submitted = ceph_clock_now();
}
void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf()->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_deferred_write_ops);
          logger->inc(l_bluestore_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          ceph_assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}
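// the iomap walk above coalesces adjacent blocks into single aios: e.g.
// pending deferred ios at 0x0~0x1000, 0x1000~0x1000 and 0x3000~0x1000
// go to the device as two writes, 0x0~0x2000 and 0x3000~0x1000, because
// the second entry starts exactly at pos while the third does not.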
struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    deferred_lock.lock();
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      auto q = deferred_queue.iterator_to(*osr);
      deferred_queue.erase(q);
      deferred_lock.unlock();
    } else {
      deferred_lock.unlock();
      if (deferred_aggressive) {
        dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
        finisher.queue(new C_DeferredTrySubmit(this));
      } else {
        dout(20) << __func__ << " leaving queued, more pending" << dendl;
      }
    }
  }

  {
    uint64_t costs = 0;
    {
      std::lock_guard l(osr->qlock);
      for (auto& i : b->txcs) {
        TransContext *txc = &i;
        throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
        txc->state = TransContext::STATE_DEFERRED_CLEANUP;
        costs += txc->cost;
      }
    }
    throttle.release_deferred_throttle(costs);
  }

  {
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);

    // in the normal case, do not bother waking up the kv thread; it will
    // catch us on the next commit anyway.
    if (deferred_aggressive && !kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
}
int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(ch.get(), osr, nullptr);
    txc->deferred_txn = deferred_txn;
    txc->state = TransContext::STATE_KV_DONE;
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}
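// _deferred_replay runs at mount time: any key still present under
// PREFIX_DEFERRED describes writes whose kv commit succeeded but whose
// final block placement may not have happened before shutdown.
// re-issuing them is safe because deferred ops are aligned whole-chunk
// overwrites, so replay is effectively idempotent.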
// ---------------------------
// transactions

int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE(cct);
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection *>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection *>(ch.get()), osr,
                                  &on_commit);

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);
  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();

  if (!throttle.try_start_transaction(
        *db,
        *txc,
        tstart)) {
    // ensure we do not block here because of deferred writes
    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
             << dendl;
    ++deferred_aggressive;
    deferred_try_submit();
    {
      // wake up any previously finished deferred events
      std::lock_guard l(kv_lock);
      if (!kv_sync_in_progress) {
        kv_sync_in_progress = true;
        kv_cond.notify_one();
      }
    }
    throttle.finish_start_transaction(*db, *txc, tstart);
    --deferred_aggressive;
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

  log_latency("submit_transact",
    l_bluestore_submit_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
    l_bluestore_throttle_lat,
    tend - tstart,
    cct->_conf->bluestore_log_op_age);
  return 0;
}
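// submit path recap: build the TransContext from the Transaction ops,
// persist onode/blob/shared-blob metadata into txc->t, journal any
// deferred payload under PREFIX_DEFERRED in the same kv transaction,
// take the throttle (turning deferred-aggressive rather than blocking),
// and only then kick the state machine.  on_applied callbacks fire
// right away because, unlike FileStore, there is no separate apply
// stage for reads to wait on.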
void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}
void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;


    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
                  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
        const coll_t &cid = i.get_cid(op->cid);
        r = _remove_collection(txc, cid, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
        ceph_assert(!c);
        const coll_t &cid = i.get_cid(op->cid);
        r = _create_collection(txc, cid, op->split_bits, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
        uint32_t bits = op->split_bits;
        uint32_t rem = op->split_rem;
        r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
        uint32_t bits = op->split_bits;
        r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
        uint32_t type = op->hint_type;
        bufferlist hint;
        i.decode_bl(hint);
        auto hiter = hint.cbegin();
        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
          uint32_t pg_num;
          uint64_t num_objs;
          decode(pg_num, hiter);
          decode(num_objs, hiter);
          dout(10) << __func__ << " collection hint objects is a no-op, "
                   << " pg_num " << pg_num << " num_objects " << num_objs
                   << dendl;
        } else {
          // Ignore the hint
          dout(10) << __func__ << " unknown collection hint " << type << dendl;
        }
        continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
           << " not handled on operation " << op->op
           << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
        op->op == Transaction::OP_CREATE ||
        op->op == Transaction::OP_WRITE ||
        op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    std::unique_lock l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
               << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_CREATE:
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        uint32_t fadvise_flags = i.get_fadvise_flags();
        bufferlist bl;
        i.decode_bl(bl);
        r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
        // deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
        uint64_t off = op->off;
        r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
        r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
        string name = i.decode_string();
        bufferptr bp;
        i.decode_bp(bp);
        r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
        map<string, bufferptr> aset;
        i.decode_attrset(aset);
        r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
        string name = i.decode_string();
        r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
        r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        uint64_t srcoff = op->off;
        uint64_t len = op->len;
        uint64_t dstoff = op->dest_off;
        r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
        ceph_assert(op->cid == op->dest_cid);
        const ghobject_t& noid = i.get_oid(op->dest_oid);
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          no = c->get_onode(noid, false);
        }
        r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
        r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
        bufferlist aset_bl;
        i.decode_attrset_bl(&aset_bl);
        r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
        bufferlist keys_bl;
        i.decode_keyset_bl(&keys_bl);
        r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
        string first, last;
        first = i.decode_string();
        last = i.decode_string();
        r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
        bufferlist bl;
        i.decode_bl(bl);
        r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
        r = _set_alloc_hint(txc, c, o,
                            op->expected_object_size,
                            op->expected_write_size,
                            op->alloc_hint_flags);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
                            op->op == Transaction::OP_CLONE ||
                            op->op == Transaction::OP_CLONERANGE2 ||
                            op->op == Transaction::OP_COLL_ADD ||
                            op->op == Transaction::OP_SETATTR ||
                            op->op == Transaction::OP_SETATTRS ||
                            op->op == Transaction::OP_RMATTR ||
                            op->op == Transaction::OP_OMAP_SETKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
                            op->op == Transaction::OP_OMAP_SETHEADER))
        // -ENOENT is usually okay
        ok = true;
      if (r == -ENODATA)
        ok = true;

      if (!ok) {
        const char *msg = "unexpected error code";

        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
                             op->op == Transaction::OP_CLONE ||
                             op->op == Transaction::OP_CLONERANGE2))
          msg = "ENOENT on clone suggests osd bug";

        if (r == -ENOSPC)
          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
          // by partially applying transactions.
          msg = "ENOSPC from bluestore, misconfigured cluster";

        if (r == -ENOTEMPTY) {
          msg = "ENOTEMPTY suggests garbage data in osd data dir";
        }

        derr << __func__ << " error " << cpp_strerror(r)
             << " not handled on operation " << op->op
             << " (op " << pos << ", counting from 0)"
             << dendl;
        derr << msg << dendl;
        _dump_transaction<0>(cct, t);
        ceph_abort_msg("unexpected error");
      }
    }
  }
}
// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
           << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->begin().copy(front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
           << back_pad << " on front/back, now 0x" << *offset << "~"
           << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}
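// example: a 0x100-byte write at offset 0x1234 with chunk_size 0x1000
// gets front_pad = 0x234 and back_pad = 0xccc, growing the buffer to
// exactly the aligned span 0x1000~0x1000 (the offset rewinds to 0x1000
// and pad_count = 0xf00 is charged to l_bluestore_write_pad_bytes).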
void BlueStore::_do_write_small(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  ceph_assert(length < min_alloc_size);
  uint64_t end_offs = offset + length;

  logger->inc(l_bluestore_write_small);
  logger->inc(l_bluestore_write_small_bytes, length);

  bufferlist bl;
  blp.copy(length, bl);

  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
  uint32_t alloc_len = min_alloc_size;
  auto offset0 = p2align<uint64_t>(offset, alloc_len);

  bool any_change;

  // search for a suitable extent in both forward and reverse direction in
  // the [offset - target_max_blob_size, offset + target_max_blob_size] range,
  // then check if the blob can be reused via can_reuse_blob(), or apply a
  // direct/deferred write (the latter for extents including or higher
  // than 'offset' only).
  o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);

  // Look for an existing mutable blob we can use.
  auto begin = o->extent_map.extent_map.begin();
  auto end = o->extent_map.extent_map.end();
  auto ep = o->extent_map.seek_lextent(offset);
  if (ep != begin) {
    --ep;
    if (ep->blob_end() <= offset) {
      ++ep;
    }
  }
  auto prev_ep = ep;
  if (prev_ep != begin) {
    --prev_ep;
  } else {
    prev_ep = end; // to avoid this extent check as it's a duplicate
  }

  boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
  // We don't want to have more blobs than min alloc units fit
  // into 2 max blobs
  size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
  bool above_blob_threshold = false;

  inspected_blobs.reserve(blob_threshold);

  uint64_t max_off = 0;
  auto start_ep = ep;
  auto end_ep = ep; // exclusively
  do {
    any_change = false;

    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      max_off = ep->logical_end();
      auto bstart = ep->blob_start();

      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
        dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
        dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
                 ep->blob_offset % min_alloc_size) {
        dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
        uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
        // can we pad our head/tail out with zeros?
        uint64_t head_pad, tail_pad;
        head_pad = p2phase(offset, chunk_size);
        tail_pad = p2nphase(end_offs, chunk_size);
        if (head_pad || tail_pad) {
          o->extent_map.fault_range(db, offset - head_pad,
                                    end_offs - offset + head_pad + tail_pad);
        }
        if (head_pad &&
            o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
          head_pad = 0;
        }
        if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
          tail_pad = 0;
        }

        uint64_t b_off = offset - head_pad - bstart;
        uint64_t b_len = length + head_pad + tail_pad;

        // direct write into unused blocks of an existing mutable blob?
        if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
            b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b->get_blob().is_unused(b_off, b_len) &&
            b->get_blob().is_allocated(b_off, b_len)) {
          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << "  write to unused 0x" << std::hex
                   << b_off << "~" << b_len
                   << " pad 0x" << head_pad << " + 0x" << tail_pad
                   << std::dec << " of mutable " << *b << dendl;
          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            if (b_len <= prefer_deferred_size) {
              dout(20) << __func__ << " deferring small 0x" << std::hex
                       << b_len << std::dec << " unused write via deferred"
                       << dendl;
              bluestore_deferred_op_t *op = _get_deferred_op(txc);
              op->op = bluestore_deferred_op_t::OP_WRITE;
              b->get_blob().map(
                b_off, b_len,
                [&](uint64_t offset, uint64_t length) {
                  op->extents.emplace_back(bluestore_pextent_t(offset, length));
                  return 0;
                });
              op->data = bl;
            } else {
              b->get_blob().map_bl(
                b_off, bl,
                [&](uint64_t offset, bufferlist& t) {
                  bdev->aio_write(offset, t,
                                  &txc->ioc, wctx->buffered);
                });
            }
          }
          b->dirty_blob().calc_csum(b_off, bl);
          dout(20) << __func__ << "  lex old " << *ep << dendl;
          Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad,
                                                 length, b,
                                                 &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << "  lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
        // read some data to fill out the chunk?
        uint64_t head_read = p2phase(b_off, chunk_size);
        uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
        if ((head_read || tail_read) &&
            (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
            head_read + tail_read < min_alloc_size) {
          b_off -= head_read;
          b_len += head_read + tail_read;
        } else {
          head_read = tail_read = 0;
        }

        // chunk-aligned deferred overwrite?
        if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b_off % chunk_size == 0 &&
            b_len % chunk_size == 0 &&
            b->get_blob().is_allocated(b_off, b_len)) {

          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << "  reading head 0x" << std::hex << head_read
                   << " and tail 0x" << tail_read << std::dec << dendl;
          if (head_read) {
            bufferlist head_bl;
            int r = _do_read(c.get(), o, offset - head_pad - head_read,
                             head_read, head_bl, 0);
            ceph_assert(r >= 0 && r <= (int)head_read);
            size_t zlen = head_read - r;
            if (zlen) {
              head_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            head_bl.claim_append(bl);
            bl.swap(head_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          if (tail_read) {
            bufferlist tail_bl;
            int r = _do_read(c.get(), o, offset + length + tail_pad,
                             tail_read, tail_bl, 0);
            ceph_assert(r >= 0 && r <= (int)tail_read);
            size_t zlen = tail_read - r;
            if (zlen) {
              tail_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            bl.claim_append(tail_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          logger->inc(l_bluestore_write_small_pre_read);

          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (b->get_blob().csum_type) {
            b->dirty_blob().calc_csum(b_off, bl);
          }

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            bluestore_deferred_op_t *op = _get_deferred_op(txc);
            op->op = bluestore_deferred_op_t::OP_WRITE;
            int r = b->get_blob().map(
              b_off, b_len,
              [&](uint64_t offset, uint64_t length) {
                op->extents.emplace_back(bluestore_pextent_t(offset, length));
                return 0;
              });
            ceph_assert(r == 0);
            op->data.claim(bl);
            dout(20) << __func__ << "  deferred write 0x" << std::hex << b_off
                     << "~" << b_len << std::dec << " of mutable " << *b
                     << " at " << op->extents << dendl;
          }

          Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart,
                                                 length, b,
                                                 &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << "  lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_deferred);
          return;
        }
        // try to reuse blob if we can
        if (b->can_reuse_blob(min_alloc_size,
                              max_bsize,
                              offset0 - bstart,
                              &alloc_len)) {
          ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                    // fit into reused blob
          // Need to check for pending writes desiring to
          // reuse the same pextent. The rationale is that during GC two
          // chunks from garbage blobs (compressed?) can share logical space
          // within the same AU. That, in turn, might be caused by an
          // unaligned length in clone_range2. Hence the second write would
          // fail in an attempt to reuse the blob at do_alloc_write().
          if (!wctx->has_conflict(b,
                                  offset0,
                                  offset0 + alloc_len,
                                  min_alloc_size)) {

            // we can't reuse pad_head/pad_tail since they might be truncated
            // due to existent extents
            uint64_t b_off = offset - bstart;
            uint64_t b_off0 = b_off;
            _pad_zeros(&bl, &b_off0, chunk_size);

            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off0 << "~" << bl.length() << ")"
                     << " (0x" << b_off << "~" << length << ")"
                     << std::dec << dendl;

            o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
            wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                        false, false);
            logger->inc(l_bluestore_write_small_unused);
            return;
          }
        }
      }
      ++ep;
      end_ep = ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)

    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      start_ep = prev_ep;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
                            max_bsize,
                            offset0 - bstart,
                            &alloc_len)) {
        ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                  // fit into reused blob
        // Need to check for pending writes desiring to
        // reuse the same pextent. The rationale is that during GC two
        // chunks from garbage blobs (compressed?) can share logical space
        // within the same AU. That, in turn, might be caused by an
        // unaligned length in clone_range2. Hence the second write would
        // fail in an attempt to reuse the blob at do_alloc_write().
        if (!wctx->has_conflict(b,
                                offset0,
                                offset0 + alloc_len,
                                min_alloc_size)) {

          uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
          uint64_t b_off = offset - bstart;
          uint64_t b_off0 = b_off;
          _pad_zeros(&bl, &b_off0, chunk_size);

          dout(20) << __func__ << " reuse blob " << *b << std::hex
                   << " (0x" << b_off0 << "~" << bl.length() << ")"
                   << " (0x" << b_off << "~" << length << ")"
                   << std::dec << dendl;

          o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
          wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                      false, false);
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
      }
      if (prev_ep != begin) {
        --prev_ep;
        any_change = true;
      } else {
        prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);

  if (above_blob_threshold) {
    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
             << " " << std::hex << min_off << "~" << max_off << std::dec
             << dendl;
    ceph_assert(start_ep != end_ep);
    for (auto ep = start_ep; ep != end_ep; ++ep) {
      dout(20) << __func__ << " inserting for GC "
               << std::hex << ep->logical_offset << "~" << ep->length
               << std::dec << dendl;

      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
    }
    // insert newly written extent to GC
    wctx->extents_to_gc.union_insert(offset, length);
    dout(20) << __func__ << " inserting (last) for GC "
             << std::hex << offset << "~" << length
             << std::dec << dendl;
  }

  // new blob.
  BlobRef b = c->new_blob();
  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
              min_alloc_size != block_size, // use 'unused' bitmap only when
                                            // alloc granularity doesn't match
                                            // the disk one
              true);
}

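// Note (summary added for readability; see the branches above): a small
// write is satisfied by the first of these strategies that applies --
//   1) direct or deferred write into allocated-but-unused space of an
//      existing mutable blob (l_bluestore_write_small_unused),
//   2) chunk-aligned deferred read-modify-write, padding the head/tail
//      from disk as needed (l_bluestore_write_small_pre_read),
//   3) reuse of a nearby mutable blob found via can_reuse_blob(),
//   4) otherwise a brand-new blob carved at min_alloc_size granularity.
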
void BlueStore::_do_write_big(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " target_blob_size 0x" << wctx->target_blob_size << std::dec
           << " compress " << (int)wctx->compress
           << dendl;
  logger->inc(l_bluestore_write_big);
  logger->inc(l_bluestore_write_big_bytes, length);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  while (length > 0) {
    bool new_blob = false;
    uint32_t l = std::min(max_bsize, length);
    BlobRef b;
    uint32_t b_off = 0;

    //attempting to reuse existing blob
    if (!wctx->compress) {
      // look for an existing mutable blob we can reuse
      auto begin = o->extent_map.extent_map.begin();
      auto end = o->extent_map.extent_map.end();
      auto ep = o->extent_map.seek_lextent(offset);
      auto prev_ep = ep;
      if (prev_ep != begin) {
        --prev_ep;
      } else {
        prev_ep = end; // to avoid this extent check as it's a duplicate
      }
      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
      // search for a suitable extent in both forward and reverse direction in
      // the [offset - target_max_blob_size, offset + target_max_blob_size]
      // range, then check if the blob can be reused via can_reuse_blob().
      bool any_change;
      do {
        any_change = false;
        if (ep != end && ep->logical_offset < offset + max_bsize) {
          if (offset >= ep->blob_start() &&
              ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                       offset - ep->blob_start(),
                                       &l)) {
            b = ep->blob;
            b_off = offset - ep->blob_start();
            prev_ep = end; // to avoid check below
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec
                     << dendl;
          } else {
            ++ep;
            any_change = true;
          }
        }

        if (prev_ep != end && prev_ep->logical_offset >= min_off) {
          if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                            offset - prev_ep->blob_start(),
                                            &l)) {
            b = prev_ep->blob;
            b_off = offset - prev_ep->blob_start();
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec
                     << dendl;
          } else if (prev_ep != begin) {
            --prev_ep;
            any_change = true;
          } else {
            prev_ep = end; // to avoid useless first extent re-check
          }
        }
      } while (b == nullptr && any_change);
    }
    if (b == nullptr) {
      b = c->new_blob();
      b_off = 0;
      new_blob = true;
    }

    bufferlist t;
    blp.copy(l, t);
    wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
    offset += l;
    length -= l;
    logger->inc(l_bluestore_write_big_blobs);
  }
}

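// Worked example (added for clarity; assumes no existing blob is reused, in
// which case can_reuse_blob() may shrink 'l'): with max_bsize = 0x10000 and
// an aligned big write of 0x28000 bytes, the loop above issues three
// wctx->write() calls of 0x10000, 0x10000 and 0x8000, advancing offset and
// shrinking length each pass; each lap bumps l_bluestore_write_big_blobs.
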
int BlueStore::_do_alloc_write(
  TransContext *txc,
  CollectionRef coll,
  OnodeRef o,
  WriteContext *wctx)
{
  dout(20) << __func__ << " txc " << txc
           << " " << wctx->writes.size() << " blobs"
           << dendl;
  if (wctx->writes.empty()) {
    return 0;
  }

  CompressorRef c;
  double crr = 0;
  if (wctx->compress) {
    c = select_option(
      "compression_algorithm",
      compressor,
      [&]() {
        string val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
          CompressorRef cp = compressor;
          if (!cp || cp->get_type_name() != val) {
            cp = Compressor::create(cct, val);
            if (!cp) {
              if (_set_compression_alert(false, val.c_str())) {
                derr << __func__ << " unable to initialize " << val.c_str()
                     << " compressor" << dendl;
              }
            }
          }
          return boost::optional<CompressorRef>(cp);
        }
        return boost::optional<CompressorRef>();
      });

    crr = select_option(
      "compression_required_ratio",
      cct->_conf->bluestore_compression_required_ratio,
      [&]() {
        double val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO,
                                &val)) {
          return boost::optional<double>(val);
        }
        return boost::optional<double>();
      });
  }

  // checksum
  int64_t csum = csum_type.load();
  csum = select_option(
    "csum_type",
    csum,
    [&]() {
      int64_t val;
      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
        return boost::optional<int64_t>(val);
      }
      return boost::optional<int64_t>();
    });

  // compress (as needed) and calc needed space
  uint64_t need = 0;
  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  for (auto& wi : wctx->writes) {
    if (c && wi.blob_length > min_alloc_size) {
      auto start = mono_clock::now();

      // compress
      ceph_assert(wi.b_off == 0);
      ceph_assert(wi.blob_length == wi.bl.length());

      // FIXME: memory alignment here is bad
      bufferlist t;
      int r = c->compress(wi.bl, t);
      uint64_t want_len_raw = wi.blob_length * crr;
      uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
      bool rejected = false;
      uint64_t compressed_len = t.length();
      // do an approximate (fast) estimation of the resulting blob size
      // that doesn't take header overhead into account
      uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
      if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
        bluestore_compression_header_t chdr;
        chdr.type = c->get_type();
        chdr.length = t.length();
        encode(chdr, wi.compressed_bl);
        wi.compressed_bl.claim_append(t);

        compressed_len = wi.compressed_bl.length();
        result_len = p2roundup(compressed_len, min_alloc_size);
        if (result_len <= want_len && result_len < wi.blob_length) {
          // Cool. We compressed at least as much as we were hoping to.
          // pad out to min_alloc_size
          wi.compressed_bl.append_zero(result_len - compressed_len);
          wi.compressed_len = compressed_len;
          wi.compressed = true;
          logger->inc(l_bluestore_write_pad_bytes,
                      result_len - compressed_len);
          dout(20) << __func__ << std::hex << "  compressed 0x"
                   << wi.blob_length
                   << " -> 0x" << compressed_len << " => 0x" << result_len
                   << " with " << c->get_type()
                   << std::dec << dendl;
          txc->statfs_delta.compressed() += compressed_len;
          txc->statfs_delta.compressed_original() += wi.blob_length;
          txc->statfs_delta.compressed_allocated() += result_len;
          logger->inc(l_bluestore_compress_success_count);
          need += result_len;
        } else {
          rejected = true;
        }
      } else if (r != 0) {
        dout(5) << __func__ << std::hex << "  0x" << wi.blob_length
                << " bytes compressed using " << c->get_type_name()
                << std::dec
                << " failed with errcode = " << r
                << ", leaving uncompressed"
                << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
        need += wi.blob_length;
      } else {
        rejected = true;
      }

      if (rejected) {
        dout(20) << __func__ << std::hex << "  0x" << wi.blob_length
                 << " compressed to 0x" << compressed_len << " -> 0x"
                 << result_len
                 << " with " << c->get_type()
                 << ", which is more than required 0x" << want_len_raw
                 << " -> 0x" << want_len
                 << ", leaving uncompressed"
                 << std::dec << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
        need += wi.blob_length;
      }
      log_latency("compress@_do_alloc_write",
                  l_bluestore_compress_lat,
                  mono_clock::now() - start,
                  cct->_conf->bluestore_log_op_age);
    } else {
      need += wi.blob_length;
    }
  }
  PExtentVector prealloc;
  prealloc.reserve(2 * wctx->writes.size());
  int64_t prealloc_left = 0;
  prealloc_left = alloc->allocate(
    need, min_alloc_size, need,
    0, &prealloc);
  if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
    derr << __func__ << " failed to allocate 0x" << std::hex << need
         << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
         << " min_alloc_size 0x" << min_alloc_size
         << " available 0x " << alloc->get_free()
         << std::dec << dendl;
    if (prealloc.size()) {
      alloc->release(prealloc);
    }
    return -ENOSPC;
  }
  _collect_allocation_stats(need, min_alloc_size, prealloc.size());

  dout(20) << __func__ << " prealloc " << prealloc << dendl;
  auto prealloc_pos = prealloc.begin();

  for (auto& wi : wctx->writes) {
    BlobRef b = wi.b;
    bluestore_blob_t& dblob = b->dirty_blob();
    uint64_t b_off = wi.b_off;
    bufferlist *l = &wi.bl;
    uint64_t final_length = wi.blob_length;
    uint64_t csum_length = wi.blob_length;
    if (wi.compressed) {
      final_length = wi.compressed_bl.length();
      csum_length = final_length;
      unsigned csum_order = ctz(csum_length);
      l = &wi.compressed_bl;
      dblob.set_compressed(wi.blob_length, wi.compressed_len);
      if (csum != Checksummer::CSUM_NONE) {
        dout(20) << __func__
                 << " initialize csum setting for compressed blob " << *b
                 << " csum_type " << Checksummer::get_csum_type_string(csum)
                 << " csum_order " << csum_order
                 << " csum_length 0x" << std::hex << csum_length
                 << " blob_length 0x" << wi.blob_length
                 << " compressed_length 0x" << wi.compressed_len << std::dec
                 << dendl;
        dblob.init_csum(csum, csum_order, csum_length);
      }
    } else if (wi.new_blob) {
      unsigned csum_order;
      // initialize newly created blob only
      ceph_assert(dblob.is_mutable());
      if (l->length() != wi.blob_length) {
        // hrm, maybe we could do better here, but let's not bother.
        dout(20) << __func__ << " forcing csum_order to block_size_order "
                 << block_size_order << dendl;
        csum_order = block_size_order;
      } else {
        csum_order = std::min(wctx->csum_order, ctz(l->length()));
      }
      // try to align blob with max_blob_size to improve
      // its reuse ratio, e.g. in case of reverse write
      uint32_t suggested_boff =
        (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
      if ((suggested_boff % (1 << csum_order)) == 0 &&
          suggested_boff + final_length <= max_bsize &&
          suggested_boff > b_off) {
        dout(20) << __func__ << " forcing blob_offset to 0x"
                 << std::hex << suggested_boff << std::dec << dendl;
        ceph_assert(suggested_boff >= b_off);
        csum_length += suggested_boff - b_off;
        b_off = suggested_boff;
      }
      if (csum != Checksummer::CSUM_NONE) {
        dout(20) << __func__
                 << " initialize csum setting for new blob " << *b
                 << " csum_type " << Checksummer::get_csum_type_string(csum)
                 << " csum_order " << csum_order
                 << " csum_length 0x" << std::hex << csum_length << std::dec
                 << dendl;
        dblob.init_csum(csum, csum_order, csum_length);
      }
    }

    PExtentVector extents;
    int64_t left = final_length;
    while (left > 0) {
      ceph_assert(prealloc_left > 0);
      if (prealloc_pos->length <= left) {
        prealloc_left -= prealloc_pos->length;
        left -= prealloc_pos->length;
        txc->statfs_delta.allocated() += prealloc_pos->length;
        extents.push_back(*prealloc_pos);
        ++prealloc_pos;
      } else {
        extents.emplace_back(prealloc_pos->offset, left);
        prealloc_pos->offset += left;
        prealloc_pos->length -= left;
        prealloc_left -= left;
        txc->statfs_delta.allocated() += left;
        left = 0;
        break;
      }
    }
    for (auto& p : extents) {
      txc->allocated.insert(p.offset, p.length);
    }
    dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);

    dout(20) << __func__ << " blob " << *b << dendl;
    if (dblob.has_csum()) {
      dblob.calc_csum(b_off, *l);
    }

    if (wi.mark_unused) {
      ceph_assert(!dblob.is_compressed());
      auto b_end = b_off + wi.bl.length();
      if (b_off) {
        dblob.add_unused(0, b_off);
      }
      uint64_t llen = dblob.get_logical_length();
      if (b_end < llen) {
        dblob.add_unused(b_end, llen - b_end);
      }
    }

    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
                                           b_off + (wi.b_off0 - wi.b_off),
                                           wi.length0,
                                           wi.b,
                                           nullptr);
    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
    txc->statfs_delta.stored() += le->length;
    dout(20) << __func__ << "  lex " << *le << dendl;
    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
                        wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

    // queue io
    if (!g_conf()->bluestore_debug_omit_block_device_write) {
      if (l->length() <= prefer_deferred_size.load()) {
        dout(20) << __func__ << " deferring small 0x" << std::hex
                 << l->length() << std::dec << " write via deferred" << dendl;
        bluestore_deferred_op_t *op = _get_deferred_op(txc);
        op->op = bluestore_deferred_op_t::OP_WRITE;
        int r = b->get_blob().map(
          b_off, l->length(),
          [&](uint64_t offset, uint64_t length) {
            op->extents.emplace_back(bluestore_pextent_t(offset, length));
            return 0;
          });
        ceph_assert(r == 0);
        op->data = *l;
        logger->inc(l_bluestore_write_small_deferred);
      } else {
        b->get_blob().map_bl(
          b_off, *l,
          [&](uint64_t offset, bufferlist& t) {
            bdev->aio_write(offset, t, &txc->ioc, false);
          });
        logger->inc(l_bluestore_write_small_new);
      }
    }
  }
  ceph_assert(prealloc_pos == prealloc.end());
  ceph_assert(prealloc_left == 0);
  return 0;
}

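// Worked example of the acceptance test above (added for clarity): with
// blob_length = 0x10000, min_alloc_size = 0x1000 and
// compression_required_ratio crr = 0.875, want_len =
// p2roundup(0xe000, 0x1000) = 0xe000. A compressed payload of 0x9c00 bytes
// (plus header) gives result_len = p2roundup(0x9c00, 0x1000) = 0xa000,
// which is <= want_len and < blob_length, so the blob is stored compressed
// and padded with 0x400 zeros; 'need' grows by 0xa000 instead of 0x10000.
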
void BlueStore::_wctx_finish(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  WriteContext *wctx,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  auto oep = wctx->old_extents.begin();
  while (oep != wctx->old_extents.end()) {
    auto &lo = *oep;
    oep = wctx->old_extents.erase(oep);
    dout(20) << __func__ << " lex_old " << lo.e << dendl;
    BlobRef b = lo.e.blob;
    const bluestore_blob_t& blob = b->get_blob();
    if (blob.is_compressed()) {
      if (lo.blob_empty) {
        txc->statfs_delta.compressed() -=
          blob.get_compressed_payload_length();
      }
      txc->statfs_delta.compressed_original() -= lo.e.length;
    }
    auto& r = lo.r;
    txc->statfs_delta.stored() -= lo.e.length;
    if (!r.empty()) {
      dout(20) << __func__ << "  blob release " << r << dendl;
      if (blob.is_shared()) {
        PExtentVector final;
        c->load_shared_blob(b->shared_blob);
        bool unshare = false;
        bool* unshare_ptr =
          !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
        for (auto e : r) {
          b->shared_blob->put_ref(
            e.offset, e.length, &final,
            unshare_ptr);
        }
        if (unshare) {
          ceph_assert(maybe_unshared_blobs);
          maybe_unshared_blobs->insert(b->shared_blob.get());
        }
        dout(20) << __func__ << "  shared_blob release " << final
                 << " from " << *b->shared_blob << dendl;
        txc->write_shared_blob(b->shared_blob);
        r.clear();
        r.swap(final);
      }
    }
    // we can't invalidate our logical extents as we drop them because
    // other lextents (either in our onode or others) may still
    // reference them. but we can throw out anything that is no
    // longer allocated. Note that this will leave behind edge bits
    // that are no longer referenced but not deallocated (until they
    // age out of the cache naturally).
    b->discard_unallocated(c.get());
    for (auto e : r) {
      dout(20) << __func__ << "  release " << e << dendl;
      txc->released.insert(e.offset, e.length);
      txc->statfs_delta.allocated() -= e.length;
      if (blob.is_compressed()) {
        txc->statfs_delta.compressed_allocated() -= e.length;
      }
    }

    if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
      dout(20) << __func__ << "  spanning_blob_map removing empty " << *b
               << dendl;
      o->extent_map.spanning_blob_map.erase(b->id);
    }
  }
}

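// Accounting note (added): stored() is decremented once per dropped logical
// extent, while allocated() is decremented per physical extent actually
// released (plus compressed_allocated() when the owning blob is compressed),
// mirroring the increments made in _do_alloc_write().
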
void BlueStore::_do_write_data(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  WriteContext *wctx)
{
  uint64_t end = offset + length;
  bufferlist::iterator p = bl.begin();

  if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
      (length != min_alloc_size)) {
    // we fall within the same block
    _do_write_small(txc, c, o, offset, length, p, wctx);
  } else {
    uint64_t head_offset, head_length;
    uint64_t middle_offset, middle_length;
    uint64_t tail_offset, tail_length;

    head_offset = offset;
    head_length = p2nphase(offset, min_alloc_size);

    tail_offset = p2align(end, min_alloc_size);
    tail_length = p2phase(end, min_alloc_size);

    middle_offset = head_offset + head_length;
    middle_length = length - head_length - tail_length;

    if (head_length) {
      _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
    }

    if (middle_length) {
      _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
    }

    if (tail_length) {
      _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
    }
  }
}

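// Worked example (added for clarity): with min_alloc_size = 0x10000, a write
// of 0x8000~0x20000 (end = 0x28000) is split into a small head, a big
// middle and a small tail:
//   head_length   = p2nphase(0x8000, 0x10000)  = 0x8000
//   tail_offset   = p2align(0x28000, 0x10000)  = 0x20000
//   tail_length   = p2phase(0x28000, 0x10000)  = 0x8000
//   middle_length = 0x20000 - 0x8000 - 0x8000  = 0x10000
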
void BlueStore::_choose_write_options(
  CollectionRef& c,
  OnodeRef o,
  uint32_t fadvise_flags,
  WriteContext *wctx)
{
  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered write" << dendl;
    wctx->buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_write &&
             (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                               CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered write" << dendl;
    wctx->buffered = true;
  }

  // apply basic csum block size
  wctx->csum_order = block_size_order;

  // compression parameters
  unsigned alloc_hints = o->onode.alloc_hint_flags;
  auto cm = select_option(
    "compression_mode",
    comp_mode.load(),
    [&]() {
      string val;
      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
        return boost::optional<Compressor::CompressionMode>(
          Compressor::get_comp_mode_type(val));
      }
      return boost::optional<Compressor::CompressionMode>();
    });

  wctx->compress = (cm != Compressor::COMP_NONE) &&
    ((cm == Compressor::COMP_FORCE) ||
     (cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));

  if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
      (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
                      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {

    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;

    if (o->onode.expected_write_size) {
      wctx->csum_order = std::max(min_alloc_size_order,
                                  (uint8_t)ctz(o->onode.expected_write_size));
    } else {
      wctx->csum_order = min_alloc_size_order;
    }

    if (wctx->compress) {
      wctx->target_blob_size = select_option(
        "compression_max_blob_size",
        comp_max_blob_size.load(),
        [&]() {
          int64_t val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE,
                               &val)) {
            return boost::optional<uint64_t>((uint64_t)val);
          }
          return boost::optional<uint64_t>();
        });
    }
  } else {
    if (wctx->compress) {
      wctx->target_blob_size = select_option(
        "compression_min_blob_size",
        comp_min_blob_size.load(),
        [&]() {
          int64_t val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE,
                               &val)) {
            return boost::optional<uint64_t>((uint64_t)val);
          }
          return boost::optional<uint64_t>();
        });
    }
  }

  uint64_t max_bsize = max_blob_size.load();
  if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
    wctx->target_blob_size = max_bsize;
  }

  // set the min blob size floor at 2x the min_alloc_size, or else we
  // won't be able to allocate a smaller extent for the compressed
  // data.
  if (wctx->compress &&
      wctx->target_blob_size < min_alloc_size * 2) {
    wctx->target_blob_size = min_alloc_size * 2;
  }

  dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
           << " target_blob_size 0x" << std::hex << wctx->target_blob_size
           << " compress=" << (int)wctx->compress
           << " buffered=" << (int)wctx->buffered
           << std::dec << dendl;
}

int BlueStore::_do_gc(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  const WriteContext& wctx,
  uint64_t *dirty_start,
  uint64_t *dirty_end)
{
  bool dirty_range_updated = false;
  WriteContext wctx_gc;
  wctx_gc.fork(wctx); // make a clone for garbage collection

  auto & extents_to_collect = wctx.extents_to_gc;
  for (auto it = extents_to_collect.begin();
       it != extents_to_collect.end();
       ++it) {
    bufferlist bl;
    auto offset = (*it).first;
    auto length = (*it).second;
    dout(20) << __func__ << " processing " << std::hex
             << offset << "~" << length << std::dec
             << dendl;
    int r = _do_read(c.get(), o, offset, length, bl, 0);
    ceph_assert(r == (int)length);

    _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
    logger->inc(l_bluestore_gc_merged, length);

    if (*dirty_start > offset) {
      *dirty_start = offset;
      dirty_range_updated = true;
    }

    if (*dirty_end < offset + length) {
      *dirty_end = offset + length;
      dirty_range_updated = true;
    }
  }
  if (dirty_range_updated) {
    o->extent_map.fault_range(db, *dirty_start, *dirty_end);
  }

  dout(30) << __func__ << " alloc write" << dendl;
  int r = _do_alloc_write(txc, c, o, &wctx_gc);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    return r;
  }

  _wctx_finish(txc, c, o, &wctx_gc);
  return 0;
}

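// Note (added): GC proceeds by re-reading each range in extents_to_gc and
// re-submitting it through the ordinary write path under wctx_gc, so the
// merged data lands in fresh blobs and the old ones become reclaimable in
// _wctx_finish(); dirty_start/dirty_end are widened so the caller re-dirties
// the whole affected extent-map range.
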
int BlueStore::_do_write(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  uint32_t fadvise_flags)
{
  int r = 0;

  dout(20) << __func__
           << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length
           << " - have 0x" << o->onode.size
           << " (" << std::dec << o->onode.size << ")"
           << " bytes"
           << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
           << dendl;
  _dump_onode<30>(cct, *o);

  if (length == 0) {
    return 0;
  }

  uint64_t end = offset + length;

  GarbageCollector gc(c->store->cct);
  int64_t benefit = 0;
  auto dirty_start = offset;
  auto dirty_end = end;

  WriteContext wctx;
  _choose_write_options(c, o, fadvise_flags, &wctx);
  o->extent_map.fault_range(db, offset, length);
  _do_write_data(txc, c, o, offset, length, bl, &wctx);
  r = _do_alloc_write(txc, c, o, &wctx);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    goto out;
  }

  if (wctx.extents_to_gc.empty() ||
      wctx.extents_to_gc.range_start() > offset ||
      wctx.extents_to_gc.range_end() < offset + length) {
    benefit = gc.estimate(offset,
                          length,
                          o->extent_map,
                          wctx.old_extents,
                          min_alloc_size);
  }

  // NB: _wctx_finish() will empty old_extents
  // so we must do gc estimation before that
  _wctx_finish(txc, c, o, &wctx);
  if (end > o->onode.size) {
    dout(20) << __func__ << " extending size to 0x" << std::hex << end
             << std::dec << dendl;
    o->onode.size = end;
  }

  if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
    wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
    dout(20) << __func__
             << " perform garbage collection for compressed extents, "
             << "expected benefit = " << benefit << " AUs" << dendl;
  }
  if (!wctx.extents_to_gc.empty()) {
    dout(20) << __func__ << " perform garbage collection" << dendl;

    r = _do_gc(txc, c, o,
               wctx,
               &dirty_start, &dirty_end);
    if (r < 0) {
      derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
           << dendl;
      goto out;
    }
    dout(20) << __func__ << " gc range is " << std::hex << dirty_start
             << "~" << dirty_end - dirty_start << std::dec << dendl;
  }
  o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
  o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);

  r = 0;

 out:
  return r;
}

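// Note (added): compressed-extent GC is only attempted when gc.estimate()
// reports a benefit of at least bluestore_gc_enable_total_threshold AUs,
// while blob-count-driven GC requests queued by _do_write_small() are
// honored unconditionally via wctx.extents_to_gc.
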
int BlueStore::_write(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& o,
                      uint64_t offset, size_t length,
                      bufferlist& bl,
                      uint32_t fadvise_flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}

int BlueStore::_zero(TransContext *txc,
                     CollectionRef& c,
                     OnodeRef& o,
                     uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_zero(txc, c, o, offset, length);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}

int BlueStore::_do_zero(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;

  _dump_onode<30>(cct, *o);

  WriteContext wctx;
  o->extent_map.fault_range(db, offset, length);
  o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
  o->extent_map.dirty_range(offset, length);
  _wctx_finish(txc, c, o, &wctx);

  if (length > 0 && offset + length > o->onode.size) {
    o->onode.size = offset + length;
    dout(20) << __func__ << " extending size to " << offset + length
             << dendl;
  }
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}

void BlueStore::_do_truncate(
  TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec << dendl;

  _dump_onode<30>(cct, *o);

  if (offset == o->onode.size)
    return;

  if (offset < o->onode.size) {
    WriteContext wctx;
    uint64_t length = o->onode.size - offset;
    o->extent_map.fault_range(db, offset, length);
    o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
    o->extent_map.dirty_range(offset, length);
    _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);

    // if we have shards past EOF, ask for a reshard
    if (!o->onode.extent_map_shards.empty() &&
        o->onode.extent_map_shards.back().offset >= offset) {
      dout(10) << __func__ << "  request reshard past EOF" << dendl;
      if (offset) {
        o->extent_map.request_reshard(offset - 1, offset + length);
      } else {
        o->extent_map.request_reshard(0, length);
      }
    }
  }

  o->onode.size = offset;

  txc->write_onode(o);
}

int BlueStore::_truncate(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << dendl;
  int r = 0;
  if (offset >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _do_truncate(txc, c, o, offset);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << " = " << r << dendl;
  return r;
}

int BlueStore::_do_remove(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o)
{
  set<SharedBlob*> maybe_unshared_blobs;
  bool is_gen = !o->oid.is_no_gen();
  _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
  }
  o->exists = false;
  string key;
  for (auto &s : o->extent_map.shards) {
    dout(20) << __func__ << "  removing shard 0x" << std::hex
             << s.shard_info->offset << std::dec << dendl;
    generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
      [&](const string& final_key) {
        txc->t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }
  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
  txc->note_removed_object(o);
  o->extent_map.clear();
  o->onode = bluestore_onode_t();
  _debug_obj_on_delete(o->oid);

  if (!is_gen || maybe_unshared_blobs.empty()) {
    return 0;
  }

  // see if we can unshare blobs still referenced by the head
  dout(10) << __func__ << " gen and maybe_unshared_blobs "
           << maybe_unshared_blobs << dendl;
  ghobject_t nogen = o->oid;
  nogen.generation = ghobject_t::NO_GEN;
  OnodeRef h = c->onode_map.lookup(nogen);

  if (!h || !h->exists) {
    return 0;
  }

  dout(20) << __func__ << " checking for unshareable blobs on " << h
           << " " << h->oid << dendl;
  map<SharedBlob*,bluestore_extent_ref_map_t> expect;
  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        sb->loaded &&
        maybe_unshared_blobs.count(sb)) {
      if (b.is_compressed()) {
        expect[sb].get(0, b.get_ondisk_length());
      } else {
        b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
            expect[sb].get(off, len);
            return 0;
          });
      }
    }
  }

  vector<SharedBlob*> unshared_blobs;
  unshared_blobs.reserve(maybe_unshared_blobs.size());
  for (auto& p : expect) {
    dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
    if (p.first->persistent->ref_map == p.second) {
      SharedBlob *sb = p.first;
      dout(20) << __func__ << "  unsharing " << *sb << dendl;
      unshared_blobs.push_back(sb);
      txc->unshare_blob(sb);
      uint64_t sbid = c->make_blob_unshared(sb);
      string key;
      get_shared_blob_key(sbid, &key);
      txc->t->rmkey(PREFIX_SHARED_BLOB, key);
    }
  }

  if (unshared_blobs.empty()) {
    return 0;
  }

  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        std::find(unshared_blobs.begin(), unshared_blobs.end(),
                  sb) != unshared_blobs.end()) {
      dout(20) << __func__ << "  unsharing " << e << dendl;
      bluestore_blob_t& blob = e.blob->dirty_blob();
      blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
      h->extent_map.dirty_range(e.logical_offset, 1);
    }
  }
  txc->write_onode(h);

  return 0;
}

int BlueStore::_remove(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " onode " << o.get()
           << " txc "<< txc << dendl;

  auto start_time = mono_clock::now();
  int r = _do_remove(txc, c, o);
  log_latency_fn(
    __func__,
    l_bluestore_remove_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_op_age,
    [&](const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid =" << c->cid
           << " oid =" << o->oid;
      return ostr.str();
    }
  );

  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::_setattr(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        const string& name,
                        bufferptr& val)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << dendl;
  int r = 0;

  if (val.is_partial()) {
    auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
                                                       val.length());
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  } else {
    auto& b = o->onode.attrs[name.c_str()] = val;
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << " = " << r << dendl;
  return r;
}

int BlueStore::_setattrs(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         const map<string,bufferptr>& aset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << dendl;
  int r = 0;

  for (map<string,bufferptr>::const_iterator p = aset.begin();
       p != aset.end(); ++p) {
    if (p->second.is_partial()) {
      auto& b = o->onode.attrs[p->first.c_str()] =
        bufferptr(p->second.c_str(), p->second.length());
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
    } else {
      auto& b = o->onode.attrs[p->first.c_str()] = p->second;
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
    }
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << " = " << r << dendl;
  return r;
}

int BlueStore::_rmattr(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o,
                       const string& name)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << dendl;
  int r = 0;
  auto it = o->onode.attrs.find(name.c_str());
  if (it == o->onode.attrs.end())
    goto out;

  o->onode.attrs.erase(it);
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " = " << r << dendl;
  return r;
}

int BlueStore::_rmattrs(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;

  if (o->onode.attrs.empty())
    goto out;

  o->onode.attrs.clear();
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r
           << dendl;
  return r;
}

void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
{
  const string& omap_prefix = o->get_omap_prefix();
  string prefix, tail;
  o->get_omap_header(&prefix);
  o->get_omap_tail(&tail);
  txc->t->rm_range_keys(omap_prefix, prefix, tail);
  txc->t->rmkey(omap_prefix, tail);
  dout(20) << __func__ << " remove range start: "
           << pretty_binary_string(prefix) << " end: "
           << pretty_binary_string(tail) << dendl;
}

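// Note (added): within an object's omap namespace the header key sorts
// first and the tail sentinel key sorts last, so rm_range_keys(header, tail)
// plus an explicit rmkey of the tail key removes every key belonging to the
// object.
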
int BlueStore::_omap_clear(TransContext *txc,
                           CollectionRef& c,
                           OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
    o->onode.clear_omap_flag();
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::_omap_setkeys(TransContext *txc,
                             CollectionRef& c,
                             OnodeRef& o,
                             bufferlist &bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  auto p = bl.cbegin();
  __u32 num;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags();
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  string final_key;
  o->get_omap_key(string(), &final_key);
  size_t base_key_len = final_key.size();
  decode(num, p);
  while (num--) {
    string key;
    bufferlist value;
    decode(key, p);
    decode(value, p);
    final_key.resize(base_key_len); // keep prefix
    final_key += key;
    dout(20) << __func__ << "  " << pretty_binary_string(final_key)
             << " <- " << key << dendl;
    txc->t->set(prefix, final_key, value);
  }
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::_omap_setheader(TransContext *txc,
                               CollectionRef& c,
                               OnodeRef& o,
                               bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  string key;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags();
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  o->get_omap_header(&key);
  txc->t->set(prefix, key, bl);
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::_omap_rmkeys(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& o,
                            bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  auto p = bl.cbegin();
  __u32 num;
  string final_key;

  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    decode(num, p);
    while (num--) {
      string key;
      decode(key, p);
      final_key.resize(base_key_len); // keep prefix
      final_key += key;
      dout(20) << __func__ << "  rm " << pretty_binary_string(final_key)
               << " <- " << key << dendl;
      txc->t->rmkey(prefix, final_key);
    }
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::_omap_rmkey_range(TransContext *txc,
                                 CollectionRef& c,
                                 OnodeRef& o,
                                 const string& first, const string& last)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  string key_first, key_last;
  int r = 0;
  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->flush();
    o->get_omap_key(first, &key_first);
    o->get_omap_key(last, &key_last);
    txc->t->rm_range_keys(prefix, key_first, key_last);
    dout(20) << __func__ << " remove range start: "
             << pretty_binary_string(key_first) << " end: "
             << pretty_binary_string(key_last) << dendl;
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r
           << dendl;
  return r;
}

int BlueStore::_set_alloc_hint(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& o,
  uint64_t expected_object_size,
  uint64_t expected_write_size,
  uint32_t flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << dendl;
  int r = 0;
  o->onode.expected_object_size = expected_object_size;
  o->onode.expected_write_size = expected_write_size;
  o->onode.alloc_hint_flags = flags;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << " = " << r << dendl;
  return r;
}

int BlueStore::_clone(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& oldo,
                      OnodeRef& newo)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << dendl;
  int r = 0;
  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
    derr << __func__ << " mismatched hash on " << oldo->oid
         << " and " << newo->oid << dendl;
    return -EINVAL;
  }

  _assign_nid(txc, newo);

  // clone data
  oldo->flush();
  _do_truncate(txc, c, newo, 0);
  if (cct->_conf->bluestore_clone_cow) {
    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
  }

  // clone attrs
  newo->onode.attrs = oldo->onode.attrs;

  // clone omap
  if (newo->onode.has_omap()) {
    dout(20) << __func__ << " clearing old omap data" << dendl;
    newo->flush();
    _do_omap_clear(txc, newo);
    newo->onode.clear_omap_flag();
  }
  if (oldo->onode.has_omap()) {
    dout(20) << __func__ << " copying omap data" << dendl;
    if (newo->oid.is_pgmeta()) {
      newo->onode.set_omap_flags_pgmeta();
    } else {
      newo->onode.set_omap_flags();
    }
    const string& prefix = newo->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    oldo->get_omap_header(&head);
    oldo->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        dout(30) << __func__ << " got header/data "
                 << pretty_binary_string(it->key()) << dendl;
        string key;
        newo->rewrite_omap_key(it->key(), &key);
        txc->t->set(prefix, key, it->value());
      }
      it->next();
    }
    string new_tail;
    bufferlist new_tail_value;
    newo->get_omap_tail(&new_tail);
    txc->t->set(prefix, new_tail, new_tail_value);
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " = " << r << dendl;
  return r;
}

int BlueStore::_do_clone_range(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& oldo,
  OnodeRef& newo,
  uint64_t srcoff,
  uint64_t length,
  uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid
           << " 0x" << std::hex << srcoff << "~" << length << " -> "
           << " 0x" << dstoff << "~" << length << std::dec << dendl;
  oldo->extent_map.fault_range(db, srcoff, length);
  newo->extent_map.fault_range(db, dstoff, length);
  _dump_onode<30>(cct, *oldo);
  _dump_onode<30>(cct, *newo);

  oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
  _dump_onode<30>(cct, *oldo);
  _dump_onode<30>(cct, *newo);
  return 0;
}

int BlueStore::_clone_range(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& oldo,
                            OnodeRef& newo,
                            uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec << dendl;
  int r = 0;

  if (srcoff + length >= OBJECT_MAX_SIZE ||
      dstoff + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
    goto out;
  }
  if (srcoff + length > oldo->onode.size) {
    r = -EINVAL;
    goto out;
  }

  _assign_nid(txc, newo);

  if (cct->_conf->bluestore_clone_cow) {
    _do_zero(txc, c, newo, dstoff, length);
    _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
    if (r < 0)
      goto out;
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec
           << " = " << r << dendl;
  return r;
}

int BlueStore::_rename(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& oldo,
                       OnodeRef& newo,
                       const ghobject_t& new_oid)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << new_oid << dendl;
  int r;
  ghobject_t old_oid = oldo->oid;
  mempool::bluestore_cache_meta::string new_okey;

  if (newo) {
    if (newo->exists) {
      r = -EEXIST;
      goto out;
    }
    ceph_assert(txc->onodes.count(newo) == 0);
  }

  txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());

  // rewrite shards
  {
    oldo->extent_map.fault_range(db, 0, oldo->onode.size);
    get_object_key(cct, new_oid, &new_okey);
    string key;
    for (auto &s : oldo->extent_map.shards) {
      generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset,
                                          &key,
        [&](const string& final_key) {
          txc->t->rmkey(PREFIX_OBJ, final_key);
        }
      );
      s.dirty = true;
    }
  }

  newo = oldo;
  txc->write_onode(newo);

  // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
  // Onode in the old slot
  c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
  r = 0;

  // hold a ref to new Onode in old name position, to ensure we don't drop
  // it from the cache before this txc commits (or else someone may come along
  // and read newo's metadata via the old name).
  txc->note_modified_object(oldo);

 out:
  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
           << new_oid << " = " << r << dendl;
  return r;
}

int BlueStore::_create_collection(
  TransContext *txc,
  const coll_t &cid,
  unsigned bits,
  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
  int r;
  bufferlist bl;

  {
    std::unique_lock l(coll_lock);
    if (*c) {
      r = -EEXIST;
      goto out;
    }
    auto p = new_coll_map.find(cid);
    ceph_assert(p != new_coll_map.end());
    *c = p->second;
    (*c)->cnode.bits = bits;
    coll_map[cid] = *c;
    new_coll_map.erase(p);
  }
  encode((*c)->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(cid), bl);
  r = 0;

 out:
  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r
           << dendl;
  return r;
}

int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
                                  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << dendl;
  int r;

  (*c)->flush_all_but_last();
  {
    std::unique_lock l(coll_lock);
    if (!*c) {
      r = -ENOENT;
      goto out;
    }
    size_t nonexistent_count = 0;
    ceph_assert((*c)->exists);
    if ((*c)->onode_map.map_any([&](Onode* o) {
          if (o->exists) {
            dout(1) << __func__ << " " << o->oid << " " << o
                    << " exists in onode_map" << dendl;
            return true;
          }
          ++nonexistent_count;
          return false;
        })) {
      r = -ENOTEMPTY;
      goto out;
    }

    vector<ghobject_t> ls;
    ghobject_t next;
    // Enumerate onodes in db, up to nonexistent_count + 1,
    // then check if all of them are marked as non-existent.
    // Bypass the check if (next != ghobject_t::get_max())
    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
                         nonexistent_count + 1, false, &ls, &next);
    if (r >= 0) {
      // If true, the collection has more objects than nonexistent_count,
      // so bypass the check.
      bool exists = (!next.is_max());
      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
        dout(10) << __func__ << " oid " << *it << dendl;
        auto onode = (*c)->onode_map.lookup(*it);
        exists = !onode || onode->exists;
        if (exists) {
          dout(1) << __func__ << " " << *it
                  << " exists in db, "
                  << (!onode ? "not present in ram" : "present in ram")
                  << dendl;
        }
      }
      if (!exists) {
        _do_remove_collection(txc, c);
        r = 0;
      } else {
        dout(10) << __func__ << " " << cid
                 << " is non-empty" << dendl;
        r = -ENOTEMPTY;
      }
    }
  }

 out:
  dout(10) << __func__ << " " << cid << " = " << r << dendl;
  return r;
}

void BlueStore::_do_remove_collection(TransContext *txc,
                                      CollectionRef *c)
{
  coll_map.erase((*c)->cid);
  txc->removed_collections.push_back(*c);
  (*c)->exists = false;
  _osr_register_zombie((*c)->osr.get());
  txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
  c->reset();
}

int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  std::unique_lock l(c->lock);
  std::unique_lock l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer. this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split. leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // the destination should initially be empty.
  ceph_assert(d->onode_map.empty());
  ceph_assert(d->shared_blob_set.empty());
  ceph_assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits. note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  ceph_assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

int BlueStore::_merge_collection(
  TransContext *txc,
  CollectionRef *c,
  CollectionRef& d,
  unsigned bits)
{
  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
           << " bits " << bits << dendl;
  std::unique_lock l((*c)->lock);
  std::unique_lock l2(d->lock);
  int r;

  coll_t cid = (*c)->cid;

  // flush all previous deferred writes on the source collection to ensure
  // that all deferred writes complete before we merge, as the target
  // collection's sequencer may need to order new ops after those writes.
  _osr_drain((*c)->osr.get());

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split. leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // adjust bits. note that this will be redundant for all but the first
  // merge call for the parent/target.
  d->cnode.bits = bits;

  // behavior depends on target (d) bits, so do this after that is updated.
  (*c)->split_cache(d.get());

  // remove source collection
  {
    std::unique_lock l3(coll_lock);
    _do_remove_collection(txc, c);
  }

  r = 0;

  bufferlist bl;
  encode(d->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(d->cid), bl);

  dout(10) << __func__ << " " << cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

void BlueStore::log_latency(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  const char* info) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << info
            << dendl;
  }
}
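
// A minimal usage sketch (hypothetical call site; the counter index and
// threshold knob shown here are illustrative, not taken from this function):
//
//   auto start = mono_clock::now();
//   ... do work ...
//   log_latency(__func__,
//               l_bluestore_commit_lat,             // perf counter index
//               mono_clock::now() - start,          // observed latency
//               cct->_conf->bluestore_log_op_age);  // slow-op threshold, secs
//
// Every sample feeds the perf counter; anything at or above the threshold
// is additionally logged at level 0.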
void BlueStore::log_latency_fn(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  std::function<string (const ceph::timespan& lat)> fn) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << fn(l)
            << dendl;
  }
}
#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  pending_kv_ios += txc.ios;
  if (txc.deferred_txn) {
    pending_deferred_ios += txc.ios;
  }

  uint64_t started = 0;
  uint64_t completed = 0;
  if (should_trace(&started, &completed)) {
    txc.tracing = true;
    uint64_t rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate;

    db.get_property(
      "rocksdb.base-level",
      &rocksdb_base_level);
    db.get_property(
      "rocksdb.estimate-pending-compaction-bytes",
      &rocksdb_estimate_pending_compaction_bytes);
    db.get_property(
      "rocksdb.cur-size-all-mem-tables",
      &rocksdb_cur_size_all_mem_tables);
    db.get_property(
      "rocksdb.compaction-pending",
      &rocksdb_compaction_pending);
    db.get_property(
      "rocksdb.mem-table-flush-pending",
      &rocksdb_mem_table_flush_pending);
    db.get_property(
      "rocksdb.num-running-compactions",
      &rocksdb_num_running_compactions);
    db.get_property(
      "rocksdb.num-running-flushes",
      &rocksdb_num_running_flushes);
    db.get_property(
      "rocksdb.actual-delayed-write-rate",
      &rocksdb_actual_delayed_write_rate);

    tracepoint(
      bluestore,
      transaction_initial_state,
      txc.osr->get_sequencer_id(),
      txc.seq,
      throttle_bytes.get_current(),
      throttle_deferred_bytes.get_current(),
      pending_kv_ios,
      pending_deferred_ios,
      started,
      completed,
      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));

    tracepoint(
      bluestore,
      transaction_initial_state_rocksdb,
      txc.osr->get_sequencer_id(),
      txc.seq,
      rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate);
  }
}
#endif
mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
  TransContext &txc, PerfCounters *logger, int state)
{
  mono_clock::time_point now = mono_clock::now();
  mono_clock::duration lat = now - txc.last_stamp;
  logger->tinc(state, lat);
#if defined(WITH_LTTNG)
  if (txc.tracing &&
      state >= l_bluestore_state_prepare_lat &&
      state <= l_bluestore_state_done_lat) {
    OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
    tracepoint(
      bluestore,
      transaction_state_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      state,
      ceph::to_seconds<double>(lat));
  }
#endif
  txc.last_stamp = now;
  return lat;
}
bool BlueStore::BlueStoreThrottle::try_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  throttle_bytes.get(txc.cost);

  if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
    emit_initial_tracepoint(db, txc, start_throttle_acquire);
    return true;
  } else {
    return false;
  }
}
void BlueStore::BlueStoreThrottle::finish_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  ceph_assert(txc.deferred_txn);
  throttle_deferred_bytes.get(txc.cost);
  emit_initial_tracepoint(db, txc, start_throttle_acquire);
}
#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(mono_clock::now() - txc.start));
  }
}
#endif
#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(lat));
  }
}
#endif
// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";
int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}
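
// For illustration, how the slab math buckets sizes (values assume the
// KEY_SLAB/VALUE_SLAB defaults of 32 and 64 defined above):
//
//   get_key_slab(100)            -> 100 / 32 = 3
//   get_key_slab_to_range(3)     -> "[96,128)"
//   get_value_slab(100)          -> 100 / 64 = 1
//   get_value_slab_to_range(1)   -> "[64,128)"
//
// i.e. slab N covers sizes in [N * SLAB, (N + 1) * SLAB).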
void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
                                               const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    std::max<size_t>(value_size,
                     key_hist[prefix][key_slab].val_map[value_slab].max_len);
}
void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}
// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  // globals
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_pgmeta_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  auto start = coarse_mono_clock::now();

  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = std::max(max_key_size, key_size);
    max_value_size = std::max(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PGMETA_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
      num_pgmeta_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  ceph::timespan duration = coarse_mono_clock::now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}
void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}
// For external caller.
// We use a best-effort policy instead, e.g.,
// we don't care if there are still some pinned onodes/data in the cache
// after this command is completed.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}
void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}
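
// For illustration (hypothetical numbers, 0x1000-byte alignment): a write
// of 0x100 bytes at offset 0x1234 sits inside block [0x1000, 0x2000), so
//
//   head_pad = 0x1234 - 0x1000            = 0x234
//   tail_pad = 0x2000 - (0x1234 + 0x100)  = 0xccc
//
// and the padded bufferlist covers the full 0x1000-byte block, with
// l_bluestore_write_pad_bytes credited 0x234 + 0xccc = 0xf00 bytes.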
void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
               << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
           << " (" << onode_part << " bytes onode + "
           << blob_part << " bytes spanning blobs + "
           << extent_part << " bytes inline extents)"
           << dendl;

  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}
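
// The bound-then-encode sequence above is the usual denc two-pass idiom:
// pass one accumulates a worst-case size estimate so a single contiguous
// buffer can be reserved, pass two encodes into it with no reallocation.
// A minimal standalone sketch of the same idiom (hypothetical payload):
//
//   size_t bound = 0;
//   denc(payload, bound);                        // pass 1: size only
//   bufferlist bl;
//   {
//     auto p = bl.get_contiguous_appender(bound, true);
//     denc(payload, p);                          // pass 2: real encode
//   }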
void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
        first = false;
      } else {
        s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}
void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
                                          size_t extents)
{
  alloc_stats_count++;
  alloc_stats_fragments += extents;
  alloc_stats_size += need;
}
void BlueStore::_record_allocation_stats()
{
  // don't care about data consistency,
  // fields can be partially modified while making the tuple
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
          << probe_count << ":"
          << " cnt: " << std::get<0>(t0)
          << " frags: " << std::get<1>(t0)
          << " size: " << std::get<2>(t0)
          << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
            << base + (probe_count % base) << ": "
            << std::get<0>(t)
            << ", " << std::get<1>(t)
            << ", " << std::get<2>(t)
            << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  auto prev = probe_count++;
  auto mask = (1 << alloc_stats_history.size()) - 1;
  probe_count &= mask;

  for (size_t i = cbits(prev ^ probe_count) - 1; i > 0; --i) {
    alloc_stats_history[i] = alloc_stats_history[i - 1];
  }
  alloc_stats_history[0].swap(t0);
}
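
// For illustration of the history rotation (assuming five history slots
// per the -1, -2, -4, -8, -16 comment above, so mask == 0x1f): slot k is
// meant to hold a probe roughly 2^k probes old.  When probe_count
// advances 7 -> 8,
//
//   prev ^ probe_count = 0b0111 ^ 0b1000 = 0b1111, cbits(0b1111) = 4,
//
// so slots 3..1 each inherit their younger neighbour and the freshest
// sample lands in slot 0; higher slots age in place until a larger
// power-of-two boundary is crossed.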
// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't call for the second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}
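
// For illustration (hypothetical granularity of 0x10000): an extent
// {offset = 0x28000, length = 0x4000} touches tracker slots
//
//   pos     = 0x28000 / 0x10000                     = 2
//   end_pos = 1 + (0x28000 + 0x4000 - 1) / 0x10000  = 3
//
// i.e. just slot 2; only slots whose bloom filters saw at least one
// insertion survive the filtering, everything else is dropped.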
bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}

void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
{
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append("1");
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}
bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }
  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}
bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}
bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}

bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}
bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
{
  // this is just a stub to count num of repairs properly,
  // actual repair happens in BlueStore::_close_db_and_around()
  // while doing _sync_bluefs_and_fm
  ++out_of_sync_flag;
  ++to_repair_cnt;
  return true;
}

KeyValueDB::Transaction BlueStoreRepairer::fix_spanning_blobs(KeyValueDB* db)
{
  if (!fix_onode_txn) {
    fix_onode_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  return fix_onode_txn;
}
bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
  }
  return true;
}
unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_onode_txn) {
    db->submit_transaction_sync(fix_onode_txn);
    fix_onode_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }
  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}
// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}
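
// For illustration (hypothetical sizes): with db_total = 30 GiB, observed
// DB-device maximums summing to max_db_use = 22 GiB, and db_avail4slow =
// 10 GiB, the headroom offered to SLOW data is
//
//   avail = min(db_avail4slow, db_total - max_db_use)
//         = min(10 GiB, 8 GiB) = 8 GiB
//
// so SLOW files are steered to BDEV_DB only while their current usage on
// that device stays below 8 GiB.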
void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
}
void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore. the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0). the wal device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    }
    else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}
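
// For illustration: with the default directory layout, "db" maps to
// LEVEL_DB (the fallthrough), "db.slow" to LEVEL_SLOW, and "db.wal" to
// LEVEL_WAL, so e.g. a file created under "db.wal/" inherits the WAL
// placement hint.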
void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();
  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
       << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
       << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
       << ", db_avail:" << db_avail4slow << std::endl
       << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names{ {
    "DEV/LEV",
    "WAL",
    "DB",
    "SLOW",
    "*",
    "*",
    "REAL",
    "FILES",
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}

// =======================================================