// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include <sys/types.h>

#include <boost/container/flat_set.hpp>
#include "boost/algorithm/string.hpp"

#include "include/cpp-btree/btree_set.h"

#include "bluestore_common.h"
#include "BlueStore.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif
#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);
// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_SharedBlob);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE 4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED 8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4
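
// Illustration (editor's note, not part of the on-disk spec): assuming a
// local (non-spanning) blob id N, a zero blob_offset and a length equal to
// the previous extent's, the blobid field would be packed roughly as
//   (N << BLOBID_SHIFT_BITS) | BLOBID_FLAG_ZEROOFFSET | BLOBID_FLAG_SAMELENGTH
// leaving the low 4 bits free for the BLOBID_FLAG_* values above.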
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 */
#define ONODE_KEY_SUFFIX 'o'

#define EXTENT_SHARD_KEY_SUFFIX 'x'
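
// Worked example (illustrative, hypothetical object): an object in pool 1,
// shard NO_SHARD, whose key equals its name, would produce a key shaped like
//   <shard+0x80> <pool+2^63> <bit-reversed hash>
//   <escaped namespace> <escaped name> '=' <snap> <generation> 'o'
// so that a plain byte-wise compare of keys approximates ghobject_t ordering.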
/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 *
 * NOTE: There is a bug in this implementation: due to implicit
 * character type conversion in comparison it may produce unexpected
 * ordering. Unfortunately fixing the bug would mean invalidating the
 * keys in existing deployments. Instead we do additional sorting
 * where it is needed.
 */
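// Example (illustrative): escaping "a#b" yields "a#23b!" -- '#' (0x23)
// satisfies *i <= '#' and becomes "#23", 'a' and 'b' pass through unescaped,
// and '!' terminates the string.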
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}
inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - '0';
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}
static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}
// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i = 0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}
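
// Example (illustrative): pretty_binary_string(string("S\x00\x01\x02\x03", 5))
// renders as 'S'0x00010203 -- printable runs are quoted, binary runs are
// hex-dumped, four bytes at a time where possible.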
template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}
static void get_coll_range(const coll_t& cid, int bits,
                           ghobject_t *temp_start, ghobject_t *temp_end,
                           ghobject_t *start, ghobject_t *end)
{
  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    start->shard_id = pgid.shard;
    *temp_start = *start;

    start->hobj.pool = pgid.pool();
    temp_start->hobj.pool = -2ll - pgid.pool();

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    start->hobj.set_bitwise_key_u32(reverse_hash);
    temp_start->hobj.set_bitwise_key_u32(reverse_hash);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    end->hobj.set_bitwise_key_u32(end_hash);
    temp_end->hobj.set_bitwise_key_u32(end_hash);
  } else {
    start->shard_id = shard_id_t::NO_SHARD;
    start->hobj.pool = -1ull;

    *end = *start;
    start->hobj.set_bitwise_key_u32(0);
    end->hobj.set_bitwise_key_u32(0xffffffff);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }

  start->generation = 0;
  end->generation = 0;
  temp_start->generation = 0;
  temp_end->generation = 0;
}
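
// Example (illustrative numbers): for a pg collection with bits = 4 and
// pgid.ps() = 1, reverse_hash = 0x80000000 and end_hash = 0x80000000 +
// (1 << 28) = 0x90000000, so the collection spans bitwise keys
// [0x80000000, 0x90000000).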
static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}
template<typename S>
static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
}

static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
{
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  uint32_t hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  return p;
}

#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < ENCODED_KEY_PREFIX_LEN)
    return -1;

  p = _key_decode_prefix(p, oid);

  if (key.length() == ENCODED_KEY_PREFIX_LEN)
    return -2;

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = ENCODED_KEY_PREFIX_LEN +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_prefix(oid, key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  ghobject_t t;
  int r = get_key_object(*key, &t);
  if (r || t != oid) {
    derr << "  r " << r << dendl;
    derr << "key " << pretty_binary_string(*key) << dendl;
    derr << "oid " << oid << dendl;
    derr << "  t " << t << dendl;
    ceph_assert(r == 0 && t == oid);
  }
}
// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}
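
// Example (illustrative): the shard key for onode key K at logical offset
// 0x30000 is K + encode_u32(0x30000) + 'x'; is_extent_shard_key() below can
// classify a key by looking only at its final byte.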
template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}
int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}
static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}
template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << "  shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << "  " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << "      csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << "       0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}
template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
                  << " nid " << o.onode.nid
                  << " size 0x" << std::hex << o.onode.size
                  << " (" << std::dec << o.onode.size << ")"
                  << " expected_object_size " << o.onode.expected_object_size
                  << " expected_write_size " << o.onode.expected_write_size
                  << " in " << o.onode.extent_map_shards.size() << " shards"
                  << ", " << o.extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << "  attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}
template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}
struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    ceph_assert(llen == rlen);
    ceph_assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const ceph_le64* lv = (const ceph_le64*)ldata;
    const ceph_le64* rv = (const ceph_le64*)rdata;
    ceph_le64* nv = &(ceph_le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  const char *name() const override {
    return "int64_array";
  }
};

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}
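
// Example (illustrative) for Int64ArrayMergeOperator above: merging the
// little-endian u64 arrays {1, 2} and {10, 20} yields {11, 22}; BlueStore
// relies on this to fold statfs deltas into PREFIX_STAT values without a
// read-modify-write cycle.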
namespace {

/*
 * Due to a bug in key string encoding (see a comment for append_escaped)
 * the KeyValueDB iterator does not lexicographically sort the same
 * way that ghobject_t does: objects with the same hash may have wrong order.
 *
 * This is the iterator wrapper that fixes the keys order.
 */
class CollectionListIterator {
public:
  CollectionListIterator(const KeyValueDB::Iterator &it)
    : m_it(it) {
  }
  virtual ~CollectionListIterator() {
  }

  virtual bool valid() const = 0;
  virtual const ghobject_t &oid() const = 0;
  virtual void lower_bound(const ghobject_t &oid) = 0;
  virtual void upper_bound(const ghobject_t &oid) = 0;
  virtual void next() = 0;

  virtual int cmp(const ghobject_t &oid) const = 0;

  bool is_ge(const ghobject_t &oid) const {
    return cmp(oid) >= 0;
  }

  bool is_lt(const ghobject_t &oid) const {
    return cmp(oid) < 0;
  }

protected:
  KeyValueDB::Iterator m_it;
};
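
// Editor's note on the two implementations that follow:
// SimpleCollectionListIterator trusts the raw KeyValueDB key order, while
// SortedCollectionListIterator re-sorts one hash chunk of onode keys at a
// time through a std::map to compensate for the append_escaped ordering bug.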
class SimpleCollectionListIterator : public CollectionListIterator {
public:
  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_cct(cct) {
  }

  bool valid() const override {
    return m_it->valid();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_oid;
  }

  void lower_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->lower_bound(key);
    get_oid();
  }

  void upper_bound(const ghobject_t &oid) override {
    string key;
    get_object_key(m_cct, oid, &key);

    m_it->upper_bound(key);
    get_oid();
  }

  void next() override {
    ceph_assert(valid());

    m_it->next();
    get_oid();
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    string key;
    get_object_key(m_cct, oid, &key);

    return m_it->key().compare(key);
  }

private:
  CephContext *m_cct;
  ghobject_t m_oid;

  void get_oid() {
    if (!m_it->valid()) {
      return;
    }

    if (is_extent_shard_key(m_it->key())) {
      next();
      return;
    }

    m_oid = ghobject_t();
    int r = get_key_object(m_it->key(), &m_oid);
    ceph_assert(r == 0);
  }
};
class SortedCollectionListIterator : public CollectionListIterator {
public:
  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
  }

  bool valid() const override {
    return m_chunk_iter != m_chunk.end();
  }

  const ghobject_t &oid() const override {
    ceph_assert(valid());

    return m_chunk_iter->first;
  }

  void lower_bound(const ghobject_t &oid) override {
    std::string key;
    _key_encode_prefix(oid, &key);

    m_it->lower_bound(key);
    m_chunk_iter = m_chunk.end();
    if (!get_next_chunk()) {
      return;
    }

    if (this->oid().shard_id != oid.shard_id ||
        this->oid().hobj.pool != oid.hobj.pool ||
        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
      return;
    }

    m_chunk_iter = m_chunk.lower_bound(oid);
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  void upper_bound(const ghobject_t &oid) override {
    lower_bound(oid);

    if (valid() && this->oid() == oid) {
      next();
    }
  }

  void next() override {
    ceph_assert(valid());

    m_chunk_iter++;
    if (m_chunk_iter == m_chunk.end()) {
      get_next_chunk();
    }
  }

  int cmp(const ghobject_t &oid) const override {
    ceph_assert(valid());

    if (this->oid() < oid) {
      return -1;
    }
    if (this->oid() > oid) {
      return 1;
    }
    return 0;
  }

private:
  std::map<ghobject_t, std::string> m_chunk;
  std::map<ghobject_t, std::string>::iterator m_chunk_iter;

  bool get_next_chunk() {
    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
      m_it->next();
    }

    if (!m_it->valid()) {
      return false;
    }

    ghobject_t oid;
    int r = get_key_object(m_it->key(), &oid);
    ceph_assert(r == 0);

    m_chunk.clear();
    while (true) {
      m_chunk.insert({oid, m_it->key()});

      do {
        m_it->next();
      } while (m_it->valid() && is_extent_shard_key(m_it->key()));

      if (!m_it->valid()) {
        break;
      }

      ghobject_t next;
      r = get_key_object(m_it->key(), &next);
      ceph_assert(r == 0);
      if (next.shard_id != oid.shard_id ||
          next.hobj.pool != oid.hobj.pool ||
          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
        break;
      }
      oid = next;
    }

    m_chunk_iter = m_chunk.begin();
    return true;
  }
};

} // anonymous namespace
void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {
      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.insert(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}
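
// Worked example (illustrative numbers): a compressed blob occupying
// 0x30000 bytes on disk with min_alloc_size = 0x10000 would free
// blob_expected_for_release = 3 AUs if collected; if rewriting the
// protruding references is expected to cost 1 new AU, benefit = 2 and the
// blob is collected when bluestore_gc_enable_blob_threshold <= 2.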
int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t >();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << this << " " << " " << " " << o->oid << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << this << " " << " " << " " << o->oid << " unpinned" << dendl;
  }
  void _unpin_and_rm(BlueStore::Onode* o) override
  {
    o->pop_cache();
    ceph_assert(num_pinned);
    --num_pinned;
    ceph_assert(num);
    --num;
  }
  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << "  rm " << o->oid << " "
               << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
        lru.erase(p--);
      } else {
        ceph_assert(n == 0);
        lru.erase(p);
      }
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};

BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
    CephContext* cct,
    string type,
    PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}

// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
        // stop if lru is now empty
        break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
           << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
        derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
};
// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted
  uint64_t buffer_bytes = 0; ///< bytes

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
             << " on " << *b
             << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        warm_in.insert(warm_in.iterator_to(*near), *b);
        break;
      case BUFFER_WARM_OUT:
        ceph_assert(b->is_empty());
        warm_out.insert(warm_out.iterator_to(*near), *b);
        break;
      case BUFFER_HOT:
        hot.insert(hot.iterator_to(*near), *b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
        warm_in.push_front(*b);
      } else {
        // take caller hint to start at the back of the warm queue
        warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // stay in warm_in.  move to front, even though 2Q doesn't actually
        // do this.
        dout(20) << __func__ << " move to front of warm " << *b << dendl;
        warm_in.push_front(*b);
        break;
      case BUFFER_WARM_OUT:
        b->cache_private = BUFFER_HOT;
        // move to hot.  fall-thru
      case BUFFER_HOT:
        dout(20) << __func__ << " move to front of hot " << *b << dendl;
        hot.push_front(*b);
        break;
      default:
        ceph_abort_msg("bad cache_private");
      }
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
        uint64_t avg_size = buffer_bytes / buffer_num;
        ceph_assert(avg_size);
        uint64_t calculated_num = max / avg_size;
        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
        // hot is small, give slack to warm_in
        kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
        // warm_in is small, give slack to hot
        khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = warm_in.rbegin();
        if (p == warm_in.rend()) {
          // stop if warm_in list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        ceph_assert(b->is_clean());
        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
        ceph_assert(buffer_bytes >= b->length);
        buffer_bytes -= b->length;
        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
        list_bytes[BUFFER_WARM_IN] -= b->length;
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->state = BlueStore::Buffer::STATE_EMPTY;
        b->data.clear();
        warm_in.erase(warm_in.iterator_to(*b));
        warm_out.push_front(*b);
        b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from warm_in list, done evicting warm_in buffers"
                 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
        auto p = hot.rbegin();
        if (p == hot.rend()) {
          // stop if hot list is now empty
          break;
        }

        BlueStore::Buffer *b = &*p;
        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
        ceph_assert(b->is_clean());
        // adjust evict size before buffer goes invalid
        to_evict_bytes -= b->length;
        evicted += b->length;
        b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
                 << " from hot list, done evicting hot buffers"
                 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
        BlueStore::Buffer *b = &*warm_out.rbegin();
        ceph_assert(b->is_empty());
        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
        b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
                 uint64_t *blobs,
                 uint64_t *buffers,
                 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
           << list_bytes[BUFFER_HOT]
           << " != actual " << hot_bytes
           << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
           << list_bytes[BUFFER_WARM_IN]
           << " != actual " << warm_in_bytes
           << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
           << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
             << " ok" << dendl;
  }
};

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
    CephContext* cct,
    string type,
    PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}
* cache
, uint32_t offset
, uint32_t length
)
1644 // note: we already hold cache->lock
1645 ldout(cache
->cct
, 20) << __func__
<< std::hex
<< " 0x" << offset
<< "~" << length
1646 << std::dec
<< dendl
;
1647 int cache_private
= 0;
1648 cache
->_audit("discard start");
1649 auto i
= _data_lower_bound(offset
);
1650 uint32_t end
= offset
+ length
;
1651 while (i
!= buffer_map
.end()) {
1652 Buffer
*b
= i
->second
.get();
1653 if (b
->offset
>= end
) {
1656 if (b
->cache_private
> cache_private
) {
1657 cache_private
= b
->cache_private
;
1659 if (b
->offset
< offset
) {
1660 int64_t front
= offset
- b
->offset
;
1661 if (b
->end() > end
) {
1662 // drop middle (split)
1663 uint32_t tail
= b
->end() - end
;
1664 if (b
->data
.length()) {
1666 bl
.substr_of(b
->data
, b
->length
- tail
, tail
);
1667 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
);
1668 nb
->maybe_rebuild();
1669 _add_buffer(cache
, nb
, 0, b
);
1671 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, tail
),
1674 if (!b
->is_writing()) {
1675 cache
->_adjust_size(b
, front
- (int64_t)b
->length
);
1679 cache
->_audit("discard end 1");
1683 if (!b
->is_writing()) {
1684 cache
->_adjust_size(b
, front
- (int64_t)b
->length
);
1692 if (b
->end() <= end
) {
1693 // drop entire buffer
1694 _rm_buffer(cache
, i
++);
1698 uint32_t keep
= b
->end() - end
;
1699 if (b
->data
.length()) {
1701 bl
.substr_of(b
->data
, b
->length
- keep
, keep
);
1702 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
);
1703 nb
->maybe_rebuild();
1704 _add_buffer(cache
, nb
, 0, b
);
1706 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, keep
), 0, b
);
1708 _rm_buffer(cache
, i
);
1709 cache
->_audit("discard end 2");
1712 return cache_private
;
void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = min(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}
void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_audit("finish_write end");
}
void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
                                               OnodeRef& o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add(o.get(), 1);
  return o;
}
void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
{
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
  onode_map.erase(oid);
}
BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << " " << p->second->nref
                            << " " << p->second->cached
                            << " " << p->second->pinned
                            << dendl;
      // This will pin onode and implicitly touch the cache when Onode
      // eventually will become unpinned
      o = p->second;
      ceph_assert(!o->cached || o->pinned);

      hit = true;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}
void BlueStore::OnodeSpace::clear()
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
  for (auto &p : onode_map) {
    cache->_rm(p.second.get());
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard l(cache->lock);
  return onode_map.empty();
}
void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_meta::string& new_okey)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  ceph_assert(po != pn);

  ceph_assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << "  removing target " << pn->second
                          << dendl;
    cache->_rm(pn->second.get());
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add(oldo.get(), 1);
  // add at new position and fix oid, key.
  // This will pin 'o' and implicitly touch cache
  // when it will eventually become unpinned
  onode_map.insert(make_pair(new_oid, o));
  ceph_assert(o->pinned);

  o->oid = new_oid;
  o->key = new_okey;
}
bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second.get())) {
      return true;
    }
  }
  return false;
}

template <int LogLevelV = 30>
void BlueStore::OnodeSpace::dump(CephContext *cct)
{
  for (auto& i : onode_map) {
    ldout(cct, LogLevelV) << i.first << " : " << i.second
                          << " " << i.second->nref
                          << " " << i.second->cached
                          << " " << i.second->pinned
                          << dendl;
  }
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
#undef dout_context
#define dout_context coll->store->cct

void BlueStore::SharedBlob::dump(Formatter* f) const
{
  f->dump_bool("loaded", loaded);
  if (loaded) {
    persistent->dump(f);
  } else {
    f->dump_unsigned("sbid_unloaded", sbid_unloaded);
  }
}

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  ceph_assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}
BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}

void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    dout(20) << __func__ << " " << this
             << " removing self from set " << get_parent()
             << dendl;
  again:
    auto coll_snap = coll;
    if (coll_snap) {
      std::lock_guard l(coll_snap->cache->lock);
      if (coll_snap != coll) {
        goto again;
      }
      if (!coll_snap->shared_blob_set.remove(this, true)) {
        // race with lookup
        return;
      }
      bc._clear(coll_snap->cache);
      coll_snap->cache->rm_blob();
    }
    delete this;
  }
}
void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  ceph_assert(persistent);
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
                                    PExtentVector *r,
                                    bool *unshare)
{
  ceph_assert(persistent);
  persistent->ref_map.put(offset, length, r,
                          unshare && !*unshare ? unshare : nullptr);
}
void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    BufferCacheShard *cache = coll->cache;
    std::lock_guard l(cache->lock);
    if (coll->cache != cache) {
      dout(20) << __func__
               << " raced with sb cache update, was " << cache
               << ", now " << coll->cache << ", retrying"
               << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

template <int LogLevelV = 30>
void BlueStore::SharedBlobSet::dump(CephContext *cct)
{
  std::lock_guard l(lock);
  for (auto& i : sb_map) {
    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
  }
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

void BlueStore::Blob::dump(Formatter* f) const
{
  if (is_spanning()) {
    f->dump_unsigned("spanning_id ", id);
  }
  blob.dump(f);
  if (shared_blob) {
    f->dump_object("shared", *shared_blob);
  }
}

ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
  if (b.shared_blob) {
    out << " " << *b.shared_blob;
  } else {
    out << " (shared_blob=NULL)";
  }
  out << ")";
  return out;
}
*coll
)
2161 if (get_blob().is_shared()) {
2164 if (get_blob().is_compressed()) {
2165 bool discard
= false;
2166 bool all_invalid
= true;
2167 for (auto e
: get_blob().get_extents()) {
2168 if (!e
.is_valid()) {
2171 all_invalid
= false;
2174 ceph_assert(discard
== all_invalid
); // in case of compressed blob all
2175 // or none pextents are invalid.
2177 shared_blob
->bc
.discard(shared_blob
->get_cache(), 0,
2178 get_blob().get_logical_length());
2182 for (auto e
: get_blob().get_extents()) {
2183 if (!e
.is_valid()) {
2184 dout(20) << __func__
<< " 0x" << std::hex
<< pos
2186 << std::dec
<< dendl
;
2187 shared_blob
->bc
.discard(shared_blob
->get_cache(), pos
, e
.length
);
2191 if (get_blob().can_prune_tail()) {
2192 dirty_blob().prune_tail();
2193 used_in_blob
.prune_tail(get_blob().get_ondisk_length());
2194 dout(20) << __func__
<< " pruned tail, now " << get_blob() << dendl
;
void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // Caller has to initialize Blob's logical length prior to incrementing
  // references.  Otherwise one is neither able to determine the required
  // amount of counters in case of per-au tracking nor obtain min_release_size
  // for single counter mode.
  ceph_assert(get_blob().get_logical_length() != 0);
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}
bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);
  r->clear();
  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}
bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  ceph_assert(min_alloc_size);
  ceph_assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently for the sake of simplicity we omit blob reuse if data is
  // unaligned with csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than current blob len
  target_blob_size = std::max(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data totally stands out of the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = std::max(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }
    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}
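
// Worked example (illustrative numbers, assuming no unused bitmap): with
// blen = 0x8000, target_blob_size = 0x10000, b_offset = 0x8000 and
// length = 0xc000, new_blen = 0x14000 and overflow = 0x4000 < length, so
// length is clipped to 0x8000 and the blob grows a tail to 0x10000 --
// the write can reuse the blob.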
void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  ceph_assert(blob.can_split());
  ceph_assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}
#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::const_iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif
2378 void BlueStore::Extent::dump(Formatter
* f
) const
2380 f
->dump_unsigned("logical_offset", logical_offset
);
2381 f
->dump_unsigned("length", length
);
2382 f
->dump_unsigned("blob_offset", blob_offset
);
2383 f
->dump_object("blob", *blob
);
2386 ostream
& operator<<(ostream
& out
, const BlueStore::Extent
& e
)
2388 return out
<< std::hex
<< "0x" << e
.logical_offset
<< "~" << e
.length
2389 << ": 0x" << e
.blob_offset
<< "~" << e
.length
<< std::dec
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = !b->is_referenced();
  return oe;
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
#undef dout_context
#define dout_context onode->c->store->cct

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}
void BlueStore::ExtentMap::dump(Formatter* f) const
{
  f->open_array_section("extents");
  for (auto& e : extent_map) {
    f->dump_object("extent", e);
  }
  f->close_section();
}
void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
  CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
  uint64_t& length, uint64_t& dstoff) {

  auto cct = onode->c->store->cct;
  bool inject_21040 =
    cct->_conf->bluestore_debug_inject_bug21040;
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto& e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }

  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << "  src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(b->_assign_blobid(txc), e.blob);
        if (!inject_21040 && !src_dirty) {
          src_dirty = true;
          dirty_range_begin = e.logical_offset;
        } else if (inject_21040 &&
                   dirty_range_begin == 0 && dirty_range_end == 0) {
          dirty_range_begin = e.logical_offset;
        }
        ceph_assert(e.logical_end() > 0);
        // -1 to exclude next potential shard
        dirty_range_end = e.logical_end() - 1;
      } else {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << "    new " << *cb << dendl;
    }

    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }

    Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
      e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << "  dst " << *ne << dendl;
    ++n;
  }
  if ((!inject_21040 && src_dirty) ||
      (inject_21040 && dirty_range_end > dirty_range_begin)) {
    oldo->extent_map.dirty_range(dirty_range_begin,
      dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
}
void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
      ceph_assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << "  inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << "  encode_some needs reshard" << dendl;
            ceph_assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << "  shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf()->bluestore_extent_map_shard_min_size) {
            ceph_assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}
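// Illustrative note on the thresholds above (numbers hypothetical, not
// this build's defaults): if bluestore_extent_map_shard_max_size were
// 1200 and a shard encoded to 1500 bytes, update() would request a
// reshard over that shard's range; a non-trailing shard that encodes
// below bluestore_extent_map_shard_min_size instead asks to merge with
// the smaller of its neighbours by extending the reshard interval one
// byte past endoff.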
bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (bid >= 0)
    return bid;
  // Find next unused bid;
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}
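// Illustrative sketch of the id-probing strategy above (comment added
// here, not in the original source): the common case is O(1) -- one past
// the largest id already in spanning_blob_map.  Only if that increment
// overflows bid_t does the code fall back to probing from a random
// starting point, wrapping at the type's maximum, and it aborts only
// after a full cycle proves every id is taken.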
void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << "   spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << "   shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // we may need to fault in a larger interval later; we must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
      );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / std::max(1u, extents);
  dout(20) << __func__ << "  extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << "  new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << "  new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << "  new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << "  new " << new_shard_info << dendl;
  dout(20) << __func__ << "  old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    ceph_assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << "  fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);

    bool was_too_many_blobs_check = false;
    auto too_many_blobs_threshold =
      g_conf()->bluestore_debug_too_many_blobs_threshold;
    auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;

    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        ceph_assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << "  shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }

      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning.  We prefer to cut the blob if we can.  Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << "    splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << "    adding spanning " << *b << dendl;
            if (!was_too_many_blobs_check &&
                too_many_blobs_threshold &&
                spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {

              was_too_many_blobs_check = true;
              for (size_t i = 0; i < dumped_onodes.size(); ++i) {
                if (dumped_onodes[i].first == onode->oid) {
                  oid_slot = &dumped_onodes[i];
                  break;
                }
                if (!oldest_slot || (oldest_slot &&
                    dumped_onodes[i].second < oldest_slot->second)) {
                  oldest_slot = &dumped_onodes[i];
                }
              }
            }
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << "    un-spanning " << *e->blob << dendl;
        }
      }
    }
    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
      (oid_slot &&
        (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
    if (do_dump) {
      dout(0) << __func__
              << " spanning blob count exceeds threshold, "
              << spanning_blob_map.size() << " spanning blobs"
              << dendl;
      _dump_onode<0>(cct, *onode);
      if (oid_slot) {
        oid_slot->second = mono_clock::now();
      } else {
        ceph_assert(oldest_slot);
        oldest_slot->first = onode->oid;
        oldest_slot->second = mono_clock::now();
      }
    }
  }

  clear_needs_reshard();
}
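// Worked example of the shard-sizing loop above (numbers invented for
// this note): with extent_avg = 100 bytes, target = 500 and slop = 100
// (a 0.2 slop ratio), a shard boundary is emitted once estimate + 100
// would exceed 500, or 600 when the candidate boundary would land inside
// a blob; shards therefore end up holding roughly five extents each, a
// little more where blob boundaries force the cut to slide forward.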
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    ceph_assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1;  // so it is always non-zero
        include_blob = true;
        blobid = 0;  // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*
  derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}
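// Illustrative example of the blobid bit-packing above (hypothetical
// values, added as a note): an extent that continues exactly where the
// previous one ended, starts at blob_offset 0, repeats the previous
// length, and reuses a blob with last_encoded_id 3 encodes as a single
// varint:
//
//   blobid = (3 << BLOBID_SHIFT_BITS)
//          | BLOBID_FLAG_CONTIGUOUS
//          | BLOBID_FLAG_ZEROOFFSET
//          | BLOBID_FLAG_SAMELENGTH;
//
// and none of the three optional varint_lowz fields follow it.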
unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  ceph_assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  ceph_assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << "  getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        ceph_assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  ceph_assert(n == num);
  return num;
}
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}
void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}
void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::const_iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  ceph_assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}
void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  ceph_assert(last >= start);
  string key;
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            ceph_assert(r >= 0);
          }
        }
      );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset
               << " for range 0x" << offset << "~" << length << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      ceph_assert(p->dirty == false);
      ceph_assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}
void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  if (length == 0) {
    length = 1;
  }
  auto last = seek_shard(offset + length - 1);
  if (start < 0)
    return;

  ceph_assert(last >= start);
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      derr << __func__ << " on write 0x" << std::hex << offset
           << "~" << length << " shard 0x" << p->shard_info->offset
           << std::dec << " is not loaded, can't mark dirty" << dendl;
      ceph_abort_msg("can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}
bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}
int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  ceph_assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      ceph_assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        ceph_assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}
BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need to have completely initialized Blob to increment its ref counters.
  ceph_assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent putting a reused blob into
  // the old_extents list if we overwrite the blob totally.
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << "  split " << *ep << dendl;
      dout(30) << __func__ << "     to " << *ne << dendl;
    } else {
      // switch blob
      ceph_assert(ep->blob_offset >= blob_offset);
      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << "  adjusted " << *ep << dendl;
    }
  }
  return rb;
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

//
// A tricky thing about the Onode's ref counter is that we do an additional
// increment when a newly pinned instance is detected, and -1 on unpin.
// This prevents a conflict with a delete call (when nref == 0).
// The latter might happen while a thread is in unpin() (and e.g. waiting
// for lock acquisition) since nref is already decremented; another
// 'putting' thread on the instance will then release it.
//
void BlueStore::Onode::get() {
  if (++nref >= 2 && !pinned) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    std::lock_guard l(ocs->lock);
    bool was_pinned = pinned;
    pinned = nref >= 2;
    // additional increment for newly pinned instance
    bool r = !was_pinned && pinned;
    if (r) {
      ++nref;
    }
    if (cached && r) {
      ocs->_pin(this);
    }
  }
}

void BlueStore::Onode::put() {
  int n = --nref;
  if (n == 2) {
    OnodeCacheShard* ocs = c->get_onode_cache();
    std::lock_guard l(ocs->lock);
    bool need_unpin = pinned;
    pinned = pinned && nref > 2; // intentionally use > not >= as we have
                                 // +1 due to pinned state
    need_unpin = need_unpin && !pinned;
    if (cached && need_unpin) {
      if (exists) {
        ocs->_unpin(this);
      } else {
        ocs->_unpin_and_rm(this);
        // remove will also decrement nref and delete Onode
        c->onode_map._remove(oid);
      }
    }
    // additional decrement for newly unpinned instance
    // should be the last action since Onode can be released
    // at any point after this decrement
    if (need_unpin) {
      n = --nref;
    }
  }
  if (n == 0) {
    delete this;
  }
}
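// Illustrative trace of the pin protocol above (hypothetical sequence,
// added as a note): a lookup takes nref from 1 to 2, so get() pins the
// onode and bumps nref to 3 for the pinned state itself.  A later put()
// dropping nref back toward 2 sees "pinned && nref > 2" fail, unpins,
// and performs the matching extra decrement; only after that can nref
// reach 0 and the instance be deleted safely.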
BlueStore::Onode* BlueStore::Onode::decode(
  CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& v)
{
  Onode* on = new Onode(c.get(), oid, key);
  on->exists = true;
  auto p = v.front().begin_deep();
  on->onode.decode(p);
  for (auto& i : on->onode.attrs) {
    i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }

  // initialize extent_map
  on->extent_map.decode_spanning_blobs(p);
  if (on->onode.extent_map_shards.empty()) {
    denc(on->extent_map.inline_bl, p);
    on->extent_map.decode_some(on->extent_map.inline_bl);
    on->extent_map.inline_bl.reassign_to_mempool(
      mempool::mempool_bluestore_cache_data);
  } else {
    on->extent_map.init_shards(false, false);
  }
  return on;
}
void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    waiting_count++;
    std::unique_lock l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
    waiting_count--;
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}
void BlueStore::Onode::dump(Formatter* f) const
{
  onode.dump(f);
  extent_map.dump(f);
}
const string& BlueStore::Onode::get_omap_prefix()
{
  if (onode.is_pgmeta_omap()) {
    return PREFIX_PGMETA_OMAP;
  }
  if (onode.is_perpool_omap()) {
    return PREFIX_PERPOOL_OMAP;
  }
  return PREFIX_OMAP;
}

// '-' < '.' < '~'

void BlueStore::Onode::get_omap_header(string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('-');
}

void BlueStore::Onode::get_omap_key(const string& key, string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('.');
  out->append(key);
}

void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

void BlueStore::Onode::get_omap_tail(string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('~');
}

void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    *user_key = key.substr(sizeof(uint64_t)*2 + 1);
  } else {
    *user_key = key.substr(sizeof(uint64_t) + 1);
  }
}
// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  ceph_assert((loffs % min_alloc_size) == 0);
  ceph_assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
// =======================================================
// DeferredBatch

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
#undef dout_context
#define dout_context cct

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  ceph_assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}
3761 #ifdef DEBUG_DEFERRED
3766 void BlueStore::DeferredBatch::_discard(
3767 CephContext
*cct
, uint64_t offset
, uint64_t length
)
3769 generic_dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
3770 << std::dec
<< dendl
;
3771 auto p
= iomap
.lower_bound(offset
);
3772 if (p
!= iomap
.begin()) {
3774 auto end
= p
->first
+ p
->second
.bl
.length();
3777 head
.substr_of(p
->second
.bl
, 0, offset
- p
->first
);
3778 dout(20) << __func__
<< " keep head " << p
->second
.seq
3779 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3780 << " -> 0x" << head
.length() << std::dec
<< dendl
;
3781 auto i
= seq_bytes
.find(p
->second
.seq
);
3782 ceph_assert(i
!= seq_bytes
.end());
3783 if (end
> offset
+ length
) {
3785 tail
.substr_of(p
->second
.bl
, offset
+ length
- p
->first
,
3786 end
- (offset
+ length
));
3787 dout(20) << __func__
<< " keep tail " << p
->second
.seq
3788 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3789 << " -> 0x" << tail
.length() << std::dec
<< dendl
;
3790 auto &n
= iomap
[offset
+ length
];
3792 n
.seq
= p
->second
.seq
;
3793 i
->second
-= length
;
3795 i
->second
-= end
- offset
;
3797 ceph_assert(i
->second
>= 0);
3798 p
->second
.bl
.swap(head
);
3802 while (p
!= iomap
.end()) {
3803 if (p
->first
>= offset
+ length
) {
3806 auto i
= seq_bytes
.find(p
->second
.seq
);
3807 ceph_assert(i
!= seq_bytes
.end());
3808 auto end
= p
->first
+ p
->second
.bl
.length();
3809 if (end
> offset
+ length
) {
3810 unsigned drop_front
= offset
+ length
- p
->first
;
3811 unsigned keep_tail
= end
- (offset
+ length
);
3812 dout(20) << __func__
<< " truncate front " << p
->second
.seq
3813 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3814 << " drop_front 0x" << drop_front
<< " keep_tail 0x" << keep_tail
3815 << " to 0x" << (offset
+ length
) << "~" << keep_tail
3816 << std::dec
<< dendl
;
3817 auto &s
= iomap
[offset
+ length
];
3818 s
.seq
= p
->second
.seq
;
3819 s
.bl
.substr_of(p
->second
.bl
, drop_front
, keep_tail
);
3820 i
->second
-= drop_front
;
3822 dout(20) << __func__
<< " drop " << p
->second
.seq
3823 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3824 << std::dec
<< dendl
;
3825 i
->second
-= p
->second
.bl
.length();
3827 ceph_assert(i
->second
>= 0);
void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0;  // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    ceph_assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  ceph_assert(sb == seq_bytes);
}
// =======================================================
// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}
void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}
void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {
    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}
void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}
uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
        !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  // lock (one or both) cache shards
  std::lock(cache->lock, dest->cache->lock);
  std::lock_guard l(cache->lock, std::adopt_lock);
  std::lock_guard l2(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensuring that nref is always >= 2 and hence onode is pinned and
      // physically out of cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      o->c = dest;

      // move over shared blobs and buffers.  cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << "   moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}
// =======================================================
// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  binned_kv_cache = store->db->get_priority_cache();
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true);
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;

    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval/3;
    }

    // Now resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do final dump
  store->_record_allocation_stats();
  stop = false;
  pcm = nullptr;
  return NULL;
}
void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}
void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}
void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}
// =======================================================
// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    }
  );
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);
  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}
4444 #define dout_prefix *_dout << "bluestore(" << path << ") "
4446 #define dout_context cct
4449 static void aio_cb(void *priv
, void *priv2
)
4451 BlueStore
*store
= static_cast<BlueStore
*>(priv
);
4452 BlueStore::AioContext
*c
= static_cast<BlueStore::AioContext
*>(priv2
);
4453 c
->aio_finish(store
);
4456 static void discard_cb(void *priv
, void *priv2
)
4458 BlueStore
*store
= static_cast<BlueStore
*>(priv
);
4459 interval_set
<uint64_t> *tmp
= static_cast<interval_set
<uint64_t>*>(priv2
);
4460 store
->handle_discard(*tmp
);
4463 void BlueStore::handle_discard(interval_set
<uint64_t>& to_release
)
4465 dout(10) << __func__
<< dendl
;
4467 alloc
->release(to_release
);
BlueStore::BlueStore(CephContext *cct, const string& path)
  : BlueStore(cct, path, 0) {}

BlueStore::BlueStore(CephContext *cct,
  const string& path,
  uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle(cct),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);

  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}
const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}
void BlueStore::handle_conf_change(const ConfigProxy& conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap")) {
    _check_no_per_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}
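
// Illustrative note (not part of the original source): any key listed in
// get_tracked_conf_keys() can be changed on a running OSD, e.g. (assuming a
// deployed cluster):
//
//   ceph config set osd bluestore_compression_mode aggressive
//
// which arrives here with changed = {"bluestore_compression_mode"} and
// re-runs _set_compression() on the mounted store.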
void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str()
           << " compressor" << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << " min_blob " << comp_min_blob_size
           << " max_blob " << comp_max_blob_size
           << dendl;
}
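
// Illustrative note (not part of the original source): the pattern above is
// used for most tunables in this file: an explicitly set generic option
// (e.g. bluestore_compression_min_blob_size != 0) always wins; otherwise the
// rotational/non-rotational variant (_hdd/_ssd) is selected through
// _use_rotational_settings().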
void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}
void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}
void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}
void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");

  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}
int BlueStore::_set_cache_sizes()
{
  ceph_assert(bdev);
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio =
    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}
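
// Worked example (not part of the original source): with
// bluestore_cache_size = 3 GiB, cache_meta_ratio = 0.4 and
// cache_kv_ratio = 0.4, the remaining data ratio is 1.0 - 0.4 - 0.4 = 0.2,
// i.e. ~0.6 GiB of the cache is left for data buffers; ratios summing above
// 1.0 are rejected with -EINVAL above.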
int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}
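
// Illustrative note (not part of the original source): meta key/values are
// mirrored into the block device label when one is readable, so a
// write_meta()/read_meta() pair round-trips through label.meta first and
// falls back to the file-based ObjectStore meta otherwise:
//
//   store.write_meta("kv_backend", "rocksdb");
//   string v;
//   store.read_meta("kv_backend", &v);  // v == "rocksdb"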
void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
                 "Average kv_thread flush latency",
                 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
                 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
                 "Average kv_sync thread latency",
                 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
                 "Average kv_finalize thread latency",
                 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
                 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
                 "Average aio_wait state latency",
                 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
                 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
                 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
                 "Average kv_commiting state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
                 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
                 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
                 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
                 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
                 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
                 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
                 "Average submit throttle latency",
                 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
                 "Average submit latency",
                 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
                 "Average commit latency",
                 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
                 "Average read latency",
                 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
                 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
                 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
                 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
                 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
                 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
                    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
                    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
                    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
                    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
                    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
                    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
            "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
            "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
            "Sum for stored compressed bytes",
            "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
            "Sum for bytes allocated for compressed data",
            "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
            "Sum for original bytes that were compressed",
            "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
            "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
            "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
                    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
                    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
                    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
                    "bluestore_onode_shard_misses",
                    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
            "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
            "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
            "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
            "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
                    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
                    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
                    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
                    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
                    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
                    "bluestore_write_small_unused",
                    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_deferred,
                    "bluestore_write_small_deferred",
                    "Small overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
                    "bluestore_write_small_pre_read",
                    "Small writes that required we read some data (possibly "
                    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
                    "Small write into new (sparse) blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
                    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
                    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
                    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
                    "Sum for extents that have been merged due to garbage "
                    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
                    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
                    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
                 "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
                 "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
                 "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
                 "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
                 "Average omap get_keys call latency");
  b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
                 "Average omap get_values call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
                 "Average collection listing latency");
  b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
                 "Average removal latency");

  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
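
// Illustrative note (not part of the original source): once registered,
// these counters are visible through the OSD admin socket, e.g.
//
//   ceph daemon osd.0 perf dump
//
// where they appear under the "bluestore" section.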
int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}
int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}
int BlueStore::_write_bdev_label(CephContext *cct,
                                 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}
int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
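
// Illustrative note (not part of the original source): the on-disk label
// block decoded above is laid out as
//
//   [ encoded bluestore_bdev_label_t | crc32c(-1) over the encoded bytes |
//     zero padding up to BDEV_LABEL_BLOCK_SIZE ]
//
// which is exactly what _write_bdev_label() produces; the substr_of()/
// crc32c() sequence recomputes the checksum over the decoded prefix only.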
int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
               << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
           << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}
void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
           << std::dec << " order " << (int)min_alloc_size_order
           << " max_alloc_size 0x" << std::hex << max_alloc_size
           << " prefer_deferred_size 0x" << prefer_deferred_size
           << std::dec
           << " deferred_batch_ops " << deferred_batch_ops
           << dendl;
}
int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
  _set_max_defer_interval();
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}
void BlueStore::_validate_bdev()
{
  // sanity check(s)
  ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
  uint64_t dev_size = bdev->get_size();
  if (dev_size <
      _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
    dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
            << " is too small, disable bluestore_bluefs_min for now"
            << dendl;
    ceph_assert(dev_size >= _get_ondisk_reserved());

    int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
    ceph_assert(r == 0);
  }
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}
int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
  int r;
  bluestore_bdev_label_t label;

  ceph_assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (t) {
    // create mode. initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
    fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);

    // allocate superblock reserved space. note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs_extents.
    auto reserved = _get_ondisk_reserved();
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_bluefs) {
      ceph_assert(bluefs_extents.num_intervals() == 1);
      interval_set<uint64_t>::iterator p = bluefs_extents.begin();
      reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
      dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
               << " for bluefs" << dendl;
    }

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
              << cct->_conf->bluestore_debug_prefill << " with max free extent "
              << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
        uint64_t l = (rand() % max_b + 1) * min_alloc_size;
        if (start + l > end) {
          l = end - start;
          l = p2align(l, min_alloc_size);
        }
        ceph_assert(start + l <= end);

        uint64_t u = 1 + (uint64_t)(r * (double)l);
        u = p2roundup(u, min_alloc_size);
        if (start + l + u > end) {
          u = end - (start + l);
          // trim to align so we don't overflow again
          u = p2align(u, min_alloc_size);
          stop = true;
        }
        ceph_assert(start + l + u <= end);

        dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
                 << " use 0x" << u << std::dec << dendl;

        // break if u has been trimmed to nothing
        if (!u) {
          break;
        }

        fm->allocate(start + l, u, t);
        start += l + u;
      }
    }
    r = _write_out_fm_meta(0, false, &label);
    ceph_assert(r == 0);
  } else {
    string p = path + "/block";
    r = _read_bdev_label(cct, p, &label);
    if (r < 0) {
      derr << __func__ << " freelist init failed, error reading bdev label: " << cpp_strerror(r) << dendl;
      delete fm;
      fm = NULL;
      return r;
    }
  }
  r = fm->init(label, db, read_only);
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  // If the space size tracked by the freelist manager is higher than the
  // actual device size, one can hit an out-of-space allocation which will
  // result in data loss and/or assertions.
  // Probably the user altered the device size somehow.
  // The only fix for now is to redeploy the OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << " fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}
void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}
int BlueStore::_write_out_fm_meta(uint64_t target_size,
                                  bool update_root_size,
                                  bluestore_bdev_label_t* res_label)
{
  string p = path + "/block";

  std::vector<std::pair<string, string>> fm_meta;
  fm->get_meta(target_size, &fm_meta);

  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0)
    return r;

  for (auto& m : fm_meta) {
    label.meta[m.first] = m.second;
  }
  if (update_root_size) {
    label.size = target_size;
  }
  r = _write_bdev_label(cct, p, label);
  if (res_label) {
    *res_label = label;
  }

  return r;
}
int BlueStore::_open_alloc()
{
  ceph_assert(alloc == NULL);
  ceph_assert(bdev->get_size());

  if (bluefs) {
    bluefs_extents.clear();
    auto r = bluefs->get_block_extents(bluefs_layout.shared_bdev,
                                       &bluefs_extents);
    if (r < 0) {
      lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
                 << cpp_strerror(r) << dendl;
      return r;
    }
    dout(10) << __func__ << " bluefs extents 0x"
             << std::hex << bluefs_extents << std::dec
             << dendl;
  }

  alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
                            bdev->get_size(),
                            min_alloc_size, "block");
  if (!alloc) {
    lderr(cct) << __func__ << " Allocator::unknown alloc type "
               << cct->_conf->bluestore_allocator
               << dendl;
    return -EINVAL;
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(db, &offset, &length)) {
    alloc->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();

  // also mark bluefs space as allocated
  for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
    alloc->init_rm_free(e.get_start(), e.get_len());
  }

  dout(1) << __func__ << " loaded " << byte_u_t(bytes)
          << " in " << num << " extents"
          << " available " << byte_u_t(alloc->get_free())
          << dendl;

  return 0;
}

void BlueStore::_close_alloc()
{
  ceph_assert(bdev);
  bdev->discard_drain();

  ceph_assert(alloc);
  alloc->shutdown();
  delete alloc;
  alloc = NULL;
  bluefs_extents.clear();
}
int BlueStore::_open_fsid(bool create)
{
  ceph_assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}
int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}
int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}
int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
         << " (is another ceph-osd still running?)"
         << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
            << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}
bool BlueStore::_use_rotational_settings()
{
  if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
    return true;
  }
  if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
    return false;
  }
  return bdev->is_rotational();
}
bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist). only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}
int BlueStore::_minimal_open_bluefs(bool create)
{
  int r;
  bluefs = new BlueFS(cct);

  string bfn;
  struct stat st;

  bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(
      BlueFS::BDEV_DB, bfn,
      create && cct->_conf->bdev_enable_discard);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
      r = _check_or_set_bdev_label(
        bfn,
        bluefs->get_block_device_size(BlueFS::BDEV_DB),
        "bluefs db", create);
      if (r < 0) {
        derr << __func__
             << " check block device(" << bfn << ") label returned: "
             << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }
    if (create) {
      bluefs->add_block_extent(
        BlueFS::BDEV_DB,
        SUPER_RESERVED,
        bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  } else {
    r = -errno;
    if (::lstat(bfn.c_str(), &st) == -1) {
      r = 0;
      bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    } else {
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }

  // shared device
  bfn = path + "/block";
  // never trim here
  r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
                               true /* shared with bluestore */);
  if (r < 0) {
    derr << __func__ << " add block device(" << bfn << ") returned: "
         << cpp_strerror(r) << dendl;
    goto free_bluefs;
  }
  if (create) {
    // note: we always leave the first SUPER_RESERVED (8k) of the device unused
    uint64_t initial =
      bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
                          cct->_conf->bluestore_bluefs_gift_ratio);
    initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
    uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
    if (alloc_size % min_alloc_size) {
      derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
           << alloc_size << " is not a multiple of "
           << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
      r = -EINVAL;
      goto free_bluefs;
    }
    // align to bluefs's alloc_size
    initial = p2roundup(initial, alloc_size);
    // put bluefs in the middle of the device in case it is an HDD
    uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
    // avoiding superblock overwrite
    start = std::max(alloc_size, start);
    ceph_assert(start >= _get_ondisk_reserved());

    bluefs->add_block_extent(bluefs_layout.shared_bdev, start, initial);
    bluefs_extents.insert(start, initial);
    ++out_of_sync_fm;
  }

  bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
                                 create && cct->_conf->bdev_enable_discard);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
      r = _check_or_set_bdev_label(
        bfn,
        bluefs->get_block_device_size(BlueFS::BDEV_WAL),
        "bluefs wal", create);
      if (r < 0) {
        derr << __func__ << " check block device(" << bfn
             << ") label returned: " << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }

    if (create) {
      bluefs->add_block_extent(
        BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
        bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
        BDEV_LABEL_BLOCK_SIZE);
    }
    bluefs_layout.dedicated_wal = true;
  } else {
    r = 0;
    if (::lstat(bfn.c_str(), &st) != -1) {
      r = -errno;
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }
  return 0;

free_bluefs:
  ceph_assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}
int BlueStore::_open_bluefs(bool create)
{
  int r = _minimal_open_bluefs(create);
  if (r < 0) {
    return r;
  }
  RocksDBBlueFSVolumeSelector* vselector = nullptr;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {

    string options = cct->_conf->bluestore_rocksdb_options;

    rocksdb::Options rocks_opts;
    int r = RocksDBStore::ParseOptionsFromStringStatic(
      cct,
      options,
      rocks_opts,
      nullptr);
    if (r < 0) {
      return r;
    }

    double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
    vselector =
      new RocksDBBlueFSVolumeSelector(
        bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
        bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
        bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
        1024 * 1024 * 1024, //FIXME: set expected l0 size here
        rocks_opts.max_bytes_for_level_base,
        rocks_opts.max_bytes_for_level_multiplier,
        reserved_factor,
        cct->_conf->bluestore_volume_selection_reserved,
        cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
  }
  if (create) {
    bluefs->mkfs(fsid, bluefs_layout);
  }
  bluefs->set_volume_selector(vselector);
  r = bluefs->mount();
  if (r < 0) {
    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
  }
  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
  return r;
}

void BlueStore::_close_bluefs(bool cold_close)
{
  bluefs->umount(cold_close);
  _minimal_close_bluefs();
}

void BlueStore::_minimal_close_bluefs()
{
  delete bluefs;
  bluefs = NULL;
}
int BlueStore::_is_bluefs(bool create, bool* ret)
{
  if (create) {
    *ret = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    int r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      *ret = true;
    } else if (s == "0") {
      *ret = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
           << dendl;
      return -EIO;
    }
  }
  return 0;
}
/*
 * opens both DB and dependent super_meta, FreelistManager and allocator
 * in the proper order
 */
int BlueStore::_open_db_and_around(bool read_only)
{
  int r;
  bool do_bluefs = false;
  _is_bluefs(false, &do_bluefs); // ignore err code
  if (do_bluefs) {
    // open in read-only first to read FM list and init allocator
    // as they might be needed for some BlueFS procedures
    r = _open_db(false, false, true);
    if (r < 0)
      return r;

    r = _open_super_meta();
    if (r < 0) {
      goto out_db;
    }

    r = _open_fm(nullptr, true);
    if (r < 0)
      goto out_db;

    r = _open_alloc();
    if (r < 0)
      goto out_fm;

    // now open in R/W mode
    if (!read_only) {
      _close_db(true);

      r = _open_db(false, false, false);
      if (r < 0) {
        _close_alloc();
        _close_fm();
        return r;
      }
    }
  } else {
    r = _open_db(false, false);
    if (r < 0) {
      return r;
    }
    r = _open_super_meta();
    if (r < 0) {
      goto out_db;
    }

    r = _open_fm(nullptr, false);
    if (r < 0)
      goto out_db;

    r = _open_alloc();
    if (r < 0)
      goto out_fm;
  }
  return 0;

out_fm:
  _close_fm();
out_db:
  _close_db(read_only);
  return r;
}
void BlueStore::_close_db_and_around(bool read_only)
{
  if (bluefs) {
    if (!read_only && out_of_sync_fm.fetch_and(0)) {
      _sync_bluefs_and_fm();
    }
    _close_db(read_only);
    while (!read_only && out_of_sync_fm.fetch_and(0)) {
      // if we saw some allocations during close, repeat: open db, sync fm, close
      dout(0) << __func__ << " syncing FreelistManager" << dendl;
      int r = _open_db(false, false, false);
      if (r < 0) {
        derr << __func__
             << " unable to open db, FreelistManager is probably out of sync"
             << dendl;
        break;
      }
      _sync_bluefs_and_fm();
      _close_db(false);
    }
    _close_alloc();
    _close_fm();
  } else {
    _close_alloc();
    _close_fm();
    _close_db(read_only);
  }
}
// updates legacy bluefs related recs in DB to a state valid for
// downgrades from nautilus.
void BlueStore::_sync_bluefs_and_fm()
{
  if (cct->_conf->bluestore_bluefs_db_compatibility) {
    bufferlist bl;
    encode(bluefs_extents, bl);
    dout(20) << __func__ << " bluefs_extents at KV is now 0x"
             << std::hex << bluefs_extents << std::dec
             << dendl;
    KeyValueDB::Transaction synct = db->get_transaction();
    synct->set(PREFIX_SUPER, "bluefs_extents", bl);
    synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);

    // Nice thing is that we don't need to update FreelistManager here.
    // It always has corresponding bits set to 'Free' for both Nautilus+ and
    // pre-Nautilus releases.
    // So once we get an extent to bluefs_extents this means it's
    // been free in allocator and hence it's free in FM too.

    db->submit_transaction_sync(synct);
  }
}
int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
{
  int r;
  ceph_assert(!db);
  ceph_assert(!(create && read_only));
  string fn = path + "/db";
  string options;
  stringstream err;
  std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  string kv_backend;
  std::vector<KeyValueDB::ColumnFamily> cfs;

  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  r = _is_bluefs(create, &do_bluefs);
  if (r < 0) {
    return r;
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;

  map<string,string> kv_options;
  // force separate wal dir for all new deployments.
  kv_options["separate_wal_dir"] = 1;
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }

    r = _open_bluefs(create);
    if (r < 0) {
      return r;
    }

    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env* a = new BlueRocksEnv(bluefs);
      rocksdb::Env* b = rocksdb::Env::Default();
      if (create) {
        string cmd = "rm -rf " + path + "/db " +
          path + "/db.slow " +
          path + "/db.wal";
        int r = system(cmd.c_str());
        (void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }
    bluefs->set_slow_device_expander(this);
    BlueFSVolumeSelector::paths paths;
    bluefs->get_vselector_paths(fn, paths);

    if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
      // we have both block.db and block; tell rocksdb!
      // note: the second (last) size value doesn't really matter
      ostringstream db_paths;
      bool first = true;
      for (auto& p : paths) {
        if (!first) {
          db_paths << " ";
        }
        first = false;
        db_paths << p.first << "," << p.second;
      }
      kv_options["db_paths"] = db_paths.str();
      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
    }

    if (create) {
      for (auto& p : paths) {
        env->CreateDir(p.first);
      }
      // Selectors don't provide wal path so far hence create explicitly
      env->CreateDir(fn + ".wal");
    } else {
      std::vector<std::string> res;
      // check for dir presence
      auto r = env->GetChildren(fn + ".wal", &res);
      if (r.IsNotFound()) {
        kv_options.erase("separate_wal_dir");
      }
    }
  } else {
    string walfn = path + "/db.wal";

    if (create) {
      int r = ::mkdir(fn.c_str(), 0755);
      if (r < 0)
        r = -errno;
      if (r < 0 && r != -EEXIST) {
        derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
             << dendl;
        return r;
      }

      // wal_dir, too!
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
        r = -errno;
      if (r < 0 && r != -EEXIST) {
        derr << __func__ << " failed to create " << walfn
             << ": " << cpp_strerror(r)
             << dendl;
        return r;
      }
    } else {
      struct stat st;
      r = ::stat(walfn.c_str(), &st);
      if (r < 0 && errno == ENOENT) {
        kv_options.erase("separate_wal_dir");
      }
    }
  }

  db = KeyValueDB::create(cct,
                          kv_backend,
                          fn,
                          kv_options,
                          static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      _close_bluefs(read_only);
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);

  if (kv_backend == "rocksdb") {
    options = cct->_conf->bluestore_rocksdb_options;

    map<string,string> cf_map;
    cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
                                get_str_map,
                                &cf_map,
                                " \t");
    for (auto& i : cf_map) {
      dout(10) << "column family " << i.first << ": " << i.second << dendl;
      cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
    }
  }

  db->init(options);
  if (to_repair_db)
    return 0;
  if (create) {
    if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
      r = db->create_and_open(err, cfs);
    } else {
      r = db->create_and_open(err);
    }
  } else {
    // we pass in cf list here, but it is only used if the db already has
    // column families created.
    r = read_only ?
      db->open_read_only(err, cfs) :
      db->open(err, cfs);
  }
  if (r) {
    derr << __func__ << " error opening db: " << err.str() << dendl;
    _close_db(read_only);
    return -EIO;
  }
  dout(1) << __func__
          << " opened " << kv_backend
          << " path " << fn << " options " << options << dendl;
  return 0;
}
void BlueStore::_close_db(bool cold_close)
{
  ceph_assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    _close_bluefs(cold_close);
  }
}
void BlueStore::_dump_alloc_on_failure()
{
  auto dump_interval =
    cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
  if (dump_interval > 0 &&
      next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
    alloc->dump();
    next_dump_on_bluefs_alloc_failure = ceph_clock_now();
    next_dump_on_bluefs_alloc_failure += dump_interval;
  }
}
int BlueStore::allocate_bluefs_freespace(
  uint64_t min_size,
  uint64_t size,
  PExtentVector* extents_out)
{
  ceph_assert(min_size <= size);
  if (size) {
    // round up to alloc size
    uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
    min_size = p2roundup(min_size, alloc_size);
    size = p2roundup(size, alloc_size);

    PExtentVector extents_local;
    PExtentVector* extents = extents_out ? extents_out : &extents_local;

    uint64_t gift;
    uint64_t allocated = 0;
    int64_t alloc_len = 0;
    auto need = size;
    auto extent_count0 = extents->size();
    do {
      // hard cap to fit into 32 bits
      gift = std::min<uint64_t>(size, 1ull << 30);
      dout(10) << __func__ << " gifting " << gift
               << " (" << byte_u_t(gift) << ")" << dendl;

      alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
      if (alloc_len > 0) {
        allocated += alloc_len;
        size -= alloc_len;
      }

      if (alloc_len < 0 ||
          (alloc_len < (int64_t)gift && (min_size > allocated))) {
        derr << __func__
             << " failed to allocate on 0x" << std::hex << gift
             << " min_size 0x" << min_size
             << " > allocated total 0x" << allocated
             << " bluefs_shared_alloc_size 0x" << alloc_size
             << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
             << " available 0x " << alloc->get_free()
             << std::dec << dendl;

        _dump_alloc_on_failure();
        alloc->release(*extents);
        extents->clear();
        return -ENOSPC;
      }
    } while (size && alloc_len > 0);
    _collect_allocation_stats(need, alloc_size, extents->size() - extent_count0);

    for (auto& e : *extents) {
      dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
      bluefs_extents.insert(e.offset, e.length);
      ++out_of_sync_fm;
      // apply to bluefs if not requested from outside
      if (!extents_out) {
        bluefs->add_block_extent(bluefs_layout.shared_bdev, e.offset, e.length);
      }
    }
  }
  return 0;
}
uint64_t BlueStore::available_freespace(uint64_t alloc_size) {
  uint64_t total = 0;
  auto iterated_allocation = [&](uint64_t off, uint64_t len) {
    // only count in size that is alloc_size aligned
    uint64_t dist_to_alignment;
    uint64_t offset_in_block = off & (alloc_size - 1);
    if (offset_in_block == 0)
      dist_to_alignment = 0;
    else
      dist_to_alignment = alloc_size - offset_in_block;
    if (dist_to_alignment >= len)
      return;
    len -= dist_to_alignment;
    total += p2align(len, alloc_size);
  };
  alloc->dump(iterated_allocation);
  return total;
}
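
// Worked example (not part of the original source): for a free chunk at
// off = 0x1800 with len = 0x3000 and alloc_size = 0x1000, offset_in_block is
// 0x800, so dist_to_alignment = 0x800; the remaining 0x2800 bytes are
// rounded down to 0x2000, i.e. only two fully aligned units are counted.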
int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
{
  float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;

  uint64_t my_free = alloc->get_free();
  uint64_t total = bdev->get_size();
  float my_free_ratio = (float)my_free / (float)total;

  uint64_t total_free = bluefs_free + my_free;

  float bluefs_ratio = (float)bluefs_free / (float)total_free;

  dout(10) << __func__
           << " bluefs " << byte_u_t(bluefs_free)
           << " free (" << bluefs_free_ratio
           << ") bluestore " << byte_u_t(my_free)
           << " free (" << my_free_ratio
           << "), bluefs_ratio " << bluefs_ratio
           << dendl;

  uint64_t gift = 0;
  uint64_t reclaim = 0;
  if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
    gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
    if (gift >= my_free)
      gift = my_free / 2;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
             << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
             << ", should gift " << byte_u_t(gift) << dendl;
  } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
    reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
    if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
      reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
    if (reclaim >= bluefs_free)
      reclaim = bluefs_free / 2;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
             << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
             << ", should reclaim " << byte_u_t(reclaim) << dendl;
  }

  // don't take over too much of the freespace
  uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
  if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
      cct->_conf->bluestore_bluefs_min < free_cap) {
    uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
    dout(10) << __func__ << " bluefs_total " << bluefs_total
             << " < min " << cct->_conf->bluestore_bluefs_min
             << ", should gift " << byte_u_t(g) << dendl;
    if (g > gift)
      gift = g;
    reclaim = 0;
  }
  uint64_t min_free =
    cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
  if (bluefs_free < min_free &&
      min_free < free_cap) {
    uint64_t g = min_free - bluefs_free;
    dout(10) << __func__ << " bluefs_free " << bluefs_free
             << " < min " << min_free
             << ", should gift " << byte_u_t(g) << dendl;
    if (g > gift)
      gift = g;
    reclaim = 0;
  }
  uint64_t max_free =
    cct->_conf.get_val<Option::size_t>("bluestore_bluefs_max_free");
  if (bluefs_free > max_free) {
    dout(10) << __func__ << " bluefs_free " << bluefs_free
             << " > max " << max_free
             << ", stop gifting for now" << dendl;
    gift = 0;
  }
  ceph_assert((int64_t)gift >= 0);
  ceph_assert((int64_t)reclaim >= 0);
  return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
}
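
// Worked example (not part of the original source, configuration values
// assumed): with bluestore_bluefs_min_ratio = 0.02 and
// bluestore_bluefs_gift_ratio = 0.02, total_free = 100 GiB and
// bluefs_free = 1 GiB gives bluefs_ratio = 0.01 < 0.02, so the function
// proposes gifting 0.02 * 100 GiB = 2 GiB (positive return value); the
// symmetric reclaim path returns a negative value.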
int BlueStore::_balance_bluefs_freespace()
{
  int ret = 0;
  ceph_assert(bluefs);

  vector<pair<uint64_t,uint64_t>> bluefs_usage;  // <free, total> ...
  bluefs->get_usage(&bluefs_usage);
  ceph_assert(bluefs_usage.size() > bluefs_layout.shared_bdev);

  bool clear_alert = true;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
    auto& p = bluefs_usage[bluefs_layout.shared_bdev];
    if (p.first != p.second) {
      auto& db = bluefs_usage[BlueFS::BDEV_DB];
      ostringstream ss;
      ss << "spilled over " << byte_u_t(p.second - p.first)
         << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
         << " used of " << byte_u_t(db.second) << ") to slow device";
      _set_spillover_alert(ss.str());
      clear_alert = false;
    }
  }
  if (clear_alert) {
    _clear_spillover_alert();
  }

  // fixme: look at primary bdev only for now
  int64_t delta = _get_bluefs_size_delta(
    bluefs_usage[bluefs_layout.shared_bdev].first,
    bluefs_usage[bluefs_layout.shared_bdev].second);

  // reclaim from bluefs?
  if (delta < 0) {
    // round up to alloc size
    uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
    auto reclaim = p2roundup(uint64_t(-delta), alloc_size);

    // hard cap to fit into 32 bits
    reclaim = std::min<uint64_t>(reclaim, 1ull << 30);
    dout(10) << __func__ << " reclaiming " << reclaim
             << " (" << byte_u_t(reclaim) << ")" << dendl;

    while (reclaim > 0) {
      // NOTE: this will block and do IO.
      PExtentVector extents;
      int r = bluefs->reclaim_blocks(bluefs_layout.shared_bdev, reclaim,
                                     &extents);
      if (r < 0) {
        derr << __func__ << " failed to reclaim space from bluefs"
             << dendl;
        break;
      }
      for (auto e : extents) {
        ++out_of_sync_fm;
        bluefs_extents.erase(e.offset, e.length);
        bluefs_extents_reclaiming.insert(e.offset, e.length);
        reclaim -= e.length;
      }
    }
    ret = 1;
  }

  return ret;
}
int BlueStore::_open_collections()
{
  dout(10) << __func__ << dendl;
  collections_had_errors = false;
  ceph_assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      auto c = ceph::make_ref<Collection>(
          this,
          onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
          buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
          cid);
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      try {
        decode(c->cnode, p);
      } catch (buffer::error& e) {
        derr << __func__ << " failed to decode cnode, key:"
             << pretty_binary_string(it->key()) << dendl;
        return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c
               << " " << c->cnode << dendl;
      _osr_attach(c.get());
      coll_map[cid] = c;
    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      collections_had_errors = true;
    }
  }
  return 0;
}
void BlueStore::_fsck_collections(int64_t* errors)
{
  if (collections_had_errors) {
    dout(10) << __func__ << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
    for (it->upper_bound(string());
         it->valid();
         it->next()) {
      coll_t cid;
      if (!cid.parse(it->key())) {
        derr << __func__ << " unrecognized collection " << it->key() << dendl;
        if (errors) {
          (*errors)++;
        }
      }
    }
  }
}
void BlueStore::_set_per_pool_omap()
{
  per_pool_omap = false;
  bufferlist bl;
  db->get(PREFIX_SUPER, "per_pool_omap", &bl);
  if (bl.length()) {
    per_pool_omap = true;
    dout(10) << __func__ << " per_pool_omap=1" << dendl;
  } else {
    dout(10) << __func__ << " per_pool_omap not present" << dendl;
  }
  _check_no_per_pool_omap_alert();
}
void BlueStore::_open_statfs()
{
  osd_pools.clear();
  vstatfs.reset();

  bufferlist bl;
  int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
  if (r >= 0) {
    per_pool_stat_collection = false;
    if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
      auto it = bl.cbegin();
      vstatfs.decode(it);
      dout(10) << __func__ << " store_statfs is found" << dendl;
    } else {
      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
    }
    _check_legacy_statfs_alert();
  } else {
    per_pool_stat_collection = true;
    dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
    for (it->upper_bound(string());
         it->valid();
         it->next()) {

      uint64_t pool_id;
      int r = get_key_pool_stat(it->key(), &pool_id);
      ceph_assert(r == 0);

      bufferlist bl;
      bl = it->value();
      auto p = bl.cbegin();
      auto& st = osd_pools[pool_id];
      try {
        st.decode(p);
        vstatfs += st;

        dout(30) << __func__ << " pool " << pool_id
                 << " statfs " << st << dendl;
      } catch (buffer::error& e) {
        derr << __func__ << " failed to decode pool stats, key:"
             << pretty_binary_string(it->key()) << dendl;
      }
    }
  }
  dout(30) << __func__ << " statfs " << vstatfs << dendl;
}
int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
           << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
           << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
        r = -errno;
        derr << __func__ << " failed to open " << epath << " file: "
             << cpp_strerror(r) << dendl;
        return r;
      }
      // write the Transport ID of the NVMe device
      // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
      // where "0000:02:00.0" is the selector of a PCI device, see
      // the first column of "lspci -mm -n -D"
      string trid{"trtype:PCIe "};
      trid += "traddr:";
      trid += epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, trid.c_str(), trid.size());
      ceph_assert(r == static_cast<int>(trid.size()));
      dout(1) << __func__ << " created " << name << " symlink to "
              << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (size) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
          S_ISREG(st.st_mode) &&   // if it is a regular file
          st.st_size == 0) {       // and is 0 bytes
        r = ::ftruncate(fd, size);
        if (r < 0) {
          r = -errno;
          derr << __func__ << " failed to resize " << name << " file to "
               << size << ": " << cpp_strerror(r) << dendl;
          VOID_TEMP_FAILURE_RETRY(::close(fd));
          return r;
        }

        if (cct->_conf->bluestore_block_preallocate_file) {
          r = ::ceph_posix_fallocate(fd, 0, size);
          if (r > 0) {
            derr << __func__ << " failed to preallocate " << name << " file to "
                 << size << ": " << cpp_strerror(r) << dendl;
            VOID_TEMP_FAILURE_RETRY(::close(fd));
            return -r;
          }
        }
        dout(1) << __func__ << " resized " << name << " file to "
                << byte_u_t(size) << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
        derr << __func__ << " failed to open " << name << " file: "
             << cpp_strerror(r) << dendl;
        return r;
      }
    }
  }
  return 0;
}
int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
	 << cct->_conf->osd_max_object_size << " > bluestore max "
	 << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
	r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
	if (r < 0) {
	  derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
	       << dendl;
	  return r;
	}
	if (r > 0) {
	  derr << __func__ << " fsck found " << r << " errors" << dendl;
	  r = -EIO;
	}
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
	derr << __func__ << " expected bluestore, but type is " << type << dendl;
	return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
	return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
	   << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
				   cct->_conf->bluestore_block_size,
				   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
				     cct->_conf->bluestore_block_wal_size,
				     cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
				     cct->_conf->bluestore_block_db_size,
				     cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  // choose min_alloc_size
  if (cct->_conf->bluestore_min_alloc_size) {
    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
  } else {
    ceph_assert(bdev);
    if (bdev->is_rotational()) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
    } else {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
    }
  }

  // make sure min_alloc_size is power of 2 aligned.
  if (!isp2(min_alloc_size)) {
    derr << __func__ << " min_alloc_size 0x"
	 << std::hex << min_alloc_size << std::dec
	 << " is not power of 2 aligned!"
	 << dendl;
    r = -EINVAL;
    goto out_close_bdev;
  }

  r = _open_db(true);
  if (r < 0)
    goto out_close_bdev;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    r = _open_fm(t, true);
    if (r < 0)
      goto out_close_db;
    {
      bufferlist bl;
      encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }
    {
      bufferlist bl;
      encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }
    {
      bufferlist bl;
      bl.append("1");
      t->set(PREFIX_SUPER, "per_pool_omap", bl);
    }
    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_fm;

  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
  if (r < 0)
    goto out_close_fm;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_fm;
    }
  }

  if (out_of_sync_fm.fetch_and(0)) {
    _sync_bluefs_and_fm();
  }

 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db(false);
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }

  if (r == 0) {
    // indicate success by writing the 'mkfs_done' file
    r = write_meta("mkfs_done", "yes");
  }

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }
  return r;
}
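// Usage sketch (illustrative only; the path is hypothetical): the expected
// lifecycle around mkfs() is create-once, then mount/umount. Thanks to the
// "mkfs_done" marker above, calling mkfs() on an existing store is a no-op.
//
//   BlueStore store(cct, "/var/lib/ceph/osd/ceph-0");
//   int r = store.mkfs();            // idempotent if already created
//   if (r == 0) r = store.mount();
//   if (r == 0) store.umount();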
int BlueStore::_mount_for_bluefs()
{
  int r = _open_path();
  ceph_assert(r == 0);
  r = _open_fsid(false);
  ceph_assert(r == 0);
  r = _read_fsid(&fsid);
  ceph_assert(r == 0);
  r = _lock_fsid();
  ceph_assert(r == 0);
  r = _open_bluefs(false);
  ceph_assert(r == 0);
  return r;
}
void BlueStore::_umount_for_bluefs()
{
  _close_bluefs(false);
  _close_fsid();
  _close_path();
}
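// The two helpers above are strictly paired: each admin entry point below
// (add_new_bluefs_device and both migrate_to_* variants) opens the minimal
// path/fsid/bluefs state via _mount_for_bluefs() and must tear it down with
// _umount_for_bluefs() on every exit path. A minimal RAII sketch of the same
// contract (hypothetical; these are private helpers, so this is illustration
// only, not something external code can compile):
//
//   struct BluefsMountGuard {
//     BlueStore& s;
//     explicit BluefsMountGuard(BlueStore& s_) : s(s_) { s._mount_for_bluefs(); }
//     ~BluefsMountGuard() { s._umount_for_bluefs(); }
//   };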
int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _mount_for_bluefs();

  int reserved = 0;
  if (id == BlueFS::BDEV_NEWWAL) {
    string p = path + "/block.wal";
    r = _setup_block_symlink_or_file("block.wal", dev_path,
				     cct->_conf->bluestore_block_wal_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
				 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }

    reserved = BDEV_LABEL_BLOCK_SIZE;
    bluefs_layout.dedicated_wal = true;
  } else if (id == BlueFS::BDEV_NEWDB) {
    string p = path + "/block.db";
    r = _setup_block_symlink_or_file("block.db", dev_path,
				     cct->_conf->bluestore_block_db_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
				 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
    reserved = SUPER_RESERVED;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  }

  bluefs->add_block_extent(
    id,
    reserved,
    bluefs->get_block_device_size(id) - reserved, true);

  r = bluefs->prepare_new_device(id, bluefs_layout);
  ceph_assert(r == 0);

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }

  _umount_for_bluefs();
  return r;
}
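// Usage sketch (illustrative only; in practice this is reached via
// ceph-bluestore-tool's bluefs-bdev-new-db / bluefs-bdev-new-wal commands).
// The store must not be mounted — note the path_fd assert above — and the
// device path below is hypothetical:
//
//   BlueStore store(cct, "/var/lib/ceph/osd/ceph-0");
//   int r = store.add_new_bluefs_device(BlueFS::BDEV_NEWDB, "/dev/nvme0n1p1");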
int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
  int id)
{
  dout(10) << __func__ << " id:" << id << dendl;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  int r = _mount_for_bluefs();

  // require bluestore_bluefs_min_free to be free at target device!
  uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
  for (auto src_id : devs_source) {
    used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
  }
  uint64_t target_free = bluefs->get_free(id);
  if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
    // will need to remount full BlueStore instance to allocate more space
    _umount_for_bluefs();

    r = mount();
    ceph_assert(r == 0);
    dout(1) << __func__
	    << " Allocating more space at slow device for BlueFS: +"
	    << used_space - target_free << " bytes" << dendl;
    r = allocate_bluefs_freespace(
      used_space - target_free,
      used_space - target_free,
      nullptr);

    umount();
    if (r != 0) {
      derr << __func__
	   << " can't migrate, unable to allocate extra space: "
	   << used_space - target_free << " at target:" << id
	   << dendl;
      return -ENOSPC;
    }

    r = _mount_for_bluefs();
    ceph_assert(r == 0);
  } else if (target_free < used_space) {
    derr << __func__
	 << " can't migrate, free space at target: " << target_free
	 << " is less than required space: " << used_space
	 << dendl;
    return -ENOSPC;
  }
  if (devs_source.count(BlueFS::BDEV_DB)) {
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    bluefs_layout.dedicated_wal = false;
  }
  r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (devs_source.count(BlueFS::BDEV_DB)) {
    r = unlink(string(path + "/block.db").c_str());
    ceph_assert(r == 0);
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    r = unlink(string(path + "/block.wal").c_str());
    ceph_assert(r == 0);
  }

shutdown:
  _umount_for_bluefs();
  return r;
}
int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
  int id,
  const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _mount_for_bluefs();

  string link_db;
  string link_wal;
  if (devs_source.count(BlueFS::BDEV_DB) &&
      bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
    link_db = path + "/block.db";
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    link_wal = path + "/block.wal";
    bluefs_layout.dedicated_wal = false;
  }

  int reserved = 0;
  string target_name;
  uint64_t target_size = 0;
  if (id == BlueFS::BDEV_NEWWAL) {
    target_name = "block.wal";
    target_size = cct->_conf->bluestore_block_wal_size;
    bluefs_layout.dedicated_wal = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
				 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }
    reserved = BDEV_LABEL_BLOCK_SIZE;
  } else if (id == BlueFS::BDEV_NEWDB) {
    target_name = "block.db";
    target_size = cct->_conf->bluestore_block_db_size;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
				 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
    reserved = SUPER_RESERVED;
  }

  bluefs->add_block_extent(
    id, reserved, bluefs->get_block_device_size(id) - reserved);

  r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);

  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (!link_db.empty()) {
    r = unlink(link_db.c_str());
    ceph_assert(r == 0);
  }
  if (!link_wal.empty()) {
    r = unlink(link_wal.c_str());
    ceph_assert(r == 0);
  }
  r = _setup_block_symlink_or_file(
    target_name,
    dev_path,
    target_size,
    true);
  ceph_assert(r == 0);
  dout(0) << __func__ << " success" << dendl;

shutdown:
  _umount_for_bluefs();
  return r;
}
string BlueStore::get_device_path(unsigned id)
{
  string res;
  if (id < BlueFS::MAX_BDEV) {
    switch (id) {
    case BlueFS::BDEV_WAL:
      res = path + "/block.wal";
      break;
    case BlueFS::BDEV_DB:
      if (id == bluefs_layout.shared_bdev) {
	res = path + "/block";
      } else {
	res = path + "/block.db";
      }
      break;
    case BlueFS::BDEV_SLOW:
      res = path + "/block";
      break;
    }
  }
  return res;
}
int BlueStore::expand_devices(ostream& out)
{
  int r = cold_open();
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  out << "Expanding DB/WAL..." << std::endl;
  for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
    if (devid == bluefs_layout.shared_bdev) {
      continue;
    }
    uint64_t size = bluefs->get_block_device_size(devid);
    if (size == 0) {
      // no bdev
      continue;
    }

    interval_set<uint64_t> before;
    bluefs->get_block_extents(devid, &before);
    ceph_assert(!before.empty());
    uint64_t end = before.range_end();
    if (end < size) {
      out << devid
	  << " : expanding " << " from 0x" << std::hex
	  << end << " to 0x" << size << std::dec << std::endl;
      bluefs->add_block_extent(devid, end, size - end);
      string p = get_device_path(devid);
      const char* path = p.c_str();
      if (path == nullptr) {
	derr << devid
	     << ": can't find device path " << dendl;
	continue;
      }
      bluestore_bdev_label_t label;
      int r = _read_bdev_label(cct, path, &label);
      if (r < 0) {
	derr << "unable to read label for " << path << ": "
	     << cpp_strerror(r) << dendl;
	continue;
      }
      label.size = size;
      r = _write_bdev_label(cct, path, label);
      if (r < 0) {
	derr << "unable to write label for " << path << ": "
	     << cpp_strerror(r) << dendl;
	continue;
      }
      out << devid
	  << " : size label updated to " << size
	  << std::endl;
    }
  }
  uint64_t size0 = fm->get_size();
  uint64_t size = bdev->get_size();
  if (size0 < size) {
    out << bluefs_layout.shared_bdev
	<< " : expanding " << " from 0x" << std::hex
	<< size0 << " to 0x" << size << std::dec << std::endl;
    _write_out_fm_meta(size, true);
    cold_close();

    // mount in read/write to sync expansion changes
    r = _mount(false);
    ceph_assert(r == 0);
    umount();
  } else {
    cold_close();
  }
  return r;
}
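// Note on the ordering above: the grown size is first recorded in the bdev
// label (for dedicated devices) or the freelist meta (for the shared device),
// and only then is the store remounted read/write so the expansion is
// persisted. Illustrative-only sketch of the label round-trip, with new_size
// standing in for the device size obtained above:
//
//   bluestore_bdev_label_t label;
//   if (_read_bdev_label(cct, path, &label) == 0) {
//     label.size = new_size;                 // advertise the grown device
//     _write_bdev_label(cct, path, label);
//   }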
int BlueStore::dump_bluefs_sizes(ostream& out)
{
  int r = cold_open();
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  cold_close();
  return r;
}
void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t oold = onode_cache_shards.size();
  size_t bold = buffer_cache_shards.size();
  ceph_assert(num >= oold && num >= bold);
  onode_cache_shards.resize(num);
  buffer_cache_shards.resize(num);
  for (unsigned i = oold; i < num; ++i) {
    onode_cache_shards[i] =
      OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			      logger);
  }
  for (unsigned i = bold; i < num; ++i) {
    buffer_cache_shards[i] =
      BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			       logger);
  }
}
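// set_cache_shards() only ever grows the shard vectors (see the assert
// above), so existing shards keep their cached onodes and buffers across the
// call. Illustrative-only sketch of a caller sizing the shards from its op
// sharding; num_op_shards is a hypothetical value, not a real config key:
//
//   unsigned num_op_shards = 8;                   // hypothetical
//   store->set_cache_shards(std::max(num_op_shards, 1u));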
int BlueStore::_mount(bool kv_only, bool open_db)
{
  dout(1) << __func__ << " path " << path << dendl;

  _kv_only = kv_only;

  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
	 << cct->_conf->osd_max_object_size << " > bluestore max "
	 << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  if (open_db) {
    r = _open_db_and_around(false);
  } else {
    // we can bypass db open exclusively in case of kv_only mode
    ceph_assert(kv_only);
    r = _open_db(false, true);
  }
  if (r < 0)
    goto out_bdev;

  if (kv_only)
    return 0;

  r = _upgrade_super();
  if (r < 0) {
    goto out_db;
  }

  r = _open_collections();
  if (r < 0)
    goto out_db;

  r = _reload_logger();
  if (r < 0)
    goto out_coll;

  _kv_start();

  r = _deferred_replay();
  if (r < 0)
    goto out_stop;

  mempool_thread.init();

  if ((!per_pool_stat_collection || !per_pool_omap) &&
      cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {

    bool was_per_pool_omap = per_pool_omap;

    dout(1) << __func__ << " quick-fix on mount" << dendl;
    _fsck_on_open(FSCK_SHALLOW, true);

    // reread statfs
    // FIXME minor: replace with actual open/close?
    _open_statfs();
    _check_legacy_statfs_alert();

    // set again as hopefully it has been fixed
    if (!was_per_pool_omap) {
      _set_per_pool_omap();
    }
  }

  mounted = true;
  return 0;

 out_stop:
  _kv_stop();
 out_coll:
  _shutdown_cache();
 out_db:
  _close_db_and_around(false);
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
int BlueStore::umount()
{
  ceph_assert(_kv_only || mounted);
  dout(1) << __func__ << dendl;

  _osr_drain_all();

  mounted = false;
  if (!_kv_only) {
    mempool_thread.shutdown();
    dout(20) << __func__ << " stopping kv thread" << dendl;
    _kv_stop();
    _shutdown_cache();
    dout(20) << __func__ << " closing" << dendl;
  }
  _close_db_and_around(false);

  if (cct->_conf->bluestore_fsck_on_umount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }
  return 0;
}
int BlueStore::cold_open()
{
  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  r = _open_db_and_around(true);
  if (r < 0)
    goto out_bdev;
  return 0;

 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}

int BlueStore::cold_close()
{
  _close_db_and_around(true);
  _close_bdev();
  _close_fsid();
  _close_path();
  return 0;
}
// derr wrapper to limit enormous output and avoid log flooding.
// Of limited use where such output is expected for now
#define fsck_derr(err_cnt, threshold) \
  if (err_cnt <= threshold) {         \
    bool need_skip_print = err_cnt == threshold; \
    derr

#define fsck_dendl \
    dendl;          \
    if (need_skip_print) \
      derr << "more error lines skipped..." << dendl; \
  }
int _fsck_sum_extents(
  const PExtentVector& extents,
  bool compressed,
  store_statfs_t& expected_statfs)
{
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
  }
  return 0;
}
int BlueStore::_fsck_check_extents(
  const coll_t& cid,
  const ghobject_t& oid,
  const PExtentVector& extents,
  bool compressed,
  mempool_dynamic_bitset &used_blocks,
  uint64_t granularity,
  BlueStoreRepairer* repairer,
  store_statfs_t& expected_statfs,
  FSCKDepth depth)
{
  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
  int errors = 0;
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
    if (depth != FSCK_SHALLOW) {
      bool already = false;
      apply_for_bitset_range(
	e.offset, e.length, granularity, used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  if (bs.test(pos)) {
	    if (repairer) {
	      repairer->note_misreference(
		pos * min_alloc_size, min_alloc_size, !already);
	    }
	    if (!already) {
	      derr << "fsck error: " << oid << " extent " << e
		   << " or a subset is already allocated (misreferenced)" << dendl;
	      ++errors;
	      already = true;
	    }
	  } else {
	    bs.set(pos);
	  }
	});
      if (repairer) {
	repairer->get_space_usage_tracker().set_used(e.offset, e.length, cid, oid);
      }

      if (e.end() > bdev->get_size()) {
	derr << "fsck error: " << oid << " extent " << e
	     << " past end of block device" << dendl;
	++errors;
      }
    }
  }
  return errors;
}
void BlueStore::_fsck_check_pool_statfs(
  BlueStore::per_pool_statfs& expected_pool_statfs,
  int64_t& errors,
  int64_t& warnings,
  BlueStoreRepairer* repairer)
{
  auto it = db->get_iterator(PREFIX_STAT);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
	if (repairer) {
	  ++errors;
	  repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
	  derr << "fsck error: " << "legacy statfs record found, removing"
	       << dendl;
	}
	continue;
      }
      uint64_t pool_id;
      if (get_key_pool_stat(key, &pool_id) < 0) {
	derr << "fsck error: bad key " << key
	     << " in statfs namespace" << dendl;
	if (repairer) {
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	++errors;
	continue;
      }

      volatile_statfs vstatfs;
      bufferlist bl = it->value();
      auto blp = bl.cbegin();
      try {
	vstatfs.decode(blp);
      } catch (buffer::error& e) {
	derr << "fsck error: failed to decode Pool StatFS record"
	     << pretty_binary_string(key) << dendl;
	if (repairer) {
	  dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
		   << pretty_binary_string(key)
		   << "', removing" << dendl;
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	++errors;
	vstatfs.reset();
      }
      auto stat_it = expected_pool_statfs.find(pool_id);
      if (stat_it == expected_pool_statfs.end()) {
	if (vstatfs.is_empty()) {
	  // we don't consider that as an error since empty pool statfs
	  // are left in DB for now
	  dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
		   << std::hex << pool_id << std::dec << dendl;
	  if (repairer) {
	    // but we need to increment error count in case of repair
	    // to have proper counters at the end
	    // (as repairer increments recovery counter anyway).
	    ++errors;
	  }
	} else {
	  derr << "fsck error: found stray Pool StatFS record for pool id 0x"
	       << std::hex << pool_id << std::dec << dendl;
	  ++errors;
	}
	if (repairer) {
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	continue;
      }
      store_statfs_t statfs;
      vstatfs.publish(&statfs);
      if (!(stat_it->second == statfs)) {
	derr << "fsck error: actual " << statfs
	     << " != expected " << stat_it->second
	     << " for pool 0x"
	     << std::hex << pool_id << std::dec << dendl;
	if (repairer) {
	  repairer->fix_statfs(db, key, stat_it->second);
	}
	++errors;
      }
      expected_pool_statfs.erase(stat_it);
    }
  } // if (it)
  for (auto& s : expected_pool_statfs) {
    if (s.second.is_zero()) {
      // we might lack empty statfs recs in DB
      continue;
    }
    derr << "fsck error: missing Pool StatFS record for pool "
	 << std::hex << s.first << std::dec << dendl;
    if (repairer) {
      string key;
      get_pool_stat_key(s.first, &key);
      repairer->fix_statfs(db, key, s.second);
    }
    ++errors;
  }
  if (!per_pool_stat_collection &&
      repairer) {
    // by virtue of running this method, we correct the top-level
    // error of having global stats
    repairer->inc_repaired();
  }
}
BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
  BlueStore::FSCKDepth depth,
  int64_t pool_id,
  BlueStore::CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& value,
  mempool::bluestore_fsck::list<string>* expecting_shards,
  map<BlobRef, bluestore_blob_t::unused_t>* referenced,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& num_objects = ctx.num_objects;
  auto& num_extents = ctx.num_extents;
  auto& num_blobs = ctx.num_blobs;
  auto& num_sharded_objects = ctx.num_sharded_objects;
  auto& num_spanning_blobs = ctx.num_spanning_blobs;
  auto used_blocks = ctx.used_blocks;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
    &ctx.expected_pool_statfs[pool_id] :
    &ctx.expected_store_statfs;

  dout(10) << __func__ << " " << oid << dendl;
  OnodeRef o;
  o.reset(Onode::decode(c, oid, key, value));
  ++num_objects;

  num_spanning_blobs += o->extent_map.spanning_blob_map.size();

  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  _dump_onode<30>(cct, *o);
  // shards
  if (!o->extent_map.shards.empty()) {
    ++num_sharded_objects;
    if (depth != FSCK_SHALLOW) {
      ceph_assert(expecting_shards);
      for (auto& s : o->extent_map.shards) {
	dout(20) << __func__ << " shard " << *s.shard_info << dendl;
	expecting_shards->push_back(string());
	get_extent_shard_key(o->key, s.shard_info->offset,
			     &expecting_shards->back());
	if (s.shard_info->offset >= o->onode.size) {
	  derr << "fsck error: " << oid << " shard 0x" << std::hex
	       << s.shard_info->offset << " past EOF at 0x" << o->onode.size
	       << std::dec << dendl;
	  ++errors;
	}
      }
    }
  }

  // lextents
  uint64_t pos = 0;
  mempool::bluestore_fsck::map<BlobRef,
    bluestore_blob_use_tracker_t> ref_map;
  for (auto& l : o->extent_map.extent_map) {
    dout(20) << __func__ << " " << l << dendl;
    if (l.logical_offset < pos) {
      derr << "fsck error: " << oid << " lextent at 0x"
	   << std::hex << l.logical_offset
	   << " overlaps with the previous, which ends at 0x" << pos
	   << std::dec << dendl;
      ++errors;
    }
    if (depth != FSCK_SHALLOW &&
	o->extent_map.spans_shard(l.logical_offset, l.length)) {
      derr << "fsck error: " << oid << " lextent at 0x"
	   << std::hex << l.logical_offset << "~" << l.length
	   << " spans a shard boundary"
	   << std::dec << dendl;
      ++errors;
    }
    pos = l.logical_offset + l.length;
    res_statfs->data_stored += l.length;
    ceph_assert(l.blob);
    const bluestore_blob_t& blob = l.blob->get_blob();

    auto& ref = ref_map[l.blob];
    if (ref.is_empty()) {
      uint32_t min_release_size = blob.get_release_size(min_alloc_size);
      uint32_t l = blob.get_logical_length();
      ref.init(l, min_release_size);
    }
    ref.get(
      l.blob_offset,
      l.length);
    ++num_extents;
    if (depth != FSCK_SHALLOW &&
	blob.has_unused()) {
      ceph_assert(referenced);
      auto p = referenced->find(l.blob);
      bluestore_blob_t::unused_t* pu;
      if (p == referenced->end()) {
	pu = &(*referenced)[l.blob];
      } else {
	pu = &p->second;
      }
      uint64_t blob_len = blob.get_logical_length();
      ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
      ceph_assert(l.blob_offset + l.length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
      uint64_t start = l.blob_offset / chunk_size;
      uint64_t end =
	round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
	(*pu) |= (1u << i);
      }
    }
  } //for (auto& l : o->extent_map.extent_map)

  for (auto& i : ref_map) {
    ++num_blobs;
    const bluestore_blob_t& blob = i.first->get_blob();
    bool equal =
      depth == FSCK_SHALLOW ? true :
      i.first->get_blob_use_tracker().equal(i.second);
    if (!equal) {
      derr << "fsck error: " << oid << " blob " << *i.first
	   << " doesn't match expected ref_map " << i.second << dendl;
      ++errors;
    }
    if (blob.is_compressed()) {
      res_statfs->data_compressed += blob.get_compressed_payload_length();
      res_statfs->data_compressed_original +=
	i.first->get_referenced_bytes();
    }
    if (blob.is_shared()) {
      if (i.first->shared_blob->get_sbid() > blobid_max) {
	derr << "fsck error: " << oid << " blob " << blob
	     << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
	     << blobid_max << dendl;
	++errors;
      } else if (i.first->shared_blob->get_sbid() == 0) {
	derr << "fsck error: " << oid << " blob " << blob
	     << " marked as shared but has uninitialized sbid"
	     << dendl;
	++errors;
      }
      // the below lock is optional and provided in multithreading mode only
      if (sb_info_lock) {
	sb_info_lock->lock();
      }
      sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
      ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
      ceph_assert(sbi.pool_id == INT64_MIN ||
		  sbi.pool_id == oid.hobj.get_logical_pool());
      sbi.cid = c->cid;
      sbi.pool_id = oid.hobj.get_logical_pool();
      sbi.sb = i.first->shared_blob;
      sbi.oids.push_back(oid);
      sbi.compressed = blob.is_compressed();
      for (auto e : blob.get_extents()) {
	if (e.is_valid()) {
	  sbi.ref_map.get(e.offset, e.length);
	}
      }
      if (sb_info_lock) {
	sb_info_lock->unlock();
      }
    } else if (depth != FSCK_SHALLOW) {
      ceph_assert(used_blocks);
      errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
				    blob.is_compressed(),
				    *used_blocks,
				    fm->get_alloc_size(),
				    repairer,
				    *res_statfs,
				    depth);
    } else {
      errors += _fsck_sum_extents(
	blob.get_extents(),
	blob.is_compressed(),
	*res_statfs);
    }
  } // for (auto& i : ref_map)

  {
    auto &sbm = o->extent_map.spanning_blob_map;
    size_t broken = 0;
    BlobRef first_broken;
    for (auto it = sbm.begin(); it != sbm.end();) {
      auto it1 = it++;
      if (ref_map.count(it1->second) == 0) {
	if (!broken) {
	  first_broken = it1->second;
	  ++errors;
	}
	broken++;
	if (repairer) {
	  sbm.erase(it1);
	}
      }
    }
    if (broken) {
      derr << "fsck error: " << oid << " - " << broken
	   << " zombie spanning blob(s) found, the first one: "
	   << *first_broken << dendl;
      if (repairer) {
	auto txn = repairer->fix_spanning_blobs(db);
	_record_onode(o, txn);
      }
    }
  }

  if (o->onode.has_omap()) {
    _fsck_check_object_omap(depth, o, ctx);
  }

  return o;
}
#include "common/WorkQueue.h"

class ShallowFSCKThreadPool : public ThreadPool
{
public:
  ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
    ThreadPool(cct_, nm, tn, n) {
  }
  void worker(ThreadPool::WorkThread* wt) override {
    int next_wq = 0;
    while (!_stop) {
      next_wq %= work_queues.size();
      WorkQueue_ *wq = work_queues[next_wq++];

      void* item = wq->_void_dequeue();
      if (item) {
	processing++;
	TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
	wq->_void_process(item, tp_handle);
	processing--;
      }
    }
  }
  template <size_t BatchLen>
  struct FSCKWorkQueue : public ThreadPool::WorkQueue_
  {
    struct Entry {
      int64_t pool_id;
      BlueStore::CollectionRef c;
      ghobject_t oid;
      string key;
      bufferlist value;
    };
    struct Batch {
      std::atomic<size_t> running = { 0 };
      size_t entry_count = 0;
      std::array<Entry, BatchLen> entries;

      int64_t errors = 0;
      int64_t warnings = 0;
      uint64_t num_objects = 0;
      uint64_t num_extents = 0;
      uint64_t num_blobs = 0;
      uint64_t num_sharded_objects = 0;
      uint64_t num_spanning_blobs = 0;
      store_statfs_t expected_store_statfs;
      BlueStore::per_pool_statfs expected_pool_statfs;
    };

    size_t batchCount;
    BlueStore* store = nullptr;

    ceph::mutex* sb_info_lock = nullptr;
    BlueStore::sb_info_map_t* sb_info = nullptr;
    BlueStoreRepairer* repairer = nullptr;

    Batch* batches = nullptr;
    size_t last_batch_pos = 0;
    bool batch_acquired = false;

    FSCKWorkQueue(std::string n,
		  size_t _batchCount,
		  BlueStore* _store,
		  ceph::mutex* _sb_info_lock,
		  BlueStore::sb_info_map_t& _sb_info,
		  BlueStoreRepairer* _repairer) :
      WorkQueue_(n, time_t(), time_t()),
      batchCount(_batchCount),
      store(_store),
      sb_info_lock(_sb_info_lock),
      sb_info(&_sb_info),
      repairer(_repairer)
    {
      batches = new Batch[batchCount];
    }
    ~FSCKWorkQueue() {
      delete[] batches;
    }

    /// Remove all work items from the queue.
    void _clear() override {
      //do nothing
    }

    /// Check whether there is anything to do.
    bool _empty() override {
      ceph_assert(false);
    }

    /// Get the next work item to process.
    void* _void_dequeue() override {
      size_t pos = rand() % batchCount;
      size_t pos0 = pos;
      do {
	auto& batch = batches[pos];
	if (batch.running.fetch_add(1) == 0) {
	  if (batch.entry_count) {
	    return &batch;
	  }
	}
	batch.running--;
	pos++;
	pos %= batchCount;
      } while (pos != pos0);
      return nullptr;
    }
    /** @brief Process the work item.
     * This function will be called several times in parallel
     * and must therefore be thread-safe. */
    void _void_process(void* item, TPHandle& handle) override {
      Batch* batch = (Batch*)item;

      BlueStore::FSCK_ObjectCtx ctx(
	batch->errors,
	batch->warnings,
	batch->num_objects,
	batch->num_extents,
	batch->num_blobs,
	batch->num_sharded_objects,
	batch->num_spanning_blobs,
	nullptr, // used_blocks
	nullptr, //used_omap_head
	sb_info_lock,
	*sb_info,
	batch->expected_store_statfs,
	batch->expected_pool_statfs,
	repairer);

      for (size_t i = 0; i < batch->entry_count; i++) {
	auto& entry = batch->entries[i];

	store->fsck_check_objects_shallow(
	  BlueStore::FSCK_SHALLOW,
	  entry.pool_id,
	  entry.c,
	  entry.oid,
	  entry.key,
	  entry.value,
	  nullptr, // expecting_shards - this will need a protection if passed
	  nullptr, // referenced
	  ctx);
      }
      //std::cout << "processed " << batch << std::endl;
      batch->entry_count = 0;
      batch->running--;
    }
    /** @brief Synchronously finish processing a work item.
     * This function is called after _void_process with the global thread pool lock held,
     * so at most one copy will execute simultaneously for a given thread pool.
     * It can be used for non-thread-safe finalization. */
    void _void_process_finish(void*) override {
      ceph_assert(false);
    }

    bool queue(
      int64_t pool_id,
      BlueStore::CollectionRef c,
      const ghobject_t& oid,
      const string& key,
      const bufferlist& value) {
      bool res = false;
      size_t pos0 = last_batch_pos;
      if (!batch_acquired) {
	do {
	  auto& batch = batches[last_batch_pos];
	  if (batch.running.fetch_add(1) == 0) {
	    if (batch.entry_count < BatchLen) {
	      batch_acquired = true;
	      break;
	    }
	  }
	  batch.running.fetch_sub(1);
	  last_batch_pos++;
	  last_batch_pos %= batchCount;
	} while (last_batch_pos != pos0);
      }
      if (batch_acquired) {
	auto& batch = batches[last_batch_pos];
	ceph_assert(batch.running);
	ceph_assert(batch.entry_count < BatchLen);

	auto& entry = batch.entries[batch.entry_count];
	entry.pool_id = pool_id;
	entry.c = c;
	entry.oid = oid;
	entry.key = key;
	entry.value = value;

	++batch.entry_count;
	if (batch.entry_count == BatchLen) {
	  batch_acquired = false;
	  batch.running.fetch_sub(1);
	  last_batch_pos++;
	  last_batch_pos %= batchCount;
	}
	res = true;
      }
      return res;
    }

    void finalize(ThreadPool& tp,
		  BlueStore::FSCK_ObjectCtx& ctx) {
      if (batch_acquired) {
	auto& batch = batches[last_batch_pos];
	ceph_assert(batch.running);
	batch.running.fetch_sub(1);
      }
      tp.stop();

      for (size_t i = 0; i < batchCount; i++) {
	auto& batch = batches[i];

	//process leftovers if any
	if (batch.entry_count) {
	  TPHandle tp_handle(store->cct,
			     nullptr,
			     timeout_interval,
			     suicide_interval);
	  ceph_assert(batch.running == 0);

	  batch.running++; // just to be on-par with the regular call
	  _void_process(&batch, tp_handle);
	}
	ceph_assert(batch.entry_count == 0);

	ctx.errors += batch.errors;
	ctx.warnings += batch.warnings;
	ctx.num_objects += batch.num_objects;
	ctx.num_extents += batch.num_extents;
	ctx.num_blobs += batch.num_blobs;
	ctx.num_sharded_objects += batch.num_sharded_objects;
	ctx.num_spanning_blobs += batch.num_spanning_blobs;

	ctx.expected_store_statfs.add(batch.expected_store_statfs);

	for (auto it = batch.expected_pool_statfs.begin();
	     it != batch.expected_pool_statfs.end();
	     it++) {
	  ctx.expected_pool_statfs[it->first].add(it->second);
	}
      }
    }
  };
};
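// Wiring sketch for the queue above (illustrative only; it mirrors how
// _fsck_check_objects drives it below): one FSCKWorkQueue is attached to a
// ShallowFSCKThreadPool, objects are queue()d as the PREFIX_OBJ keyspace is
// walked, and finalize() drains leftovers and folds the per-batch counters
// back into the shared FSCK_ObjectCtx.
//
//   ShallowFSCKThreadPool tp(cct, "ShallowFSCKThreadPool", "ShallowFSCK", n);
//   ShallowFSCKThreadPool::FSCKWorkQueue<256> wq(
//     "FSCKWorkQueue", n * 32, this, &sb_info_lock, sb_info, repairer);
//   tp.add_work_queue(&wq);
//   tp.start();
//   // ... wq.queue(pool_id, c, oid, key, value) per onode ...
//   wq.finalize(tp, ctx);   // stops the pool and merges batch results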
void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
  OnodeRef& o,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& warnings = ctx.warnings;
  auto repairer = ctx.repairer;

  ceph_assert(o->onode.has_omap());
  if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
    if (per_pool_omap) {
      fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	<< "fsck error: " << o->oid
	<< " has omap that is not per-pool or pgmeta"
	<< fsck_dendl;
      ++errors;
    } else {
      const char* w;
      int64_t num;
      if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
	++errors;
	num = errors;
	w = "error";
      } else {
	++warnings;
	num = warnings;
	w = "warning";
      }
      fsck_derr(num, MAX_FSCK_ERROR_LINES)
	<< "fsck " << w << ": " << o->oid
	<< " has omap that is not per-pool or pgmeta"
	<< fsck_dendl;
    }
  }
  if (repairer &&
      !o->onode.is_perpool_omap() &&
      !o->onode.is_pgmeta_omap()) {
    dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
    bufferlist h;
    map<string, bufferlist> kv;
    int r = _onode_omap_get(o, &h, &kv);
    if (r < 0) {
      derr << " got " << r << " " << cpp_strerror(r) << dendl;
    } else {
      KeyValueDB::Transaction txn = db->get_transaction();
      // remove old keys
      const string& old_omap_prefix = o->get_omap_prefix();
      string old_head, old_tail;
      o->get_omap_header(&old_head);
      o->get_omap_tail(&old_tail);
      txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
      txn->rmkey(old_omap_prefix, old_tail);
      // set flag
      o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
      _record_onode(o, txn);
      const string& new_omap_prefix = o->get_omap_prefix();
      // head
      if (h.length()) {
	string new_head;
	o->get_omap_header(&new_head);
	txn->set(new_omap_prefix, new_head, h);
      }
      // tail
      {
	string new_tail;
	o->get_omap_tail(&new_tail);
	bufferlist empty;
	txn->set(new_omap_prefix, new_tail, empty);
      }
      // values
      string final_key;
      o->get_omap_key(string(), &final_key);
      size_t base_key_len = final_key.size();
      for (auto& i : kv) {
	final_key.resize(base_key_len);
	final_key += i.first;
	txn->set(new_omap_prefix, final_key, i.second);
      }
      db->submit_transaction_sync(txn);
      repairer->inc_repaired();
    }
  }
}
void BlueStore::_fsck_check_objects(FSCKDepth depth,
  BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  uint64_t_btree_t used_nids;

  size_t processed_myself = 0;

  auto it = db->get_iterator(PREFIX_OBJ);
  mempool::bluestore_fsck::list<string> expecting_shards;
  if (it) {
    const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
    typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
    std::unique_ptr<WQ> wq(
      new WQ(
	"FSCKWorkQueue",
	(thread_count ? : 1) * 32,
	this,
	sb_info_lock,
	sb_info,
	repairer));

    ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);

    thread_pool.add_work_queue(wq.get());
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      //not the best place but let's check anyway
      ceph_assert(sb_info_lock);
      thread_pool.start();
    }

    // fill global if not overridden below
    CollectionRef c;
    int64_t pool_id = -1;
    spg_t pgid;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      dout(30) << __func__ << " key "
	       << pretty_binary_string(it->key()) << dendl;
      if (is_extent_shard_key(it->key())) {
	if (depth == FSCK_SHALLOW) {
	  continue;
	}
	while (!expecting_shards.empty() &&
	       expecting_shards.front() < it->key()) {
	  derr << "fsck error: missing shard key "
	       << pretty_binary_string(expecting_shards.front())
	       << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	}
	if (!expecting_shards.empty() &&
	    expecting_shards.front() == it->key()) {
	  // all good
	  expecting_shards.pop_front();
	  continue;
	}

	uint32_t offset;
	string okey;
	get_key_extent_shard(it->key(), &okey, &offset);
	derr << "fsck error: stray shard 0x" << std::hex << offset
	     << std::dec << dendl;
	if (expecting_shards.empty()) {
	  derr << "fsck error: " << pretty_binary_string(it->key())
	       << " is unexpected" << dendl;
	  ++errors;
	  continue;
	}
	while (expecting_shards.front() > it->key()) {
	  derr << "fsck error: saw " << pretty_binary_string(it->key())
	       << dendl;
	  derr << "fsck error: exp "
	       << pretty_binary_string(expecting_shards.front()) << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	  if (expecting_shards.empty()) {
	    break;
	  }
	}
	continue;
      }

      ghobject_t oid;
      int r = get_key_object(it->key(), &oid);
      if (r < 0) {
	derr << "fsck error: bad object key "
	     << pretty_binary_string(it->key()) << dendl;
	++errors;
	continue;
      }
      if (!c ||
	  oid.shard_id != pgid.shard ||
	  oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
	  !c->contains(oid)) {
	c = nullptr;
	for (auto& p : coll_map) {
	  if (p.second->contains(oid)) {
	    c = p.second;
	    break;
	  }
	}
	if (!c) {
	  derr << "fsck error: stray object " << oid
	       << " not owned by any collection" << dendl;
	  ++errors;
	  continue;
	}
	pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
	dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
		 << dendl;
      }

      if (depth != FSCK_SHALLOW &&
	  !expecting_shards.empty()) {
	for (auto& k : expecting_shards) {
	  derr << "fsck error: missing shard key "
	       << pretty_binary_string(k) << dendl;
	}
	++errors;
	expecting_shards.clear();
      }

      bool queued = false;
      if (depth == FSCK_SHALLOW && thread_count > 0) {
	queued = wq->queue(
	  pool_id,
	  c,
	  oid,
	  it->key(),
	  it->value());
      }
      OnodeRef o;
      map<BlobRef, bluestore_blob_t::unused_t> referenced;

      if (!queued) {
	++processed_myself;

	o = fsck_check_objects_shallow(
	  depth,
	  pool_id,
	  c,
	  oid,
	  it->key(),
	  it->value(),
	  &expecting_shards,
	  &referenced,
	  ctx);
      }

      if (depth != FSCK_SHALLOW) {
	ceph_assert(o != nullptr);
	if (o->onode.nid) {
	  if (o->onode.nid > nid_max) {
	    derr << "fsck error: " << oid << " nid " << o->onode.nid
		 << " > nid_max " << nid_max << dendl;
	    ++errors;
	  }
	  if (used_nids.count(o->onode.nid)) {
	    derr << "fsck error: " << oid << " nid " << o->onode.nid
		 << " already in use" << dendl;
	    ++errors;
	    continue; // go for next object
	  }
	  used_nids.insert(o->onode.nid);
	}
	for (auto& i : referenced) {
	  dout(20) << __func__ << " referenced 0x" << std::hex << i.second
		   << std::dec << " for " << *i.first << dendl;
	  const bluestore_blob_t& blob = i.first->get_blob();
	  if (i.second & blob.unused) {
	    derr << "fsck error: " << oid << " blob claims unused 0x"
		 << std::hex << blob.unused
		 << " but extents reference 0x" << i.second << std::dec
		 << " on blob " << *i.first << dendl;
	    ++errors;
	  }
	  if (blob.has_csum()) {
	    uint64_t blob_len = blob.get_logical_length();
	    uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
	    unsigned csum_count = blob.get_csum_count();
	    unsigned csum_chunk_size = blob.get_csum_chunk_size();
	    for (unsigned p = 0; p < csum_count; ++p) {
	      unsigned pos = p * csum_chunk_size;
	      unsigned firstbit = pos / unused_chunk_size;    // [firstbit,lastbit]
	      unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
	      unsigned mask = 1u << firstbit;
	      for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
		mask |= 1u << b;
	      }
	      if ((blob.unused & mask) == mask) {
		// this csum chunk region is marked unused
		if (blob.get_csum_item(p) != 0) {
		  derr << "fsck error: " << oid
		       << " blob claims csum chunk 0x" << std::hex << pos
		       << "~" << csum_chunk_size
		       << " is unused (mask 0x" << mask << " of unused 0x"
		       << blob.unused << ") but csum is non-zero 0x"
		       << blob.get_csum_item(p) << std::dec << " on blob "
		       << *i.first << dendl;
		  ++errors;
		}
	      }
	    }
	  }
	}
	// omap
	if (o->onode.has_omap()) {
	  ceph_assert(ctx.used_omap_head);
	  if (ctx.used_omap_head->count(o->onode.nid)) {
	    derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
		 << " already in use" << dendl;
	    ++errors;
	  } else {
	    ctx.used_omap_head->insert(o->onode.nid);
	  }
	} // if (o->onode.has_omap())
	if (depth == FSCK_DEEP) {
	  bufferlist bl;
	  uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
	  uint64_t offset = 0;
	  do {
	    uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
	    int r = _do_read(c.get(), o, offset, l, bl,
			     CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
	    if (r < 0) {
	      ++errors;
	      derr << "fsck error: " << oid << std::hex
		   << " error during read: "
		   << " " << offset << "~" << l
		   << " " << cpp_strerror(r) << std::dec
		   << dendl;
	      break;
	    }
	    offset += l;
	  } while (offset < o->onode.size);
	} // deep
      } //if (depth != FSCK_SHALLOW)
    } // for (it->lower_bound(string()); it->valid(); it->next())
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      wq->finalize(thread_pool, ctx);
      if (processed_myself) {
	// may be needs more threads?
	dout(0) << __func__ << " partial offload"
		<< ", done myself " << processed_myself
		<< " of " << ctx.num_objects
		<< "objects, threads " << thread_count
		<< dendl;
      }
    }
  } // if (it)
}
/**
An overview for currently implemented repair logics
performed in fsck in two stages: detection(+preparation) and commit.
Detection stage (in processing order):
  (Issue -> Repair action to schedule)
  - Detect undecodable keys for Shared Blobs -> Remove
  - Detect undecodable records for Shared Blobs -> Remove
    (might trigger missed Shared Blob detection below)
  - Detect stray records for Shared Blobs -> Remove
  - Detect misreferenced pextents -> Fix
    Prepare Bloom-like filter to track cid/oid -> pextent
    Prepare list of extents that are improperly referenced
    Enumerate Onode records that might use 'misreferenced' pextents
    (Bloom-like filter applied to reduce computation)
    Per each questionable Onode enumerate all blobs and identify broken ones
    (i.e. blobs having 'misreferences')
    Rewrite each broken blob data by allocating another extents and
    copying data there
    If blob is shared - unshare it and mark corresponding Shared Blob
    record as unused
    Release previously allocated space
  - Detect missed Shared Blobs -> Recreate
  - Detect undecodable deferred transaction -> Remove
  - Detect Freelist Manager's 'false free' entries -> Mark as used
  - Detect Freelist Manager's leaked entries -> Mark as free
  - Detect statfs inconsistency - Update
Commit stage (separate DB commit per each step):
  - Apply leaked FM entries fix
  - Apply 'false free' FM entries fix
  - Apply 'Remove' actions
  - Apply fix for misreference pextents
  - Apply Shared Blob recreate
    (can be merged with the step above if misreferences were detected)
  - Apply StatFS update
*/
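// A minimal sketch (illustrative only) of the "Bloom-like filter" idea from
// the overview above: mark each allocation unit an onode touches during
// detection, then use the bit set to skip onodes that cannot reference a
// misreferenced pextent. The real tracker lives in BlueStoreRepairer's space
// usage tracker; the names below are hypothetical.
//
//   struct ExampleExtentFilter {
//     std::vector<bool> bits;            // one bit per allocation-unit bucket
//     uint64_t au;                       // allocation-unit granularity
//     ExampleExtentFilter(uint64_t dev_size, uint64_t au_)
//       : bits(dev_size / au_ + 1, false), au(au_) {}
//     void mark(uint64_t off, uint64_t len) {
//       for (uint64_t p = off / au; p <= (off + len - 1) / au; ++p)
//         bits[p] = true;
//     }
//     bool maybe_used(uint64_t off, uint64_t len) const {
//       for (uint64_t p = off / au; p <= (off + len - 1) / au; ++p)
//         if (bits[p]) return true;      // may be a false positive
//       return false;                    // definitely not referenced
//     }
//   };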
int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
{
  dout(1) << __func__
	  << (repair ? " repair" : " check")
	  << (depth == FSCK_DEEP ? " (deep)" :
	      depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
	  << dendl;

  // in deep mode we need R/W write access to be able to replay deferred ops
  bool read_only = !(repair || depth == FSCK_DEEP);

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  r = _open_db_and_around(read_only);
  if (r < 0)
    goto out_bdev;

  if (!read_only) {
    r = _upgrade_super();
    if (r < 0) {
      goto out_db;
    }
  }

  r = _open_collections();
  if (r < 0)
    goto out_db;

  mempool_thread.init();

  // we need finisher and kv_{sync,finalize}_thread *just* for replay
  // enable in repair or deep mode modes only
  if (!read_only) {
    _kv_start();
    r = _deferred_replay();
    _kv_stop();
  }
  if (r < 0)
    goto out_scan;

  r = _fsck_on_open(depth, repair);

 out_scan:
  mempool_thread.shutdown();
  _shutdown_cache();
 out_db:
  _close_db_and_around(false);
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();

  return r;
}
8546 int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth
, bool repair
)
8550 << (repair
? " repair" : " check")
8551 << (depth
== FSCK_DEEP
? " (deep)" :
8552 depth
== FSCK_SHALLOW
? " (shallow)" : " (regular)")
8553 << " start" << dendl
;
8555 int64_t warnings
= 0;
8556 unsigned repaired
= 0;
8558 uint64_t_btree_t used_omap_head
;
8559 uint64_t_btree_t used_sbids
;
8561 mempool_dynamic_bitset used_blocks
;
8562 KeyValueDB::Iterator it
;
8563 store_statfs_t expected_store_statfs
, actual_statfs
;
8564 per_pool_statfs expected_pool_statfs
;
8566 sb_info_map_t sb_info
;
8568 uint64_t num_objects
= 0;
8569 uint64_t num_extents
= 0;
8570 uint64_t num_blobs
= 0;
8571 uint64_t num_spanning_blobs
= 0;
8572 uint64_t num_shared_blobs
= 0;
8573 uint64_t num_sharded_objects
= 0;
8574 BlueStoreRepairer repairer
;
8576 utime_t start
= ceph_clock_now();
8578 _fsck_collections(&errors
);
8579 used_blocks
.resize(fm
->get_alloc_units());
8580 apply_for_bitset_range(
8581 0, std::max
<uint64_t>(min_alloc_size
, SUPER_RESERVED
), fm
->get_alloc_size(), used_blocks
,
8582 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
8587 repairer
.get_space_usage_tracker().init(
8593 if( cct
->_conf
->bluestore_bluefs_db_compatibility
) {
8594 interval_set
<uint64_t> bluefs_extents_db
;
8596 db
->get(PREFIX_SUPER
, "bluefs_extents", &bl
);
8597 auto p
= bl
.cbegin();
8598 auto prev_errors
= errors
;
8600 decode(bluefs_extents_db
, p
);
8601 bluefs_extents_db
.union_of(bluefs_extents
);
8602 bluefs_extents_db
.subtract(bluefs_extents
);
8603 if (!bluefs_extents_db
.empty()) {
8604 derr
<< "fsck error: bluefs_extents inconsistency, "
8605 << "downgrade to previous releases might be broken."
8610 catch (buffer::error
& e
) {
8611 derr
<< "fsck error: failed to retrieve bluefs_extents from kv" << dendl
;
8614 if (errors
!= prev_errors
&& repair
) {
8615 repairer
.fix_bluefs_extents(out_of_sync_fm
);
8619 for (auto e
= bluefs_extents
.begin(); e
!= bluefs_extents
.end(); ++e
) {
8620 apply_for_bitset_range(
8621 e
.get_start(), e
.get_len(), fm
->get_alloc_size(), used_blocks
,
8622 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
8626 int r
= bluefs
->fsck();
8634 if (!per_pool_stat_collection
) {
8636 if (cct
->_conf
->bluestore_fsck_error_on_no_per_pool_stats
) {
8643 derr
<< "fsck " << w
<< ": store not yet converted to per-pool stats"
8646 if (!per_pool_omap
) {
8648 if (cct
->_conf
->bluestore_fsck_error_on_no_per_pool_omap
) {
8655 derr
<< "fsck " << w
<< ": store not yet converted to per-pool omap"
8659 // get expected statfs; reset unaffected fields to be able to compare
8661 statfs(&actual_statfs
);
8662 actual_statfs
.total
= 0;
8663 actual_statfs
.internally_reserved
= 0;
8664 actual_statfs
.available
= 0;
8665 actual_statfs
.internal_metadata
= 0;
8666 actual_statfs
.omap_allocated
= 0;
8668 if (g_conf()->bluestore_debug_fsck_abort
) {
8669 dout(1) << __func__
<< " debug abort" << dendl
;
8674 dout(1) << __func__
<< " walking object keyspace" << dendl
;
8675 ceph::mutex sb_info_lock
= ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
8676 BlueStore::FSCK_ObjectCtx
ctx(
8682 num_sharded_objects
,
8686 //no need for the below lock when in non-shallow mode as
8687 // there is no multithreading in this case
8688 depth
== FSCK_SHALLOW
? &sb_info_lock
: nullptr,
8690 expected_store_statfs
,
8691 expected_pool_statfs
,
8692 repair
? &repairer
: nullptr);
8694 _fsck_check_objects(depth
, ctx
);
8697 dout(1) << __func__
<< " checking shared_blobs" << dendl
;
8698 it
= db
->get_iterator(PREFIX_SHARED_BLOB
);
8700 // FIXME minor: perhaps simplify for shallow mode?
8701 // fill global if not overriden below
8702 auto expected_statfs
= &expected_store_statfs
;
8704 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
8705 string key
= it
->key();
8707 if (get_key_shared_blob(key
, &sbid
)) {
8708 derr
<< "fsck error: bad key '" << key
8709 << "' in shared blob namespace" << dendl
;
8711 repairer
.remove_key(db
, PREFIX_SHARED_BLOB
, key
);
8716 auto p
= sb_info
.find(sbid
);
8717 if (p
== sb_info
.end()) {
8718 derr
<< "fsck error: found stray shared blob data for sbid 0x"
8719 << std::hex
<< sbid
<< std::dec
<< dendl
;
8721 repairer
.remove_key(db
, PREFIX_SHARED_BLOB
, key
);
8726 sb_info_t
& sbi
= p
->second
;
8727 bluestore_shared_blob_t
shared_blob(sbid
);
8728 bufferlist bl
= it
->value();
8729 auto blp
= bl
.cbegin();
8731 decode(shared_blob
, blp
);
8732 } catch (buffer::error
& e
) {
8734 // Force update and don't report as missing
8735 sbi
.updated
= sbi
.passed
= true;
8737 derr
<< "fsck error: failed to decode Shared Blob"
8738 << pretty_binary_string(it
->key()) << dendl
;
8740 dout(20) << __func__
<< " undecodable Shared Blob, key:'"
8741 << pretty_binary_string(it
->key())
8742 << "', removing" << dendl
;
8743 repairer
.remove_key(db
, PREFIX_DEFERRED
, it
->key());
8747 dout(20) << __func__
<< " " << *sbi
.sb
<< " " << shared_blob
<< dendl
;
8748 if (shared_blob
.ref_map
!= sbi
.ref_map
) {
8749 derr
<< "fsck error: shared blob 0x" << std::hex
<< sbid
8750 << std::dec
<< " ref_map " << shared_blob
.ref_map
8751 << " != expected " << sbi
.ref_map
<< dendl
;
8752 sbi
.updated
= true; // will update later in repair mode only!
8755 PExtentVector extents
;
8756 for (auto &r
: shared_blob
.ref_map
.ref_map
) {
8757 extents
.emplace_back(bluestore_pextent_t(r
.first
, r
.second
.length
));
8759 if (per_pool_stat_collection
|| repair
) {
8760 expected_statfs
= &expected_pool_statfs
[sbi
.pool_id
];
8762 errors
+= _fsck_check_extents(sbi
.cid
,
8763 p
->second
.oids
.front(),
8765 p
->second
.compressed
,
8767 fm
->get_alloc_size(),
8768 repair
? &repairer
: nullptr,
8776 if (repair
&& repairer
.preprocess_misreference(db
)) {
8778 dout(1) << __func__
<< " sorting out misreferenced extents" << dendl
;
8779 auto& space_tracker
= repairer
.get_space_usage_tracker();
8780 auto& misref_extents
= repairer
.get_misreferences();
8781 interval_set
<uint64_t> to_release
;
8782 it
= db
->get_iterator(PREFIX_OBJ
);
8784 // fill global if not overriden below
8785 auto expected_statfs
= &expected_store_statfs
;
8789 KeyValueDB::Transaction txn
= repairer
.get_fix_misreferences_txn();
8790 bool bypass_rest
= false;
8791 for (it
->lower_bound(string()); it
->valid() && !bypass_rest
;
8793 dout(30) << __func__
<< " key "
8794 << pretty_binary_string(it
->key()) << dendl
;
8795 if (is_extent_shard_key(it
->key())) {
8800 int r
= get_key_object(it
->key(), &oid
);
8801 if (r
< 0 || !space_tracker
.is_used(oid
)) {
8806 oid
.shard_id
!= pgid
.shard
||
8807 oid
.hobj
.get_logical_pool() != (int64_t)pgid
.pool() ||
8808 !c
->contains(oid
)) {
8810 for (auto& p
: coll_map
) {
8811 if (p
.second
->contains(oid
)) {
8819 if (per_pool_stat_collection
|| repair
) {
8820 auto pool_id
= c
->cid
.is_pg(&pgid
) ? pgid
.pool() : META_POOL_ID
;
8821 expected_statfs
= &expected_pool_statfs
[pool_id
];
8824 if (!space_tracker
.is_used(c
->cid
)) {
8828 dout(20) << __func__
<< " check misreference for col:" << c
->cid
8829 << " obj:" << oid
<< dendl
;
8832 o
.reset(Onode::decode(c
, oid
, it
->key(), it
->value()));
8833 o
->extent_map
.fault_range(db
, 0, OBJECT_MAX_SIZE
);
8834 mempool::bluestore_fsck::set
<BlobRef
> blobs
;
8836 for (auto& e
: o
->extent_map
.extent_map
) {
8837 blobs
.insert(e
.blob
);
8839 bool need_onode_update
= false;
8840 bool first_dump
= true;
8841 for(auto b
: blobs
) {
8842 bool broken_blob
= false;
8843 auto& pextents
= b
->dirty_blob().dirty_extents();
8844 for (auto& e
: pextents
) {
8845 if (!e
.is_valid()) {
8848 // for the sake of simplicity and proper shared blob handling
8849 // always rewrite the whole blob even when it's partially
8851 if (misref_extents
.intersects(e
.offset
, e
.length
)) {
8854 _dump_onode
<10>(cct
, *o
);
8862 bool compressed
= b
->get_blob().is_compressed();
8863 need_onode_update
= true;
8864 dout(10) << __func__
8865 << " fix misreferences in oid:" << oid
8866 << " " << *b
<< dendl
;
8868 PExtentVector pext_to_release
;
8869 pext_to_release
.reserve(pextents
.size());
8870 // rewriting all valid pextents
8871 for (auto e
= pextents
.begin(); e
!= pextents
.end();
8872 b_off
+= e
->length
, e
++) {
8873 if (!e
->is_valid()) {
8877 int64_t alloc_len
= alloc
->allocate(e
->length
, min_alloc_size
,
8879 if (alloc_len
< 0 || alloc_len
< (int64_t)e
->length
) {
8881 << " failed to allocate 0x" << std::hex
<< e
->length
8882 << " allocated 0x " << (alloc_len
< 0 ? 0 : alloc_len
)
8883 << " min_alloc_size 0x" << min_alloc_size
8884 << " available 0x " << alloc
->get_free()
8885 << std::dec
<< dendl
;
8886 if (alloc_len
> 0) {
8887 alloc
->release(exts
);
8892 expected_statfs
->allocated
+= e
->length
;
8894 expected_statfs
->data_compressed_allocated
+= e
->length
;
8898 IOContext
ioc(cct
, NULL
, true); // allow EIO
8899 r
= bdev
->read(e
->offset
, e
->length
, &bl
, &ioc
, false);
8901 derr
<< __func__
<< " failed to read from 0x" << std::hex
<< e
->offset
8902 <<"~" << e
->length
<< std::dec
<< dendl
;
8903 ceph_abort_msg("read failed, wtf");
8905 pext_to_release
.push_back(*e
);
8906 e
= pextents
.erase(e
);
8907 e
= pextents
.insert(e
, exts
.begin(), exts
.end());
8908 b
->get_blob().map_bl(
8910 [&](uint64_t offset
, bufferlist
& t
) {
8911 int r
= bdev
->write(offset
, t
, false);
8912 ceph_assert(r
== 0);
8914 e
+= exts
.size() - 1;
8915 for (auto& p
: exts
) {
8916 fm
->allocate(p
.offset
, p
.length
, txn
);
8918 } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
8920 if (b
->get_blob().is_shared()) {
8921 b
->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED
);
8923 auto sb_it
= sb_info
.find(b
->shared_blob
->get_sbid());
8924 ceph_assert(sb_it
!= sb_info
.end());
8925 sb_info_t
& sbi
= sb_it
->second
;
8927 for (auto& r
: sbi
.ref_map
.ref_map
) {
8928 expected_statfs
->allocated
-= r
.second
.length
;
8929 if (sbi
.compressed
) {
8930 // NB: it's crucial to use compressed flag from sb_info_t
8931 // as we originally used that value while accumulating
8933 expected_statfs
->data_compressed_allocated
-= r
.second
.length
;
8936 sbi
.updated
= sbi
.passed
= true;
8937 sbi
.ref_map
.clear();
8939 // relying on blob's pextents to decide what to release.
8940 for (auto& p
: pext_to_release
) {
8941 to_release
.union_insert(p
.offset
, p
.length
);
8944 for (auto& p
: pext_to_release
) {
8945 expected_statfs
->allocated
-= p
.length
;
8947 expected_statfs
->data_compressed_allocated
-= p
.length
;
8949 to_release
.union_insert(p
.offset
, p
.length
);
8955 } // for(auto b : blobs)
8956 if (need_onode_update
) {
8957 o
->extent_map
.dirty_range(0, OBJECT_MAX_SIZE
);
8958 _record_onode(o
, txn
);
8960 } // for (it->lower_bound(string()); it->valid(); it->next())
8962 for (auto it
= to_release
.begin(); it
!= to_release
.end(); ++it
) {
8963 dout(10) << __func__
<< " release 0x" << std::hex
<< it
.get_start()
8964 << "~" << it
.get_len() << std::dec
<< dendl
;
8965 fm
->release(it
.get_start(), it
.get_len(), txn
);
8967 alloc
->release(to_release
);
8970 } //if (repair && repairer.preprocess_misreference()) {
8972 if (depth
!= FSCK_SHALLOW
) {
8973 for (auto &p
: sb_info
) {
8974 sb_info_t
& sbi
= p
.second
;
8976 derr
<< "fsck error: missing " << *sbi
.sb
<< dendl
;
8979 if (repair
&& (!sbi
.passed
|| sbi
.updated
)) {
8980 auto sbid
= p
.first
;
8981 if (sbi
.ref_map
.empty()) {
8982 ceph_assert(sbi
.passed
);
8983 dout(20) << __func__
<< " " << *sbi
.sb
8984 << " is empty, removing" << dendl
;
8985 repairer
.fix_shared_blob(db
, sbid
, nullptr);
8988 bluestore_shared_blob_t
persistent(sbid
, std::move(sbi
.ref_map
));
8989 encode(persistent
, bl
);
8990 dout(20) << __func__
<< " " << *sbi
.sb
8991 << " is " << bl
.length() << " bytes, updating" << dendl
;
8993 repairer
.fix_shared_blob(db
, sbid
, &bl
);
9000 // check global stats only if fscking (not repairing) w/o per-pool stats
9001 if (!per_pool_stat_collection
&&
9003 !(actual_statfs
== expected_store_statfs
)) {
9004 derr
<< "fsck error: actual " << actual_statfs
9005 << " != expected " << expected_store_statfs
<< dendl
;
9007 repairer
.fix_statfs(db
, BLUESTORE_GLOBAL_STATFS_KEY
,
9008 expected_store_statfs
);
9013 dout(1) << __func__
<< " checking pool_statfs" << dendl
;
9014 _fsck_check_pool_statfs(expected_pool_statfs
,
9015 errors
, warnings
, repair
? &repairer
: nullptr);
9017 if (depth
!= FSCK_SHALLOW
) {
9018 dout(1) << __func__
<< " checking for stray omap data " << dendl
;
9019 it
= db
->get_iterator(PREFIX_OMAP
);
9021 uint64_t last_omap_head
= 0;
9022 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
9024 _key_decode_u64(it
->key().c_str(), &omap_head
);
9025 if (used_omap_head
.count(omap_head
) == 0 &&
9026 omap_head
!= last_omap_head
) {
9027 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
9028 << "fsck error: found stray omap data on omap_head "
9029 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
)<< fsck_dendl
;
9031 last_omap_head
= omap_head
;
9035 it
= db
->get_iterator(PREFIX_PGMETA_OMAP
);
9037 uint64_t last_omap_head
= 0;
9038 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
9040 _key_decode_u64(it
->key().c_str(), &omap_head
);
9041 if (used_omap_head
.count(omap_head
) == 0 &&
9042 omap_head
!= last_omap_head
) {
9043 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
9044 << "fsck error: found stray (pgmeta) omap data on omap_head "
9045 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
) << fsck_dendl
;
9046 last_omap_head
= omap_head
;
9051 it
= db
->get_iterator(PREFIX_PERPOOL_OMAP
);
9053 uint64_t last_omap_head
= 0;
9054 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
9057 string k
= it
->key();
9058 const char *c
= k
.c_str();
9059 c
= _key_decode_u64(c
, &pool
);
9060 c
= _key_decode_u64(c
, &omap_head
);
9061 if (used_omap_head
.count(omap_head
) == 0 &&
9062 omap_head
!= last_omap_head
) {
9063 fsck_derr(errors
, MAX_FSCK_ERROR_LINES
)
9064 << "fsck error: found stray (per-pool) omap data on omap_head "
9065 << omap_head
<< " " << last_omap_head
<< " " << used_omap_head
.count(omap_head
) << fsck_dendl
;
9067 last_omap_head
= omap_head
;
    dout(1) << __func__ << " checking deferred events" << dendl;
    it = db->get_iterator(PREFIX_DEFERRED);
    if (it) {
      for (it->lower_bound(string()); it->valid(); it->next()) {
        bufferlist bl = it->value();
        auto p = bl.cbegin();
        bluestore_deferred_transaction_t wt;
        try {
          decode(wt, p);
        } catch (buffer::error& e) {
          derr << "fsck error: failed to decode deferred txn "
               << pretty_binary_string(it->key()) << dendl;
          if (repair) {
            dout(20) << __func__ << " undecodable deferred TXN record, key: '"
                     << pretty_binary_string(it->key())
                     << "', removing" << dendl;
            repairer.remove_key(db, PREFIX_DEFERRED, it->key());
          }
          continue;
        }
        dout(20) << __func__ << " deferred " << wt.seq
                 << " ops " << wt.ops.size()
                 << " released 0x" << std::hex << wt.released << std::dec
                 << dendl;
        for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
          apply_for_bitset_range(
            e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
            [&](uint64_t pos, mempool_dynamic_bitset& bs) {
              bs.set(pos);
            });
        }
      }
    }
    dout(1) << __func__ << " checking freelist vs allocated" << dendl;
    {
      // remove bluefs_extents from used set since the freelist doesn't
      // know they are allocated.
      for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
        apply_for_bitset_range(
          e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
          [&](uint64_t pos, mempool_dynamic_bitset& bs) {
            bs.reset(pos);
          });
      }
      fm->enumerate_reset();
      uint64_t offset, length;
      while (fm->enumerate_next(db, &offset, &length)) {
        bool intersects = false;
        apply_for_bitset_range(
          offset, length, fm->get_alloc_size(), used_blocks,
          [&](uint64_t pos, mempool_dynamic_bitset& bs) {
            if (bs.test(pos)) {
              if (offset == SUPER_RESERVED &&
                  length == min_alloc_size - SUPER_RESERVED) {
                // this is due to the change just after luminous to min_alloc_size
                // granularity allocations, and our baked in assumption at the top
                // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
                // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
                // since we will never allocate this region below min_alloc_size.
                dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
                         << " and min_alloc_size, 0x" << std::hex << offset << "~"
                         << length << std::dec << dendl;
              } else {
                intersects = true;
                if (repair) {
                  repairer.fix_false_free(db, fm,
                                          pos * min_alloc_size,
                                          min_alloc_size);
                }
              }
            } else {
              bs.set(pos);
            }
          });
        if (intersects) {
          derr << "fsck error: free extent 0x" << std::hex << offset
               << "~" << length << std::dec
               << " intersects allocated blocks" << dendl;
          ++errors;
        }
      }
      fm->enumerate_reset();
      size_t count = used_blocks.count();
      if (used_blocks.size() != count) {
        ceph_assert(used_blocks.size() > count);
        used_blocks.flip();
        size_t start = used_blocks.find_first();
        while (start != decltype(used_blocks)::npos) {
          size_t cur = start;
          while (true) {
            size_t next = used_blocks.find_next(cur);
            if (next != cur + 1) {
              ++errors;
              derr << "fsck error: leaked extent 0x" << std::hex
                   << ((uint64_t)start * fm->get_alloc_size()) << "~"
                   << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
                   << dendl;
              if (repair) {
                repairer.fix_leaked(db, fm,
                                    start * min_alloc_size,
                                    (cur + 1 - start) * min_alloc_size);
              }
              start = next;
              break;
            }
            cur = next;
          }
        }
        used_blocks.flip();
      }
    }
  }
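  // Illustrative sketch (an assumption, not upstream code):
  // apply_for_bitset_range visits every alloc-size granule covered by
  // [offset, offset+length) and hands the callback that granule's bit
  // position, so a pass that marks a raw byte range as "in use" in the
  // block bitmap can be written as:
  //
  //   apply_for_bitset_range(
  //     0x10000, 0x8000, fm->get_alloc_size(), used_blocks,
  //     [&](uint64_t pos, mempool_dynamic_bitset& bs) {
  //       bs.set(pos);  // mark this granule as allocated
  //     });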
  if (repair) {
    if (!per_pool_omap) {
      dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
      repairer.fix_per_pool_omap(db);
    }
    dout(5) << __func__ << " applying repair results" << dendl;
    repaired = repairer.apply(db);
    dout(5) << __func__ << " repair applied" << dendl;
  }

  dout(2) << __func__ << " " << num_objects << " objects, "
          << num_sharded_objects << " of them sharded. "
          << dendl;
  dout(2) << __func__ << " " << num_extents << " extents to "
          << num_blobs << " blobs, "
          << num_spanning_blobs << " spanning, "
          << num_shared_blobs << " shared."
          << dendl;

  utime_t duration = ceph_clock_now() - start;
  dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
          << warnings << " warnings, "
          << repaired << " repaired, "
          << (errors + warnings - (int)repaired) << " remaining in "
          << duration << " seconds" << dendl;

  // In non-repair mode we return only the error count, since that is what
  // indicates whether the store state is OK.
  // In repair mode both errors and warnings are taken into account, since
  // the repaired counter relates to both.
  return repair ? errors + warnings - (int)repaired : errors;
}
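// Illustrative sketch (an assumption, not upstream code): a caller that only
// wants a consistency verdict can treat any non-zero result of a plain
// (non-repair) fsck as "store is damaged":
//
//   int rc = store->fsck(false);  // shallow, no repair
//   if (rc != 0) {
//     derr << "fsck found " << rc << " error(s)" << dendl;
//   }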
/// methods to inject various errors that fsck can repair
void BlueStore::inject_broken_shared_blob_key(const string& key,
                                              const bufferlist& bl)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->set(PREFIX_SHARED_BLOB, key, bl);
  db->submit_transaction_sync(txn);
}

void BlueStore::inject_leaked(uint64_t len)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  PExtentVector exts;
  int64_t alloc_len = alloc->allocate(len, min_alloc_size,
                                      min_alloc_size * 256, 0, &exts);
  ceph_assert(alloc_len >= (int64_t)len);
  for (auto& p : exts) {
    fm->allocate(p.offset, p.length, txn);
  }
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
{
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{c->lock}; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  }

  bool injected = false;
  txn = db->get_transaction();
  auto& em = o->extent_map.extent_map;
  std::vector<const PExtentVector*> v;
  if (em.size()) {
    v.push_back(&em.begin()->blob->get_blob().get_extents());
  }
  if (em.size() > 1) {
    auto it = em.end();
    --it;
    v.push_back(&(it->blob->get_blob().get_extents()));
  }
  for (auto pext : v) {
    auto p = pext->begin();
    while (p != pext->end()) {
      if (p->is_valid()) {
        dout(20) << __func__ << " release 0x" << std::hex << p->offset
                 << "~" << p->length << std::dec << dendl;
        fm->release(p->offset, p->length, txn);
        injected = true;
        break;
      }
      ++p;
    }
  }
  ceph_assert(injected);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_legacy_omap()
{
  dout(1) << __func__ << dendl;
  per_pool_omap = false;
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->rmkey(PREFIX_SUPER, "per_pool_omap");
  db->submit_transaction_sync(txn);
}

void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
{
  dout(1) << __func__ << " "
          << cid << " " << oid << dendl;
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{ c->lock }; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
  }
  o->onode.clear_flag(
    bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PGMETA_OMAP);
  txn = db->get_transaction();
  _record_onode(o, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
{
  BlueStoreRepairer repairer;
  repairer.fix_statfs(db, key, new_statfs);
  repairer.apply(db);
}

void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
{
  KeyValueDB::Transaction t = db->get_transaction();
  volatile_statfs v;
  v = new_statfs;
  bufferlist bl;
  v.encode(bl);
  t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
  db->submit_transaction_sync(t);
}
void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
                                    coll_t cid2, ghobject_t oid2,
                                    uint64_t offset)
{
  OnodeRef o1;
  CollectionRef c1 = _get_collection(cid1);
  ceph_assert(c1);
  {
    std::unique_lock l{c1->lock}; // just to avoid internal asserts
    o1 = c1->get_onode(oid1, false);
    ceph_assert(o1);
    o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
  }
  OnodeRef o2;
  CollectionRef c2 = _get_collection(cid2);
  ceph_assert(c2);
  {
    std::unique_lock l{c2->lock}; // just to avoid internal asserts
    o2 = c2->get_onode(oid2, false);
    ceph_assert(o2);
    o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
  }
  Extent& e1 = *(o1->extent_map.seek_lextent(offset));
  Extent& e2 = *(o2->extent_map.seek_lextent(offset));

  // require onode/extent layout to be the same (and simple)
  // to make things easier
  ceph_assert(o1->onode.extent_map_shards.empty());
  ceph_assert(o2->onode.extent_map_shards.empty());
  ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
  ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
  ceph_assert(e1.logical_offset == e2.logical_offset);
  ceph_assert(e1.length == e2.length);
  ceph_assert(e1.blob_offset == e2.blob_offset);

  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  // along with misreference error this will create space leaks errors
  e2.blob->dirty_blob() = e1.blob->get_blob();
  o2->extent_map.dirty_range(offset, e2.length);
  o2->extent_map.update(txn, false);

  _record_onode(o2, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
                                            int16_t blob_id)
{
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{ c->lock }; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  }

  BlobRef b = c->new_blob();
  b->id = blob_id;
  o->extent_map.spanning_blob_map[blob_id] = b;

  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  _record_onode(o, txn);
  db->submit_transaction_sync(txn);
}
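// Illustrative usage (an assumption, mirroring how store test suites
// typically drive the injectors above): corrupt on purpose, check that fsck
// notices the damage, then check that repair clears it.
//
//   store->inject_leaked(0x10000);
//   ASSERT_NE(store->fsck(false), 0);    // damage is detected
//   ASSERT_EQ(store->repair(false), 0);  // damage is repaired
//   ASSERT_EQ(store->fsck(false), 0);    // store is clean again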
void BlueStore::collect_metadata(map<string,string> *pm)
{
  dout(10) << __func__ << dendl;
  bdev->collect_metadata("bluestore_bdev_", pm);
  if (bluefs) {
    (*pm)["bluefs"] = "1";
    // this value is for backward compatibility only
    (*pm)["bluefs_single_shared_device"] = \
      stringify((int)bluefs_layout.single_shared_device());
    (*pm)["bluefs_dedicated_db"] = \
      stringify((int)bluefs_layout.dedicated_db);
    (*pm)["bluefs_dedicated_wal"] = \
      stringify((int)bluefs_layout.dedicated_wal);
    bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
  } else {
    (*pm)["bluefs"] = "0";
  }

  // report numa mapping for underlying devices
  int node = -1;
  set<int> nodes;
  set<string> failed;
  int r = get_numa_node(&node, &nodes, &failed);
  if (r >= 0) {
    if (!failed.empty()) {
      (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
    }
    if (!nodes.empty()) {
      dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
      (*pm)["objectstore_numa_nodes"] = stringify(nodes);
    } else if (node >= 0) {
      (*pm)["objectstore_numa_node"] = stringify(node);
    }
  }
}
int BlueStore::get_numa_node(
  int *final_node,
  set<int> *out_nodes,
  set<string> *out_failed)
{
  int node = -1;
  set<string> devices;
  get_devices(&devices);
  set<int> nodes;
  set<string> failed;
  for (auto& devname : devices) {
    int n;
    BlkDev bdev(devname);
    int r = bdev.get_numa_node(&n);
    if (r < 0) {
      dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
               << dendl;
      failed.insert(devname);
      continue;
    }
    dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
             << dendl;
    nodes.insert(n);
    if (node < 0) {
      node = n;
    }
  }
  if (node >= 0 && nodes.size() == 1 && failed.empty()) {
    *final_node = node;
  }
  if (out_nodes) {
    *out_nodes = nodes;
  }
  if (out_failed) {
    *out_failed = failed;
  }
  return 0;
}

int BlueStore::get_devices(set<string> *ls)
{
  if (bdev) {
    bdev->get_devices(ls);
    if (bluefs) {
      bluefs->get_devices(ls);
    }
    return 0;
  }

  // grumble, we haven't started up yet.
  int r = _open_path();
  // ... (cleanup on the intermediate error paths elided)
  r = _open_fsid(false);
  r = _read_fsid(&fsid);
  r = _open_bdev(false);
  r = _minimal_open_bluefs(false);
  bdev->get_devices(ls);
  if (bluefs) {
    bluefs->get_devices(ls);
  }
  _minimal_close_bluefs();
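// Illustrative sketch (an assumption, not upstream code): the two helpers
// above combine to answer "can this OSD be pinned to a single NUMA node?":
//
//   int node = -1;
//   std::set<int> nodes;
//   std::set<std::string> failed;
//   if (store->get_numa_node(&node, &nodes, &failed) == 0 && node >= 0) {
//     // every backing device reported the same node; pinning is safe
//   }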
void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
{
  buf->omap_allocated =
    db->estimate_prefix_size(PREFIX_OMAP, string()) +
    db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());

  uint64_t bfree = alloc->get_free();

  if (bluefs) {
    int64_t bluefs_total = bluefs->get_total(bluefs_layout.shared_bdev);
    int64_t bluefs_free = bluefs->get_free(bluefs_layout.shared_bdev);
    // part of our shared device is "free" according to BlueFS, but we
    // can't touch bluestore_bluefs_min of it.
    int64_t shared_available = std::min(
      bluefs_free,
      int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
    buf->internally_reserved = bluefs_total - shared_available;
    if (shared_available > 0) {
      bfree += shared_available;
    }
    // include dedicated db, too, if that isn't the shared device.
    if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
      buf->total += bluefs->get_total(BlueFS::BDEV_DB);
    }
    // call any non-omap bluefs space "internal metadata"
    buf->internal_metadata =
      std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
      - buf->omap_allocated;
  }

  uint64_t thin_total, thin_avail;
  if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
    buf->total += thin_total;

    // we are limited by both the size of the virtual device and the
    // underlying physical device.
    bfree = std::min(bfree, thin_avail);

    buf->allocated = thin_total - thin_avail;
  } else {
    buf->total += bdev->get_size();
  }
  buf->available = bfree;
int BlueStore::statfs(struct store_statfs_t *buf,
                      osd_alert_list_t* alerts)
{
  if (alerts) {
    alerts->clear();
    _log_alerts(*alerts);
  }
  _get_statfs_overall(buf);
  {
    std::lock_guard l(vstatfs_lock);
    buf->allocated = vstatfs.allocated();
    buf->data_stored = vstatfs.stored();
    buf->data_compressed = vstatfs.compressed();
    buf->data_compressed_original = vstatfs.compressed_original();
    buf->data_compressed_allocated = vstatfs.compressed_allocated();
  }

  dout(20) << __func__ << " " << *buf << dendl;
  return 0;
}

int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
                           bool *out_per_pool_omap)
{
  dout(20) << __func__ << " pool " << pool_id << dendl;

  if (!per_pool_stat_collection) {
    dout(20) << __func__ << " not supported in legacy mode " << dendl;
    return -ENOTSUP;
  }

  {
    std::lock_guard l(vstatfs_lock);
    osd_pools[pool_id].publish(buf);
  }

  string key_prefix;
  _key_encode_u64(pool_id, &key_prefix);
  buf->omap_allocated = db->estimate_prefix_size(PREFIX_PERPOOL_OMAP,
                                                 key_prefix);
  *out_per_pool_omap = per_pool_omap;

  dout(10) << __func__ << *buf << dendl;
  return 0;
}
void BlueStore::_check_legacy_statfs_alert()
{
  string s;
  if (!per_pool_stat_collection &&
      cct->_conf->bluestore_warn_on_legacy_statfs) {
    s = "legacy statfs reporting detected, "
        "suggest to run store repair to get consistent statistic reports";
  }
  std::lock_guard l(qlock);
  legacy_statfs_alert = s;
}

void BlueStore::_check_no_per_pool_omap_alert()
{
  string s;
  if (!per_pool_omap &&
      cct->_conf->bluestore_warn_on_no_per_pool_omap) {
    s = "legacy (not per-pool) omap detected, "
        "suggest to run store repair to measure per-pool omap usage";
  }
  std::lock_guard l(qlock);
  no_per_pool_omap_alert = s;
}
BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
{
  std::shared_lock l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}

void BlueStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
  // _reap_collections and this run in the same thread,
  // so no lock is needed.
  removed_collections.push_back(c);
}

void BlueStore::_reap_collections()
{
  list<CollectionRef> removed_colls;
  {
    // _queue_reap_collection and this run in the same thread,
    // so no lock is needed.
    if (!removed_collections.empty())
      removed_colls.swap(removed_collections);
    else
      return;
  }

  list<CollectionRef>::iterator p = removed_colls.begin();
  while (p != removed_colls.end()) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
    if (c->onode_map.map_any([&](Onode* o) {
          ceph_assert(!o->exists);
          if (o->flushing_count.load()) {
            dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
                     << " flush_txns " << o->flushing_count << dendl;
            return true;
          }
          return false;
        })) {
      ++p;
      continue;
    }
    c->onode_map.clear();
    p = removed_colls.erase(p);
    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
  }
  if (removed_colls.empty()) {
    dout(10) << __func__ << " all reaped" << dendl;
  } else {
    removed_collections.splice(removed_collections.begin(), removed_colls);
  }
}
void BlueStore::_update_cache_logger()
{
  uint64_t num_onodes = 0;
  uint64_t num_pinned_onodes = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_buffers = 0;
  uint64_t num_buffer_bytes = 0;
  for (auto c : onode_cache_shards) {
    c->add_stats(&num_onodes, &num_pinned_onodes);
  }
  for (auto c : buffer_cache_shards) {
    c->add_stats(&num_extents, &num_blobs,
                 &num_buffers, &num_buffer_bytes);
  }
  logger->set(l_bluestore_onodes, num_onodes);
  logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
  logger->set(l_bluestore_extents, num_extents);
  logger->set(l_bluestore_blobs, num_blobs);
  logger->set(l_bluestore_buffers, num_buffers);
  logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
}
ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
{
  return _get_collection(cid);
}

ObjectStore::CollectionHandle BlueStore::create_new_collection(
  const coll_t& cid)
{
  std::unique_lock l{coll_lock};
  auto c = ceph::make_ref<Collection>(
    this,
    onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
    buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
    cid);
  new_coll_map[cid] = c;
  _osr_attach(c.get());
  return c;
}

void BlueStore::set_collection_commit_queue(
  const coll_t& cid,
  ContextQueue *commit_queue)
{
  if (commit_queue) {
    std::shared_lock l(coll_lock);
    if (coll_map.count(cid)) {
      coll_map[cid]->commit_queue = commit_queue;
    } else if (new_coll_map.count(cid)) {
      new_coll_map[cid]->commit_queue = commit_queue;
    }
  }
}
bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return false;

  bool r = true;
  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      r = false;
  }
  return r;
}

int BlueStore::stat(
  CollectionHandle &c_,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      return -ENOENT;
    st->st_size = o->onode.size;
    st->st_blksize = 4096;
    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
  }

  int r = 0;
  if (_debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  return r;
}

int BlueStore::set_collection_opts(
  CollectionHandle& ch,
  const pool_opts_t& opts)
{
  Collection *c = static_cast<Collection *>(ch.get());
  dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
  if (!c->exists)
    return -ENOENT;
  std::unique_lock l{c->lock};
  c->pool_opts = opts;
  return 0;
}
int BlueStore::read(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (offset == length && offset == 0)
      length = o->onode.size;

    r = _do_read(c, o, offset, length, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
             cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
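// Illustrative sketch (an assumption, not upstream code): a caller that
// wants the data it reads to stay cached can pass the WILLNEED fadvise
// flag, which _do_read() below turns into a buffered read:
//
//   bufferlist bl;
//   int r = store->read(ch, oid, 0, 4096, bl,
//                       CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);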
void BlueStore::_read_cache(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  int read_cache_policy,
  ready_regions_t& ready_regions,
  blobs2read_t& blobs2read)
{
  // build blob-wise list of stuff to read (that isn't cached)
  unsigned left = length;
  uint64_t pos = offset;
  auto lp = o->extent_map.seek_lextent(offset);
  while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
        break;
      }
      dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
               << std::dec << dendl;
      pos += hole;
      left -= hole;
    }
    BlobRef& bptr = lp->blob;
    unsigned l_off = pos - lp->logical_offset;
    unsigned b_off = l_off + lp->blob_offset;
    unsigned b_len = std::min(left, lp->length - l_off);

    ready_regions_t cache_res;
    interval_set<uint32_t> cache_interval;
    bptr->shared_blob->bc.read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
      read_cache_policy);
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need 0x" << b_off << "~" << b_len
             << " cache has 0x" << cache_interval
             << std::dec << dendl;

    auto pc = cache_res.begin();
    uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
          pc->first == b_off) {
        l = pc->second.length();
        ready_regions[pos].claim(pc->second);
        dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        ++pc;
      } else {
        l = b_len;
        if (pc != cache_res.end()) {
          ceph_assert(pc->first > b_off);
          l = pc->first - b_off;
        }
        dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        // align the read to the blob's chunk size
        uint64_t r_off = b_off;
        uint64_t r_len = l;
        uint64_t front = r_off % chunk_size;
        if (front) {
          r_off -= front;
          r_len += front;
        }
        unsigned tail = r_len % chunk_size;
        if (tail) {
          r_len += chunk_size - tail;
        }
        bool merged = false;
        regions2read_t& r2r = blobs2read[bptr];
        if (r2r.size()) {
          read_req_t& pre = r2r.back();
          if (r_off <= (pre.r_off + pre.r_len)) {
            front += (r_off - pre.r_off);
            pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
            pre.regs.emplace_back(region_t(pos, b_off, l, front));
            merged = true;
          }
        }
        if (!merged) {
          read_req_t req(r_off, r_len);
          req.regs.emplace_back(region_t(pos, b_off, l, front));
          r2r.emplace_back(std::move(req));
        }
      }
      pos += l;
      b_off += l;
      left -= l;
      b_len -= l;
    }
    ++lp;
  }
}
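// Worked example for the chunk alignment above (illustrative numbers only):
// with chunk_size = 0x1000, a wanted range of r_off = 0x1800, r_len = 0x900
// has front = 0x800, so widening gives r_off = 0x1000, r_len = 0x1100;
// then tail = 0x100 and r_len is rounded up to 0x2000. The device read thus
// always covers whole checksum/compression chunks, [0x1000, 0x3000).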
int BlueStore::_prepare_read_ioc(
  blobs2read_t& blobs2read,
  vector<bufferlist>* compressed_blob_bls,
  IOContext* ioc)
{
  for (auto& p : blobs2read) {
    const BlobRef& bptr = p.first;
    regions2read_t& r2r = p.second;
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need " << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
      if (compressed_blob_bls->empty()) {
        // ensure we avoid any reallocation on subsequent blobs
        compressed_blob_bls->reserve(blobs2read.size());
      }
      compressed_blob_bls->push_back(bufferlist());
      bufferlist& bl = compressed_blob_bls->back();
      auto r = bptr->get_blob().map(
        0, bptr->get_blob().get_ondisk_length(),
        [&](uint64_t offset, uint64_t length) {
          int r = bdev->aio_read(offset, length, &bl, ioc);
          if (r < 0)
            return r;
          return 0;
        });
      if (r < 0) {
        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
        if (r == -EIO) {
          // propagate EIO to caller
          return r;
        }
        ceph_assert(r == 0);
      }
    } else {
      // read the pieces
      for (auto& req : r2r) {
        dout(20) << __func__ << " region 0x" << std::hex
                 << req.regs.front().logical_offset
                 << ": 0x" << req.regs.front().blob_xoffset
                 << " reading 0x" << req.r_off
                 << "~" << req.r_len << std::dec << dendl;

        auto r = bptr->get_blob().map(
          req.r_off, req.r_len,
          [&](uint64_t offset, uint64_t length) {
            int r = bdev->aio_read(offset, length, &req.bl, ioc);
            if (r < 0)
              return r;
            return 0;
          });
        if (r < 0) {
          derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
               << dendl;
          if (r == -EIO) {
            // propagate EIO to caller
            return r;
          }
          ceph_assert(r == 0);
        }
        ceph_assert(req.bl.length() == req.r_len);
      }
    }
  }
  return 0;
}
int BlueStore::_generate_read_result_bl(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  ready_regions_t& ready_regions,
  vector<bufferlist>& compressed_blob_bls,
  blobs2read_t& blobs2read,
  bool buffered,
  bool* csum_error,
  bufferlist& bl)
{
  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    const BlobRef& bptr = b2r_it->first;
    regions2read_t& r2r = b2r_it->second;
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need 0x" << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      ceph_assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
                       r2r.front().regs.front().logical_offset) < 0) {
        *csum_error = true;
        return -EIO;
      }
      bufferlist raw_bl;
      auto r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
        return r;
      if (buffered) {
        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
                                       raw_bl);
      }
      for (auto& req : r2r) {
        for (auto& r : req.regs) {
          ready_regions[r.logical_offset].substr_of(
            raw_bl, r.blob_xoffset, r.length);
        }
      }
    } else {
      for (auto& req : r2r) {
        if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
                         req.regs.front().logical_offset) < 0) {
          *csum_error = true;
          return -EIO;
        }
        if (buffered) {
          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
                                         req.r_off, req.bl);
        }

        // prune and keep result
        for (const auto& r : req.regs) {
          ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
        }
      }
    }
    ++b2r_it;
  }

  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
  uint64_t pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": data from 0x" << pr->first << "~" << pr->second.length()
               << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
    } else {
      uint64_t l = length - pos;
      if (pr != pr_end) {
        ceph_assert(pr->first > pos + offset);
        l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": zeros for 0x" << (pos + offset) << "~" << l
               << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
  }
  ceph_assert(bl.length() == length);
  ceph_assert(pos == length);
  ceph_assert(pr == pr_end);
  return 0;
}
int BlueStore::_do_read(
  Collection *c,
  OnodeRef o,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;
  bl.clear();

  if (offset >= o->onode.size) {
    return r;
  }

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  auto start = mono_clock::now();
  o->extent_map.fault_range(db, offset, length);
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  // for deep-scrub, we only read dirty cache and bypass clean cache in
  // order to read underlying block device in case there are silent disk errors.
  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
  }

  // build blob-wise list of stuff to read (that isn't cached)
  ready_regions_t ready_regions;
  blobs2read_t blobs2read;
  _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);

  // read raw blob data.
  start = mono_clock::now(); // for the sake of simplicity
                             // measure the whole block below.
                             // The error isn't that much...
  vector<bufferlist> compressed_blob_bls;
  IOContext ioc(cct, NULL, true); // allow EIO
  r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
  // we always issue aio for reading, so errors other than EIO are not allowed
  if (r < 0)
    return r;

  int64_t num_ios = length;
  if (ioc.has_pending_aios()) {
    num_ios = -ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  bool csum_error = false;
  r = _generate_read_result_bl(o, offset, length, ready_regions,
                               compressed_blob_bls, blobs2read,
                               buffered, &csum_error, bl);
  if (csum_error) {
    // Handles spurious read errors caused by a kernel bug.
    // We sometimes get all-zero pages as a result of the read under
    // high memory pressure. Retrying the failing read succeeds in most
    // cases.
    // See also: http://tracker.ceph.com/issues/22464
    if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
      return -EIO;
    }
    return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
  }
  r = bl.length();
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
            << " failed " << std::dec << retry_count
            << " times before succeeding" << dendl;
  }
  return r;
}
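// Note: the retry bound used above comes from the bluestore_retry_disk_reads
// option; an illustrative ceph.conf snippet (values are an example only):
//
//   [osd]
//   bluestore_retry_disk_reads = 3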
int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
                            uint64_t logical_offset) const
{
  int bad;
  uint64_t bad_csum;
  auto start = mono_clock::now();
  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
  if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
      (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
    derr << __func__ << " injecting bluestore checksum verification error" << dendl;
    bad = blob_xoffset;
    r = -1;
    bad_csum = 0xDEADBEEF;
  }
  if (r < 0) {
    if (r == -1) {
      PExtentVector pex;
      blob->map(
        bad,
        blob->get_csum_chunk_size(),
        [&](uint64_t offset, uint64_t length) {
          pex.emplace_back(bluestore_pextent_t(offset, length));
          return 0;
        });
      derr << __func__ << " bad "
           << Checksummer::get_csum_type_string(blob->csum_type)
           << "/0x" << std::hex << blob->get_csum_chunk_size()
           << " checksum at blob offset 0x" << bad
           << ", got 0x" << bad_csum << ", expected 0x"
           << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
           << ", device location " << pex
           << ", logical extent 0x" << std::hex
           << (logical_offset + bad - blob_xoffset) << "~"
           << blob->get_csum_chunk_size() << std::dec
           << ", object " << o->oid
           << dendl;
    } else {
      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
    }
  }
  log_latency(__func__,
    l_bluestore_csum_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  if (cct->_conf->bluestore_ignore_data_csum) {
    return 0;
  }
  return r;
}
int BlueStore::_decompress(bufferlist& source, bufferlist* result)
{
  int r = 0;
  auto start = mono_clock::now();
  auto i = source.cbegin();
  bluestore_compression_header_t chdr;
  decode(chdr, i);
  int alg = int(chdr.type);
  CompressorRef cp = compressor;
  if (!cp || (int)cp->get_type() != alg) {
    cp = Compressor::create(cct, alg);
  }

  if (!cp.get()) {
    // if the compressor isn't available we have to error out: there is
    // no way to return the decompressed data otherwise.
    const char* alg_name = Compressor::get_comp_alg_name(alg);
    derr << __func__ << " can't load decompressor " << alg_name << dendl;
    _set_compression_alert(false, alg_name);
    r = -EIO;
  } else {
    r = cp->decompress(i, chdr.length, *result);
    if (r < 0) {
      derr << __func__ << " decompression failed with exit code " << r << dendl;
      r = -EIO;
    }
  }
  log_latency(__func__,
    l_bluestore_decompress_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
// this stores fiemap into interval_set, other variations
// use it internally
int BlueStore::_fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  interval_set<uint64_t>& destset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      return -ENOENT;
    }
    _dump_onode<30>(cct, *o);

    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
             << " size 0x" << o->onode.size << std::dec << dendl;

    boost::intrusive::set<Extent>::iterator ep, eend;
    if (offset >= o->onode.size)
      goto out;

    if (offset + length > o->onode.size) {
      length = o->onode.size - offset;
    }

    o->extent_map.fault_range(db, offset, length);
    eend = o->extent_map.extent_map.end();
    ep = o->extent_map.seek_lextent(offset);
    while (length > 0) {
      dout(20) << __func__ << " offset " << offset << dendl;
      if (ep != eend && ep->logical_offset + ep->length <= offset) {
        ++ep;
        continue;
      }

      uint64_t x_len = length;
      if (ep != eend && ep->logical_offset <= offset) {
        uint64_t x_off = offset - ep->logical_offset;
        x_len = std::min(x_len, ep->length - x_off);
        dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
                 << x_len << std::dec << " blob " << ep->blob << dendl;
        destset.insert(offset, x_len);
        length -= x_len;
        offset += x_len;
        if (x_off + x_len == ep->length)
          ++ep;
        continue;
      }
      if (ep != eend &&
          ep->logical_offset > offset &&
          ep->logical_offset - offset < x_len) {
        x_len = ep->logical_offset - offset;
      }
      offset += x_len;
      length -= x_len;
    }
  }

 out:
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size = 0x(" << destset << ")" << std::dec << dendl;
  return 0;
}
int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    encode(m, bl);
  }
  return r;
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  map<uint64_t, uint64_t>& destmap)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    destmap = std::move(m).detach();
  }
  return r;
}
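// Illustrative caller-side sketch (an assumption, not upstream code): the
// map flavour above returns the allocated logical extents of an object as
// offset -> length pairs:
//
//   std::map<uint64_t, uint64_t> extents;
//   if (store->fiemap(ch, oid, 0, obj_size, extents) == 0) {
//     for (auto& [off, len] : extents) {
//       std::cout << std::hex << off << "~" << len << std::dec << "\n";
//     }
//   }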
int BlueStore::readv(
  CollectionHandle &c_,
  const ghobject_t& oid,
  interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " fiemap " << m << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    r = _do_readv(c, o, m, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
             cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " fiemap " << m << std::dec
           << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
int BlueStore::_do_readv(
  Collection *c,
  OnodeRef o,
  const interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " fiemap " << m << std::hex
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }
  // this method must be idempotent since we may call it several times
  // before we finally read the expected result.
  bl.clear();

  // call fiemap first!
  ceph_assert(m.range_start() <= o->onode.size);
  ceph_assert(m.range_end() <= o->onode.size);
  auto start = mono_clock::now();
  o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  IOContext ioc(cct, NULL, true); // allow EIO
  vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
  raw_results.reserve(m.num_intervals());
  int i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    raw_results.push_back({});
    _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
                std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
    r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
    // we always issue aio for reading, so errors other than EIO are not allowed
    if (r < 0)
      return r;
  }

  auto num_ios = m.size();
  if (ioc.has_pending_aios()) {
    num_ios = ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  ceph_assert(raw_results.size() == (size_t)m.num_intervals());
  i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    bool csum_error = false;
    bufferlist t;
    r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
                                 std::get<0>(raw_results[i]),
                                 std::get<1>(raw_results[i]),
                                 std::get<2>(raw_results[i]),
                                 buffered, &csum_error, t);
    if (csum_error) {
      // Handles spurious read errors caused by a kernel bug.
      // We sometimes get all-zero pages as a result of the read under
      // high memory pressure. Retrying the failing read succeeds in most
      // cases.
      // See also: http://tracker.ceph.com/issues/22464
      if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
        return -EIO;
      }
      return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
    }
    bl.claim_append(t);
  }
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read fiemap " << m
            << " failed " << retry_count << " times before succeeding"
            << dendl;
  }
  return bl.length();
}
int BlueStore::dump_onode(CollectionHandle &c_,
                          const ghobject_t& oid,
                          const string& section_name,
                          Formatter *f)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    // FIXME minor: actually the next line isn't enough to
    // load shared blobs. Leaving it as-is for now.
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);

    _dump_onode<0>(cct, *o);
    f->open_object_section(section_name.c_str());
    o->dump(f);
    f->close_section();
    r = 0;
  }
 out:
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::getattr(
  CollectionHandle &c_,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);
    mempool::bluestore_cache_meta::string k(name);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (!o->onode.attrs.count(k)) {
      r = -ENODATA;
      goto out;
    }
    value = o->onode.attrs[k];
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
           << " = " << r << dendl;
  return r;
}

int BlueStore::getattrs(
  CollectionHandle &c_,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    for (auto& i : o->onode.attrs) {
      aset.emplace(i.first.c_str(), i.second);
    }
    r = 0;
  }

 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::list_collections(vector<coll_t>& ls)
{
  std::shared_lock l(coll_lock);
  ls.reserve(coll_map.size());
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}

bool BlueStore::collection_exists(const coll_t& c)
{
  std::shared_lock l(coll_lock);
  return coll_map.count(c);
}

int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
                          &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
         << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
  return 0;
}
int BlueStore::collection_bits(CollectionHandle& ch)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  Collection *c = static_cast<Collection *>(ch.get());
  std::shared_lock l(c->lock);
  dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}

int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, false, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
int BlueStore::collection_list_legacy(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, true, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
{
  int r = 0;
  auto start_time = mono_clock::now();

  ghobject_t static_next;
  std::unique_ptr<CollectionListIterator> it;
  ghobject_t coll_range_temp_start, coll_range_temp_end;
  ghobject_t coll_range_start, coll_range_end;
  ghobject_t pend;
  bool set_next = false;
  bool temp = false;

  if (!pnext)
    pnext = &static_next;

  if (start.is_max() || start.hobj.is_max()) {
    goto out;
  }
  get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
                 &coll_range_temp_end, &coll_range_start, &coll_range_end);
  dout(20) << __func__
           << " range " << coll_range_temp_start
           << " to " << coll_range_temp_end
           << " and " << coll_range_start
           << " to " << coll_range_end
           << " start " << start << dendl;
  if (legacy) {
    it = std::make_unique<SimpleCollectionListIterator>(
      cct, db->get_iterator(PREFIX_OBJ));
  } else {
    it = std::make_unique<SortedCollectionListIterator>(
      db->get_iterator(PREFIX_OBJ));
  }
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(coll_range_temp_start);
    temp = true;
  } else {
    if (start.hobj.is_temp()) {
      temp = true;
      ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
    } else {
      temp = false;
      ceph_assert(start >= coll_range_start && start < coll_range_end);
    }
    dout(20) << __func__ << " temp=" << (int)temp << dendl;
    it->lower_bound(start);
  }
  if (end.hobj.is_max()) {
    pend = temp ? coll_range_temp_end : coll_range_end;
  } else {
    if (end.hobj.is_temp()) {
      if (temp) {
        pend = end;
      } else {
        goto out;
      }
    } else {
      pend = temp ? coll_range_temp_end : end;
    }
  }
  dout(20) << __func__ << " pend " << pend << dendl;
  while (true) {
    if (!it->valid() || it->is_ge(pend)) {
      if (!it->valid())
        dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
        dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
      if (temp) {
        if (end.hobj.is_temp()) {
          if (it->valid() && it->is_lt(coll_range_temp_end)) {
            *pnext = it->oid();
            set_next = true;
          }
          break;
        }
        dout(30) << __func__ << " switch to non-temp namespace" << dendl;
        temp = false;
        it->upper_bound(coll_range_start);
        if (end.hobj.is_max())
          pend = coll_range_end;
        else
          pend = end;
        dout(30) << __func__ << " pend " << pend << dendl;
        continue;
      }
      if (it->valid() && it->is_lt(coll_range_end)) {
        *pnext = it->oid();
        set_next = true;
      }
      break;
    }
    dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = it->oid();
      set_next = true;
      break;
    }
    ls->push_back(it->oid());
    it->next();
  }
 out:
  if (!set_next) {
    *pnext = ghobject_t::get_max();
  }
  log_latency_fn(__func__,
    l_bluestore_clist_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_collection_list_age,
    [&] (const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid =" << c->cid
           << " start " << start << " end " << end
           << " max " << max;
      return ostr.str();
    });
  return r;
}
int BlueStore::omap_get(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  return _omap_get(c, oid, header, out);
}

int BlueStore::_omap_get(
  Collection *c,               ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  int r;
  if (!o || !o->exists) {
    r = -ENOENT;
  } else {
    r = _onode_omap_get(o, header, out);
  }
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::_onode_omap_get(
  const OnodeRef &o,           ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  int r = 0;
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_header(&head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
        dout(30) << __func__ << " got header" << dendl;
        *header = it->value();
      } else if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        string user_key;
        o->decode_omap_key(it->key(), &user_key);
        dout(20) << __func__ << " got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        (*out)[user_key] = it->value();
      }
      it->next();
    }
  }
 out:
  return r;
}
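// Illustrative sketch (an assumption, not upstream code): the scan above is
// bounded by two sentinel keys - everything belonging to one onode lives in
// [head, tail), with the omap header stored at exactly `head`:
//
//   string head, tail;
//   o->get_omap_header(&head);
//   o->get_omap_tail(&tail);
//   for (it->lower_bound(head); it->valid() && it->key() < tail; it->next()) {
//     // it->key() == head  -> this entry is the omap header
//     // otherwise          -> an encoded user key within this onode's range
//   }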
int BlueStore::omap_get_header(
  CollectionHandle &c_,  ///< [in] Collection containing oid
  const ghobject_t &oid, ///< [in] Object containing omap
  bufferlist *header,    ///< [out] omap header
  bool allow_eio         ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  {
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    if (!o->onode.has_omap())
      goto out;
    {
      string head;
      o->get_omap_header(&head);
      if (db->get(o->get_omap_prefix(), head, header) >= 0) {
        dout(30) << __func__ << " got header" << dendl;
      } else {
        dout(30) << __func__ << " no header" << dendl;
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_keys(
  CollectionHandle &c_,  ///< [in] Collection containing oid
  const ghobject_t &oid, ///< [in] Object containing omap
  set<string> *keys      ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  auto start1 = mono_clock::now();
  std::shared_lock l(c->lock);
  int r = 0;
  {
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    if (!o->onode.has_omap())
      goto out;
    {
      const string& prefix = o->get_omap_prefix();
      KeyValueDB::Iterator it = db->get_iterator(prefix);
      string head, tail;
      o->get_omap_key(string(), &head);
      o->get_omap_tail(&tail);
      it->lower_bound(head);
      while (it->valid()) {
        if (it->key() >= tail) {
          dout(30) << __func__ << " reached tail" << dendl;
          break;
        }
        string user_key;
        o->decode_omap_key(it->key(), &user_key);
        dout(20) << __func__ << " got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        keys->insert(user_key);
        it->next();
      }
    }
  }
 out:
  c->store->log_latency(
    __func__,
    l_bluestore_omap_get_keys_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const set<string> &keys,     ///< [in] Keys to get
  map<string, bufferlist> *out ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  int r = 0;
  {
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    if (!o->onode.has_omap()) {
      goto out;
    }
    {
      const string& prefix = o->get_omap_prefix();
      string final_key;
      o->get_omap_key(string(), &final_key);
      size_t base_key_len = final_key.size();
      for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
        final_key.resize(base_key_len); // keep prefix
        final_key += *p;
        bufferlist val;
        if (db->get(prefix, final_key, &val) >= 0) {
          dout(30) << __func__ << " got " << pretty_binary_string(final_key)
                   << " -> " << *p << dendl;
          out->insert(make_pair(*p, val));
        }
      }
    }
  }
 out:
  c->store->log_latency(
    __func__,
    l_bluestore_omap_get_values_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
#ifdef WITH_SEASTAR
int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const std::optional<string> &start_after, ///< [in] only return keys after this one
  map<string, bufferlist> *output ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  {
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    if (!o->onode.has_omap()) {
      goto out;
    }
    {
      ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
      if (start_after) {
        iter->upper_bound(*start_after);
      } else {
        iter->seek_to_first();
      }
      for (; iter->valid(); iter->next()) {
        output->insert(make_pair(iter->key(), iter->value()));
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
#endif
int BlueStore::omap_check_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  const set<string> &keys, ///< [in] Keys to check
  set<string> *out         ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  {
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    if (!o->onode.has_omap()) {
      goto out;
    }
    {
      const string& prefix = o->get_omap_prefix();
      string final_key;
      o->get_omap_key(string(), &final_key);
      size_t base_key_len = final_key.size();
      for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
        final_key.resize(base_key_len); // keep prefix
        final_key += *p;
        bufferlist val;
        if (db->get(prefix, final_key, &val) >= 0) {
          dout(30) << __func__ << " have " << pretty_binary_string(final_key)
                   << " -> " << *p << dendl;
          out->insert(*p);
        } else {
          dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
                   << " -> " << *p << dendl;
        }
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,  ///< [in] collection
  const ghobject_t &oid  ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  std::shared_lock l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}
// -----------------

uint64_t BlueStore::_get_ondisk_reserved() const {
  return round_up_to(
    std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
}

void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  ceph_assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}
int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      nid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      blobid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
      dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
    } else {
      ceph_abort_msg("Not Support extent freelist manager");
    }
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.cbegin();
      try {
        decode(ondisk_format, p);
      } catch (buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        auto p = bl.cbegin();
        try {
          decode(compat_ondisk_format, p);
        } catch (buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(10) << __func__ << " ondisk_format " << ondisk_format
             << " compat_ondisk_format " << compat_ondisk_format
             << dendl;
  }

  if (latest_ondisk_format < compat_ondisk_format) {
    derr << __func__ << " compat_ondisk_format is "
         << compat_ondisk_format << " but we only understand version "
         << latest_ondisk_format << dendl;
    return -EPERM;
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t val;
      decode(val, p);
      min_alloc_size = val;
      min_alloc_size_order = ctz(val);
      ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
             << std::dec << dendl;
  }

  _set_per_pool_omap();

  _set_alloc_sizes();
  _set_throttle_params();

  _set_compression();

  return 0;
}
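// Note: the reads above mirror what mkfs and _prepare_ondisk_format_super
// wrote into the PREFIX_SUPER namespace: nid_max, blobid_max,
// freelist_type, ondisk_format/min_compat_ondisk_format and
// min_alloc_size.  A store written by a newer OSD remains mountable only
// while our latest_ondisk_format >= its min_compat_ondisk_format.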
int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  if (ondisk_format < latest_ondisk_format) {
    ceph_assert(ondisk_format > 0);
    ceph_assert(ondisk_format < latest_ondisk_format);

    KeyValueDB::Transaction t = db->get_transaction();
    if (ondisk_format == 1) {
      // changes:
      // - super: added ondisk_format
      // - super: added min_readable_ondisk_format
      // - super: added min_compat_ondisk_format
      // - super: added min_alloc_size
      // - super: removed min_min_alloc_size
      {
        bufferlist bl;
        db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
        auto p = bl.cbegin();
        try {
          uint64_t val;
          decode(val, p);
          min_alloc_size = val;
        } catch (buffer::error& e) {
          derr << __func__ << " failed to read min_min_alloc_size" << dendl;
          return -EIO;
        }
        t->set(PREFIX_SUPER, "min_alloc_size", bl);
        t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
      }
      ondisk_format = 2;
    }
    if (ondisk_format == 2) {
      // changes:
      // - onode has FLAG_PER_POOL_OMAP.  Note that we do not know that *all*
      //   onodes are using the per-pool prefix until a repair is run; at that
      //   point the per_pool_omap=1 key will be set.
      // - super: added per_pool_omap key, which indicates that *all* objects
      //   are using the new prefix and key format
      ondisk_format = 3;
    }
    if (ondisk_format == 3) {
      // changes:
      // - FreelistManager keeps meta within bdev label
      int r = _write_out_fm_meta(0);
      ceph_assert(r == 0);
      ondisk_format = 4;
    }
    // This has to be the last operation
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    ceph_assert(r == 0);
  }
  dout(1) << __func__ << " done" << dendl;
  return 0;
}
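// The upgrade above is deliberately incremental: each "if (ondisk_format
// == N)" block migrates N -> N+1 and control falls into the next check,
// so a very old store walks the whole chain in one pass.  The new format
// number is persisted last, in the same sync transaction, which keeps the
// upgrade crash-consistent: either every migration step landed or none
// did.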
void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid) {
    ceph_assert(o->exists);
    return;
  }
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
  o->exists = true;
}

uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}
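// _assign_nid/_assign_blobid only bump in-memory atomics (nid_last,
// blobid_last); durability comes from the kv sync thread, which persists
// nid_max/blobid_max preallocation watermarks ahead of use.  A txc whose
// new ids ran past the persisted max must be submitted via the kv thread
// so the bumped max lands in an earlier or same transaction (see
// _txc_state_proc and _kv_sync_thread below).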
void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}
BlueStore::TransContext *BlueStore::_txc_create(
  Collection *c, OpSequencer *osr,
  list<Context*> *on_commits)
{
  TransContext *txc = new TransContext(cct, c, osr, on_commits);
  txc->t = db->get_transaction();
  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}
void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // one "io" for the kv commit
  auto ios = 1 + txc->ioc.get_num_ios();
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  txc->ios = ios;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}
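// Illustrative cost arithmetic (numbers hypothetical): a txc carrying two
// data aios and 8192 bytes, with throttle_cost_per_io at 670000, costs
// (1 + 2) * 670000 + 8192 = 2018192.  The "+1" is the kv commit io noted
// above.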
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());

  bufferlist bl;
  txc->statfs_delta.encode(bl);
  if (per_pool_stat_collection) {
    string key;
    get_pool_stat_key(txc->osd_pool_id, &key);
    txc->t->merge(PREFIX_STAT, key, bl);

    std::lock_guard l(vstatfs_lock);
    auto& stats = osd_pools[txc->osd_pool_id];
    stats += txc->statfs_delta;

    vstatfs += txc->statfs_delta; //non-persistent in this mode
  } else {
    txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);

    std::lock_guard l(vstatfs_lock);
    vstatfs += txc->statfs_delta;
  }
  txc->statfs_delta.reset();
}
void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
             << " " << txc->get_state_name() << dendl;
    switch (txc->state) {
    case TransContext::STATE_PREPARE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
        txc->state = TransContext::STATE_AIO_WAIT;
        txc->had_ios = true;
        _txc_aio_submit(txc);
        return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      {
        mono_clock::duration lat = throttle.log_state_latency(
          *txc, logger, l_bluestore_state_aio_wait_lat);
        if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
          dout(0) << __func__ << " slow aio_wait, txc = " << txc
                  << ", latency = " << lat
                  << dendl;
        }
      }

      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      ceph_assert(ceph_mutex_is_locked(txc->osr->qlock));  // see _txc_finish_io
      if (txc->had_ios) {
        ++txc->osr->txc_with_unstable_io;
      }
      throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
      txc->state = TransContext::STATE_KV_QUEUED;
      if (cct->_conf->bluestore_sync_submit_transaction) {
        if (txc->last_nid >= nid_max ||
            txc->last_blobid >= blobid_max) {
          dout(20) << __func__
                   << " last_{nid,blobid} exceeds max, submit via kv thread"
                   << dendl;
        } else if (txc->osr->kv_committing_serially) {
          dout(20) << __func__ << " prior txc submitted via kv thread, us too"
                   << dendl;
          // note: this is starvation-prone.  once we have a txc in a busy
          // sequencer that is committing serially it is possible to keep
          // submitting new transactions fast enough that we get stuck doing
          // so.  the alternative is to block here... fixme?
        } else if (txc->osr->txc_with_unstable_io) {
          dout(20) << __func__ << " prior txc(s) with unstable ios "
                   << txc->osr->txc_with_unstable_io.load() << dendl;
        } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
                   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
                   == 0) {
          dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
                   << dendl;
        } else {
          _txc_apply_kv(txc, true);
        }
      }
      {
        std::lock_guard l(kv_lock);
        kv_queue.push_back(txc);
        if (!kv_sync_in_progress) {
          kv_sync_in_progress = true;
          kv_cond.notify_one();
        }
        if (txc->state != TransContext::STATE_KV_SUBMITTED) {
          kv_queue_unsubmitted.push_back(txc);
          ++txc->osr->kv_committing_serially;
        }
        if (txc->had_ios) {
          kv_ios++;
        }
        kv_throttle_costs += txc->cost;
      }
      return;
    case TransContext::STATE_KV_SUBMITTED:
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
        txc->state = TransContext::STATE_DEFERRED_QUEUED;
        _deferred_queue(txc);
        return;
      }
      txc->state = TransContext::STATE_FINISHING;
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
      txc->state = TransContext::STATE_FINISHING;
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
           << " state " << txc->get_state_name() << dendl;
      ceph_abort_msg("unexpected txc state");
      return;
    }
  }
}
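// TransContext lifecycle driven by _txc_state_proc (deferred path in
// brackets):
//
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
//     -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP ->] FINISHING -> DONE
//
// The function is re-entered at each transition point: by aio completion,
// by the kv sync/finalize threads, and by the deferred-io machinery.
// That is why some cases end in "return" (resumed later) while others
// fall through inside the loop.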
void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard l(osr->qlock);
  txc->state = TransContext::STATE_IO_DONE;
  txc->ioc.release_running_aios();
  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->state < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
               << p->get_state_name() << dendl;
      return;
    }
    if (p->state > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
           p->state == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters) {
    osr->qcond.notify_all();
  }
}
void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
           << " onodes " << txc->onodes
           << " shared_blobs " << txc->shared_blobs
           << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    _record_onode(o, t);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      encode(*(sb->persistent), bl);
      dout(20) << __func__ << " shared_blob 0x"
               << std::hex << sbid << std::dec
               << " is " << bl.length() << " " << *sb << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}
void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
  os_apply_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
}
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
           << " allocated 0x" << txc->allocated
           << " released 0x" << txc->released
           << std::dec << dendl;

  // We have to handle the case where we allocate *and* deallocate the
  // same region in this transaction.  The freelist doesn't like that.
  // (Actually, the only thing that cares is the BitmapFreelistManager
  // debug check.  But that's important.)
  interval_set<uint64_t> tmp_allocated, tmp_released;
  interval_set<uint64_t> *pallocated = &txc->allocated;
  interval_set<uint64_t> *preleased = &txc->released;
  if (!txc->allocated.empty() && !txc->released.empty()) {
    interval_set<uint64_t> overlap;
    overlap.intersection_of(txc->allocated, txc->released);
    if (!overlap.empty()) {
      tmp_allocated = txc->allocated;
      tmp_allocated.subtract(overlap);
      tmp_released = txc->released;
      tmp_released.subtract(overlap);
      dout(20) << __func__ << " overlap 0x" << std::hex << overlap
               << ", new allocated 0x" << tmp_allocated
               << " released 0x" << tmp_released << std::dec
               << dendl;
      pallocated = &tmp_allocated;
      preleased = &tmp_released;
    }
  }

  // update freelist with non-overlap sets
  for (interval_set<uint64_t>::iterator p = pallocated->begin();
       p != pallocated->end();
       ++p) {
    fm->allocate(p.get_start(), p.get_len(), t);
  }
  for (interval_set<uint64_t>::iterator p = preleased->begin();
       p != preleased->end();
       ++p) {
    dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
             << "~" << p.get_len() << std::dec << dendl;
    fm->release(p.get_start(), p.get_len(), t);
  }

  _txc_update_store_statfs(txc);
}
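// Illustrative overlap case (offsets hypothetical): if one transaction
// allocates 0x10000~0x8000 and releases 0x12000~0x2000, the intersection
// 0x12000~0x2000 is subtracted from both sides first, leaving
// allocate {0x10000~0x2000, 0x14000~0x4000} and an empty release set, so
// the FreelistManager never sees the same region allocated and freed
// within a single kv transaction.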
void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
{
  ceph_assert(txc->state == TransContext::STATE_KV_QUEUED);
  {
#if defined(WITH_LTTNG)
    auto start = mono_clock::now();
#endif

    int r = cct->_conf->bluestore_debug_omit_kv_commit
      ? 0
      : db->submit_transaction(txc->t);
    ceph_assert(r == 0);
    txc->state = TransContext::STATE_KV_SUBMITTED;
    if (txc->osr->kv_submitted_waiters) {
      std::lock_guard l(txc->osr->qlock);
      txc->osr->qcond.notify_all();
    }

#if defined(WITH_LTTNG)
    if (txc->tracing) {
      tracepoint(
        bluestore,
        transaction_kv_submit_latency,
        txc->osr->get_sequencer_id(),
        txc->seq,
        sync_submit_transaction,
        ceph::to_seconds<double>(mono_clock::now() - start));
    }
#endif
  }

  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
               << dendl;
      if (--o->flushing_count == 0 && o->waiting_count.load()) {
        std::lock_guard l(o->flush_lock);
        o->flush_cond.notify_all();
      }
    }
  }
}
void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;
  throttle.complete_kv(*txc);
  {
    std::lock_guard l(txc->osr->qlock);
    txc->state = TransContext::STATE_KV_DONE;
    if (txc->ch->commit_queue) {
      txc->ch->commit_queue->queue(txc->oncommits);
    } else {
      finisher.queue(txc->oncommits);
    }
  }
  throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
  log_latency_fn(
    __func__,
    l_bluestore_commit_lat,
    mono_clock::now() - txc->start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) {
      return ", txc = " + stringify(txc);
    }
  );
}
void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  ceph_assert(txc->state == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->finish_write(txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  bool empty = false;
  bool submit_deferred = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard l(osr->qlock);
    txc->state = TransContext::STATE_DONE;
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
               << dendl;
      if (txc->state != TransContext::STATE_DONE) {
        if (txc->state == TransContext::STATE_PREPARE &&
            deferred_aggressive) {
          // for _osr_drain_preceding()
          notify = true;
        }
        if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
            osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
          submit_deferred = true;
        }
        break;
      }

      osr->q.pop_front();
      releasing_txc.push_back(*txc);
    }

    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }

    // only drain()/drain_preceding() need wakeup,
    // other cases use kv_submitted_waiters
    if (notify || empty) {
      osr->qcond.notify_all();
    }
  }

  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
    throttle.complete(*txc);
    delete txc;
  }

  if (submit_deferred) {
    // we're pinning memory; flush!  we could be more fine-grained here but
    // i'm not sure it's worth the bother.
    deferred_try_submit();
  }

  if (empty && osr->zombie) {
    std::lock_guard l(zombie_osr_lock);
    if (zombie_osr_set.erase(osr->cid)) {
      dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    } else {
      dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
               << dendl;
    }
  }
}
void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // it's expected we're called with lazy_release_lock already taken!
  if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
    int r = 0;
    if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
      r = bdev->queue_discard(txc->released);
      if (r == 0) {
        dout(10) << __func__ << "(queued) " << txc << " " << std::hex
                 << txc->released << std::dec << dendl;
        goto out;
      }
    } else if (cct->_conf->bdev_enable_discard) {
      for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
        bdev->discard(p.get_start(), p.get_len());
      }
    }
    dout(10) << __func__ << "(sync) " << txc << " " << std::hex
             << txc->released << std::dec << dendl;
    alloc->release(txc->released);
  }

out:
  txc->allocated.clear();
  txc->released.clear();
}
void BlueStore::_osr_attach(Collection *c)
{
  // note: caller has RWLock on coll_map
  auto q = coll_map.find(c->cid);
  if (q != coll_map.end()) {
    c->osr = q->second->osr;
    ldout(cct, 10) << __func__ << " " << c->cid
                   << " reusing osr " << c->osr << " from existing coll "
                   << q->second << dendl;
  } else {
    std::lock_guard l(zombie_osr_lock);
    auto p = zombie_osr_set.find(c->cid);
    if (p == zombie_osr_set.end()) {
      c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " fresh osr " << c->osr << dendl;
    } else {
      c->osr = p->second;
      zombie_osr_set.erase(p);
      ldout(cct, 10) << __func__ << " " << c->cid
                     << " resurrecting zombie osr " << c->osr << dendl;
      c->osr->zombie = false;
    }
  }
}
void BlueStore::_osr_register_zombie(OpSequencer *osr)
{
  std::lock_guard l(zombie_osr_lock);
  dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
  osr->zombie = true;
  auto i = zombie_osr_set.emplace(osr->cid, osr);
  // this is either a new insertion or the same osr is already there
  ceph_assert(i.second || i.first->second == osr);
}
void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}

void BlueStore::_osr_drain(OpSequencer *osr)
{
  dout(10) << __func__ << " " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain();
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}
void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  vector<OpSequencerRef> zombies;
  {
    std::shared_lock l(coll_lock);
    for (auto& i : coll_map) {
      s.insert(i.second->osr);
    }
  }
  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& i : zombie_osr_set) {
      s.insert(i.second);
      zombies.push_back(i.second);
    }
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& osr : zombies) {
      if (zombie_osr_set.erase(osr->cid)) {
        dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
        ceph_assert(osr->q.empty());
      } else if (osr->zombie) {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " already reaped" << dendl;
        ceph_assert(osr->q.empty());
      } else {
        dout(10) << __func__ << " empty zombie osr " << osr
                 << " resurrected" << dendl;
      }
    }
  }

  dout(10) << __func__ << " done" << dendl;
}
void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  finisher.start();
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}
void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{kv_lock};
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock l{kv_finalize_lock};
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  ceph_assert(removed_collections.empty());
  {
    std::lock_guard l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  finisher.wait_for_empty();
  finisher.stop();
  dout(10) << __func__ << " stopped" << dendl;
}
void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
  std::unique_lock l{kv_lock};
  ceph_assert(!kv_sync_started);
  kv_sync_started = true;
  kv_cond.notify_all();

  auto t0 = mono_clock::now();
  timespan twait = ceph::make_timespan(0);
  size_t kv_submitted = 0;

  while (true) {
    auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
    auto observation_period =
      ceph::make_timespan(period);
    auto elapsed = mono_clock::now() - t0;
    if (period && elapsed >= observation_period) {
      dout(5) << __func__ << " utilization: idle "
              << twait << " of " << elapsed
              << ", submitted: " << kv_submitted
              << dendl;
      t0 = mono_clock::now();
      twait = ceph::make_timespan(0);
      kv_submitted = 0;
    }
    ceph_assert(kv_committing.empty());
    if (kv_queue.empty() &&
        ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
         !deferred_aggressive)) {
      if (kv_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      auto t = mono_clock::now();
      kv_sync_in_progress = false;
      kv_cond.wait(l);
      twait += mono_clock::now() - t;

      dout(20) << __func__ << " wake" << dendl;
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      uint64_t aios = 0, costs = 0;

      dout(20) << __func__ << " committing " << kv_queue.size()
               << " submitting " << kv_queue_unsubmitted.size()
               << " deferred done " << deferred_done_queue.size()
               << " stable " << deferred_stable_queue.size()
               << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      aios = kv_ios;
      costs = kv_throttle_costs;
      kv_ios = 0;
      kv_throttle_costs = 0;
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable.  that means that if we do have done deferred
      // txcs AND we are not on a single device, we need to force a flush.
      if (bluefs && bluefs_layout.single_shared_device()) {
        if (aios) {
          force_flush = true;
        } else if (kv_committing.empty() && deferred_stable.empty()) {
          force_flush = true;  // there's nothing else to commit!
        } else if (deferred_aggressive) {
          force_flush = true;
        }
      } else {
        if (aios || !deferred_done.empty()) {
          force_flush = true;
        } else {
          dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
        }
      }

      if (force_flush) {
        dout(20) << __func__ << " num_aios=" << aios
                 << " force_flush=" << (int)force_flush
                 << ", flushing, deferred done->stable" << dendl;
        // flush/barrier on block device
        bdev->flush();

        // if we flush then deferred done are now deferred stable
        deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
                               deferred_done.end());
        deferred_done.clear();
      }
      auto after_flush = mono_clock::now();

      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max?  note that this covers both the
      // case where we are approaching the max and the case we passed
      // it.  in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
        bufferlist bl;
        encode(new_nid_max, bl);
        t->set(PREFIX_SUPER, "nid_max", bl);
        dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
        bufferlist bl;
        encode(new_blobid_max, bl);
        t->set(PREFIX_SUPER, "blobid_max", bl);
        dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }

      for (auto txc : kv_committing) {
        throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
        if (txc->state == TransContext::STATE_KV_QUEUED) {
          ++kv_submitted;
          _txc_apply_kv(txc, false);
          --txc->osr->kv_committing_serially;
        } else {
          ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
        }
        if (txc->had_ios) {
          --txc->osr->txc_with_unstable_io;
        }
      }

      // release throttle *before* we commit.  this allows new ops
      // to be prepared and enter pipeline while we are waiting on
      // the kv commit sync/flush.  then hopefully on the next
      // iteration there will already be ops awake.  otherwise, we
      // end up going to sleep, and then wake up when the very first
      // transaction is ready for commit.
      throttle.release_kv_throttle(costs);

      if (bluefs &&
          after_flush - bluefs_last_balance >
          ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
        bluefs_last_balance = after_flush;
        int r = _balance_bluefs_freespace();
        ceph_assert(r >= 0);
      }

      // cleanup sync deferred keys
      for (auto b : deferred_stable) {
        for (auto& txc : b->txcs) {
          bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
          ceph_assert(wt.released.empty()); // only kraken did this
          string key;
          get_deferred_key(wt.seq, &key);
          synct->rm_single_key(PREFIX_DEFERRED, key);
        }
      }

#if defined(WITH_LTTNG)
      auto sync_start = mono_clock::now();
#endif
      // submit synct synchronously (block and wait for it to commit)
      int r = cct->_conf->bluestore_debug_omit_kv_commit
        ? 0
        : db->submit_transaction_sync(synct);
      ceph_assert(r == 0);

      int committing_size = kv_committing.size();
      int deferred_size = deferred_stable.size();

#if defined(WITH_LTTNG)
      double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
      for (auto txc : kv_committing) {
        if (txc->tracing) {
          tracepoint(
            bluestore,
            transaction_kv_sync_latency,
            txc->osr->get_sequencer_id(),
            txc->seq,
            kv_committing.size(),
            deferred_done.size(),
            deferred_stable.size(),
            sync_latency);
        }
      }
#endif

      {
        std::unique_lock m{kv_finalize_lock};
        if (kv_committing_to_finalize.empty()) {
          kv_committing_to_finalize.swap(kv_committing);
        } else {
          kv_committing_to_finalize.insert(
              kv_committing_to_finalize.end(),
              kv_committing.begin(),
              kv_committing.end());
          kv_committing.clear();
        }
        if (deferred_stable_to_finalize.empty()) {
          deferred_stable_to_finalize.swap(deferred_stable);
        } else {
          deferred_stable_to_finalize.insert(
              deferred_stable_to_finalize.end(),
              deferred_stable.begin(),
              deferred_stable.end());
          deferred_stable.clear();
        }
        if (!kv_finalize_in_progress) {
          kv_finalize_in_progress = true;
          kv_finalize_cond.notify_one();
        }
      }

      if (new_nid_max) {
        nid_max = new_nid_max;
        dout(10) << __func__ << " nid_max now " << nid_max << dendl;
      }
      if (new_blobid_max) {
        blobid_max = new_blobid_max;
        dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
      }

      {
        auto finish = mono_clock::now();
        ceph::timespan dur_flush = after_flush - start;
        ceph::timespan dur_kv = finish - after_flush;
        ceph::timespan dur = finish - start;
        dout(20) << __func__ << " committed " << committing_size
                 << " cleaned " << deferred_size
                 << " in " << dur
                 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
                 << dendl;
        log_latency("kv_flush",
                    l_bluestore_kv_flush_lat,
                    dur_flush,
                    cct->_conf->bluestore_log_op_age);
        log_latency("kv_commit",
                    l_bluestore_kv_commit_lat,
                    dur_kv,
                    cct->_conf->bluestore_log_op_age);
        log_latency("kv_sync",
                    l_bluestore_kv_sync_lat,
                    dur,
                    cct->_conf->bluestore_log_op_age);
      }

      if (!bluefs_extents_reclaiming.empty()) {
        dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
                << bluefs_extents_reclaiming << std::dec << dendl;
        int r = 0;
        if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
          r = bdev->queue_discard(bluefs_extents_reclaiming);
          if (r == 0) {
            goto clear;
          }
        } else if (cct->_conf->bdev_enable_discard) {
          for (auto p = bluefs_extents_reclaiming.begin();
               p != bluefs_extents_reclaiming.end();
               ++p) {
            bdev->discard(p.get_start(), p.get_len());
          }
        }

        alloc->release(bluefs_extents_reclaiming);
      clear:
        bluefs_extents_reclaiming.clear();
      }

      l.lock();
      // previously deferred "done" are now "stable" by virtue of this
      // commit cycle.
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_sync_started = false;
}
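// One _kv_sync_thread iteration in short: swap the pending queues out
// under kv_lock, flush the device if needed so finished deferred aios
// become stable, bump the nid/blobid preallocation watermarks when within
// half a prealloc window of them, apply any still-unsubmitted txc's, then
// issue a single synchronous commit ("synct") that both syncs the kv log
// and removes deferred keys that are now stable.  All downstream work
// (state machine advance, oncommit callbacks) is handed to
// _kv_finalize_thread so this thread can return to batching immediately.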
void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
        deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_in_progress = false;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
        TransContext *txc = kv_committed.front();
        ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle.should_submit_deferred()) {
          deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
                  (uint64_t)(alloc->get_fragmentation() * 1000));

      log_latency("kv_final",
                  l_bluestore_kv_final_lat,
                  mono_clock::now() - start,
                  cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}
bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
  deferred_lock.lock();
  if (!txc->osr->deferred_pending &&
      !txc->osr->deferred_running) {
    deferred_queue.push_back(*txc->osr);
  }
  if (!txc->osr->deferred_pending) {
    txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
  }
  ++deferred_queue_size;
  txc->osr->deferred_pending->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      txc->osr->deferred_pending->prepare_write(
        cct, wt.seq, e.offset, e.length, p);
    }
  }
  if (deferred_aggressive &&
      !txc->osr->deferred_running) {
    _deferred_submit_unlock(txc->osr.get());
  } else {
    deferred_lock.unlock();
  }
}
void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  std::lock_guard l(deferred_lock);
  vector<OpSequencerRef> osrs;
  osrs.reserve(deferred_queue.size());
  for (auto& osr : deferred_queue) {
    osrs.push_back(&osr);
  }
  for (auto& osr : osrs) {
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
        _deferred_submit_unlock(osr.get());
        deferred_lock.lock();
      } else {
        dout(20) << __func__ << " osr " << osr << " already has running"
                 << dendl;
      }
    } else {
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }

  deferred_last_submitted = ceph_clock_now();
}
void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf()->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_deferred_write_ops);
          logger->inc(l_bluestore_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          ceph_assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}
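// The iomap walk above coalesces adjacent deferred chunks: "pos" tracks
// the expected next device offset, and while each entry starts exactly at
// pos its buffer is claim_append()ed onto the pending bl; a gap (or the
// end of the map) flushes the accumulated run as one aio_write.  E.g.
// (hypothetical) chunks at 0x2000~0x1000 and 0x3000~0x1000 become a
// single 0x2000~0x2000 write, while chunks at 0x2000 and 0x5000 stay two
// separate writes.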
struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    deferred_lock.lock();
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      auto q = deferred_queue.iterator_to(*osr);
      deferred_queue.erase(q);
      deferred_lock.unlock();
    } else {
      deferred_lock.unlock();
      if (deferred_aggressive) {
        dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
        finisher.queue(new C_DeferredTrySubmit(this));
      } else {
        dout(20) << __func__ << " leaving queued, more pending" << dendl;
      }
    }
  }

  {
    uint64_t costs = 0;
    for (auto& i : b->txcs) {
      TransContext *txc = &i;
      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
      txc->state = TransContext::STATE_DEFERRED_CLEANUP;
      costs += txc->cost;
    }
    throttle.release_deferred_throttle(costs);
  }

  {
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);

    // in the normal case, do not bother waking up the kv thread; it will
    // catch us on the next commit anyway.
    if (deferred_aggressive && !kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
}
int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(ch.get(), osr, nullptr);
    txc->deferred_txn = deferred_txn;
    txc->state = TransContext::STATE_KV_DONE;
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}
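// _deferred_replay runs at mount: any deferred-transaction keys that
// survived a crash are decoded and re-injected as txc's starting directly
// in STATE_KV_DONE, so the state machine simply re-queues their deferred
// writes; the keys themselves are removed by the normal cleanup path once
// the replayed ios are stable again.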
// ---------------------------
// transactions

int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection*>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
                                  &on_commit);

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);
  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();

  if (!throttle.try_start_transaction(
        *db,
        *txc,
        tstart)) {
    // ensure we do not block here because of deferred writes
    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
             << dendl;
    ++deferred_aggressive;
    deferred_try_submit();
    {
      // wake up any previously finished deferred events
      std::lock_guard l(kv_lock);
      if (!kv_sync_in_progress) {
        kv_sync_in_progress = true;
        kv_cond.notify_one();
      }
    }
    throttle.finish_start_transaction(*db, *txc, tstart);
    --deferred_aggressive;
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

  log_latency("submit_transact",
              l_bluestore_submit_lat,
              mono_clock::now() - start,
              cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
              l_bluestore_throttle_lat,
              tend - tstart,
              cct->_conf->bluestore_log_op_age);
  return 0;
}
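// queue_transactions is the ObjectStore entry point and the write path in
// miniature: build the txc (prepare), record new onodes/shared blobs into
// txc->t, journal any deferred writes under a deferred key, reconcile the
// freelist, take the throttle (going deferred_aggressive rather than
// blocking when deferred bytes are pinned), then kick _txc_state_proc.
// on_applied/on_applied_sync fire immediately because BlueStore is
// readable as soon as the transaction is prepared, unlike FileStore.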
void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}
void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
                  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
        const coll_t &cid = i.get_cid(op->cid);
        r = _remove_collection(txc, cid, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
        ceph_assert(!c);
        const coll_t &cid = i.get_cid(op->cid);
        r = _create_collection(txc, cid, op->split_bits, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
        uint32_t bits = op->split_bits;
        uint32_t rem = op->split_rem;
        r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
        uint32_t bits = op->split_bits;
        r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
        uint32_t type = op->hint_type;
        bufferlist hint;
        i.decode_bl(hint);
        auto hiter = hint.cbegin();
        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
          uint32_t pg_num;
          uint64_t num_objs;
          decode(pg_num, hiter);
          decode(num_objs, hiter);
          dout(10) << __func__ << " collection hint objects is a no-op, "
                   << " pg_num " << pg_num << " num_objects " << num_objs
                   << dendl;
        } else {
          // Ignore the hint
          dout(10) << __func__ << " unknown collection hint " << type << dendl;
        }
        continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
           << " not handled on operation " << op->op
           << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicity create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
        op->op == Transaction::OP_CREATE ||
        op->op == Transaction::OP_WRITE ||
        op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    std::unique_lock l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
               << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_CREATE:
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        uint32_t fadvise_flags = i.get_fadvise_flags();
        bufferlist bl;
        i.decode_bl(bl);
        r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
        // deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
        uint64_t off = op->off;
        r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
        r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
        string name = i.decode_string();
        bufferptr bp;
        i.decode_bp(bp);
        r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
        map<string, bufferptr> aset;
        i.decode_attrset(aset);
        r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
        string name = i.decode_string();
        r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
        r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        uint64_t srcoff = op->off;
        uint64_t len = op->len;
        uint64_t dstoff = op->dest_off;
        r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
        ceph_assert(op->cid == op->dest_cid);
        const ghobject_t& noid = i.get_oid(op->dest_oid);
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          no = c->get_onode(noid, false);
        }
        r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
        r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
        bufferlist aset_bl;
        i.decode_attrset_bl(&aset_bl);
        r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
        bufferlist keys_bl;
        i.decode_keyset_bl(&keys_bl);
        r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
        string first, last;
        first = i.decode_string();
        last = i.decode_string();
        r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
        bufferlist bl;
        i.decode_bl(bl);
        r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
        r = _set_alloc_hint(txc, c, o,
                            op->expected_object_size,
                            op->expected_write_size,
                            op->alloc_hint_flags);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
                            op->op == Transaction::OP_CLONE ||
                            op->op == Transaction::OP_CLONERANGE2 ||
                            op->op == Transaction::OP_COLL_ADD ||
                            op->op == Transaction::OP_SETATTR ||
                            op->op == Transaction::OP_SETATTRS ||
                            op->op == Transaction::OP_RMATTR ||
                            op->op == Transaction::OP_OMAP_SETKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
                            op->op == Transaction::OP_OMAP_SETHEADER))
        // -ENOENT is usually okay
        ok = true;
      if (r == -ENODATA)
        ok = true;

      if (!ok) {
        const char *msg = "unexpected error code";

        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
                             op->op == Transaction::OP_CLONE ||
                             op->op == Transaction::OP_CLONERANGE2))
          msg = "ENOENT on clone suggests osd bug";

        if (r == -ENOSPC)
          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
          // by partially applying transactions.
          msg = "ENOSPC from bluestore, misconfigured cluster";

        if (r == -ENOTEMPTY) {
          msg = "ENOTEMPTY suggests garbage data in osd data dir";
        }

        derr << __func__ << " error " << cpp_strerror(r)
             << " not handled on operation " << op->op
             << " (op " << pos << ", counting from 0)"
             << dendl;
        derr << msg << dendl;
        _dump_transaction<0>(cct, t);
        ceph_abort_msg("unexpected error");
      }
    }
  }
}
// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
           << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->begin().copy(front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
           << back_pad << " on front/back, now 0x" << *offset << "~"
           << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}
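// Illustrative padding case (values hypothetical): writing 0x100 bytes at
// offset 0x1234 with chunk_size 0x1000 gives front_pad = 0x234, so the
// buffer is rebuilt to start at 0x1000; the new end 0x1334 then yields
// back_copy = 0x334 and back_pad = 0xccc, producing one zero-padded
// 0x1000~0x1000 chunk.  pad_count = 0x234 + 0xccc = 0xf00 is what feeds
// l_bluestore_write_pad_bytes.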
13344 void BlueStore::_do_write_small(
13348 uint64_t offset
, uint64_t length
,
13349 bufferlist::iterator
& blp
,
13350 WriteContext
*wctx
)
13352 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
13353 << std::dec
<< dendl
;
13354 ceph_assert(length
< min_alloc_size
);
13355 uint64_t end_offs
= offset
+ length
;
13357 logger
->inc(l_bluestore_write_small
);
13358 logger
->inc(l_bluestore_write_small_bytes
, length
);
13361 blp
.copy(length
, bl
);
13363 auto max_bsize
= std::max(wctx
->target_blob_size
, min_alloc_size
);
13364 auto min_off
= offset
>= max_bsize
? offset
- max_bsize
: 0;
13365 uint32_t alloc_len
= min_alloc_size
;
13366 auto offset0
= p2align
<uint64_t>(offset
, alloc_len
);
13370 // search suitable extent in both forward and reverse direction in
13371 // [offset - target_max_blob_size, offset + target_max_blob_size] range
13372 // then check if blob can be reused via can_reuse_blob func or apply
13373 // direct/deferred write (the latter for extents including or higher
13374 // than 'offset' only).
13375 o
->extent_map
.fault_range(db
, min_off
, offset
+ max_bsize
- min_off
);
13377 // Look for an existing mutable blob we can use.
13378 auto begin
= o
->extent_map
.extent_map
.begin();
13379 auto end
= o
->extent_map
.extent_map
.end();
13380 auto ep
= o
->extent_map
.seek_lextent(offset
);
13383 if (ep
->blob_end() <= offset
) {
13388 if (prev_ep
!= begin
) {
13391 prev_ep
= end
; // to avoid this extent check as it's a duplicate
13394 boost::container::flat_set
<const bluestore_blob_t
*> inspected_blobs
;
13395 // We don't want to have more blobs than min alloc units fit
13396 // into 2 max blobs
13397 size_t blob_threshold
= max_blob_size
/ min_alloc_size
* 2 + 1;
13398 bool above_blob_threshold
= false;
13400 inspected_blobs
.reserve(blob_threshold
);
13402 uint64_t max_off
= 0;
13403 auto start_ep
= ep
;
13404 auto end_ep
= ep
; // exclusively
13406 any_change
= false;
    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      max_off = ep->logical_end();
      auto bstart = ep->blob_start();

      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
        dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
        dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
                 ep->blob_offset % min_alloc_size) {
        dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
        uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
        // can we pad our head/tail out with zeros?
        uint64_t head_pad, tail_pad;
        head_pad = p2phase(offset, chunk_size);
        tail_pad = p2nphase(end_offs, chunk_size);
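        // p2phase(x, n) is x % n and p2nphase(x, n) is the distance from x up
        // to the next multiple of n (0 if already aligned), for power-of-two n.
        // Illustrative example: chunk_size 0x1000, offset 0x1234 and
        // end_offs 0x1334 give head_pad 0x234 and tail_pad 0xccc.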
        if (head_pad || tail_pad) {
          o->extent_map.fault_range(db, offset - head_pad,
                                    end_offs - offset + head_pad + tail_pad);
        }
        if (head_pad &&
            o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
          head_pad = 0;
        }
        if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
          tail_pad = 0;
        }

        uint64_t b_off = offset - head_pad - bstart;
        uint64_t b_len = length + head_pad + tail_pad;
        // direct write into unused blocks of an existing mutable blob?
        if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
            b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b->get_blob().is_unused(b_off, b_len) &&
            b->get_blob().is_allocated(b_off, b_len)) {
          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << "  write to unused 0x" << std::hex
                   << b_off << "~" << b_len
                   << " pad 0x" << head_pad << " + 0x" << tail_pad
                   << std::dec << " of mutable " << *b << dendl;
          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            if (b_len <= prefer_deferred_size) {
              dout(20) << __func__ << " deferring small 0x" << std::hex
                       << b_len << std::dec << " unused write via deferred"
                       << dendl;
              bluestore_deferred_op_t *op = _get_deferred_op(txc);
              op->op = bluestore_deferred_op_t::OP_WRITE;
              b->get_blob().map(
                b_off, b_len,
                [&](uint64_t offset, uint64_t length) {
                  op->extents.emplace_back(bluestore_pextent_t(offset, length));
                  return 0;
                });
              op->data = bl;
            } else {
              b->get_blob().map_bl(
                b_off, bl,
                [&](uint64_t offset, bufferlist& t) {
                  bdev->aio_write(offset, t,
                                  &txc->ioc, wctx->buffered);
                });
            }
          }
          b->dirty_blob().calc_csum(b_off, bl);
          dout(20) << __func__ << "  lex old " << *ep << dendl;
          Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad,
                                                 length, b,
                                                 &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << "  lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
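        // (The path above is the cheapest one: the target range was already
        // allocated but never written, so it needs no read-modify-write and
        // no new allocation.)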
        // read some data to fill out the chunk?
        uint64_t head_read = p2phase(b_off, chunk_size);
        uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
        if ((head_read || tail_read) &&
            (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
            head_read + tail_read < min_alloc_size) {
          b_off -= head_read;
          b_len += head_read + tail_read;
        } else {
          head_read = tail_read = 0;
        }
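        // Illustrative example (hypothetical numbers): chunk_size 0x1000,
        // b_off 0x1200, b_len 0xe00 -> head_read 0x200 and tail_read 0x0;
        // the write is widened to b_off 0x1000, b_len 0x1000 so the
        // read-modify-write below stays chunk-aligned.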
        // chunk-aligned deferred overwrite?
        if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b_off % chunk_size == 0 &&
            b_len % chunk_size == 0 &&
            b->get_blob().is_allocated(b_off, b_len)) {

          _apply_padding(head_pad, tail_pad, bl);

          dout(20) << __func__ << "  reading head 0x" << std::hex << head_read
                   << " and tail 0x" << tail_read << std::dec << dendl;
          if (head_read) {
            bufferlist head_bl;
            int r = _do_read(c.get(), o, offset - head_pad - head_read,
                             head_read, head_bl, 0);
            ceph_assert(r >= 0 && r <= (int)head_read);
            size_t zlen = head_read - r;
            if (zlen) {
              head_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            head_bl.claim_append(bl);
            bl.swap(head_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          if (tail_read) {
            bufferlist tail_bl;
            int r = _do_read(c.get(), o, offset + length + tail_pad,
                             tail_read, tail_bl, 0);
            ceph_assert(r >= 0 && r <= (int)tail_read);
            size_t zlen = tail_read - r;
            if (zlen) {
              tail_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            bl.claim_append(tail_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          logger->inc(l_bluestore_write_small_pre_read);

          _buffer_cache_write(txc, b, b_off, bl,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (b->get_blob().csum_type) {
            b->dirty_blob().calc_csum(b_off, bl);
          }

          if (!g_conf()->bluestore_debug_omit_block_device_write) {
            bluestore_deferred_op_t *op = _get_deferred_op(txc);
            op->op = bluestore_deferred_op_t::OP_WRITE;
            int r = b->get_blob().map(
              b_off, b_len,
              [&](uint64_t offset, uint64_t length) {
                op->extents.emplace_back(bluestore_pextent_t(offset, length));
                return 0;
              });
            ceph_assert(r == 0);
            op->data.claim(bl);
            dout(20) << __func__ << "  deferred write 0x" << std::hex << b_off
                     << "~" << b_len << std::dec << " of mutable " << *b
                     << " at " << op->extents << dendl;
          }

          Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart,
                                                 length, b, &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << "  lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_deferred);
          return;
        }
        // try to reuse blob if we can
        if (b->can_reuse_blob(min_alloc_size,
                              max_bsize,
                              offset0 - bstart,
                              &alloc_len)) {
          ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                    // fit into reused blob
          // Need to check for pending writes desiring to
          // reuse the same pextent. The rationale is that during GC two chunks
          // from garbage blobs (compressed?) can share logical space within
          // the same AU. That in turn might be caused by unaligned len in
          // clone_range2. Hence the second write would fail in an attempt to
          // reuse the blob at do_alloc_write().
          if (!wctx->has_conflict(b,
                                  offset0,
                                  offset0 + alloc_len,
                                  min_alloc_size)) {

            // we can't reuse pad_head/pad_tail since they might be truncated
            // due to existent extents
            uint64_t b_off = offset - bstart;
            uint64_t b_off0 = b_off;
            _pad_zeros(&bl, &b_off0, chunk_size);

            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off0 << "~" << bl.length() << ")"
                     << " (0x" << b_off << "~" << length << ")"
                     << std::dec << dendl;

            o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
            wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                        false, false);
            logger->inc(l_bluestore_write_small_unused);
            return;
          }
        }
      }
      ++ep;
      end_ep = ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)
    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      if (!above_blob_threshold) {
        inspected_blobs.insert(&b->get_blob());
        above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      start_ep = prev_ep;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
                            max_bsize,
                            offset0 - bstart,
                            &alloc_len)) {
        ceph_assert(alloc_len == min_alloc_size); // expecting data always
                                                  // fit into reused blob
        // Same pending-write conflict check as on the forward path above:
        // during GC two chunks from garbage blobs can share logical space
        // within the same AU, so a second write would fail when trying to
        // reuse the blob at do_alloc_write().
        if (!wctx->has_conflict(b,
                                offset0,
                                offset0 + alloc_len,
                                min_alloc_size)) {

          uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
          uint64_t b_off = offset - bstart;
          uint64_t b_off0 = b_off;
          _pad_zeros(&bl, &b_off0, chunk_size);

          dout(20) << __func__ << " reuse blob " << *b << std::hex
                   << " (0x" << b_off0 << "~" << bl.length() << ")"
                   << " (0x" << b_off << "~" << length << ")"
                   << std::dec << dendl;

          o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
          wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                      false, false);
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
      }
      if (prev_ep != begin) {
        --prev_ep;
        any_change = true;
      } else {
        prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);
  if (above_blob_threshold) {
    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
             << " " << std::hex << min_off << "~" << max_off << std::dec
             << dendl;
    ceph_assert(start_ep != end_ep);
    for (auto ep = start_ep; ep != end_ep; ++ep) {
      dout(20) << __func__ << " inserting for GC "
               << std::hex << ep->logical_offset << "~" << ep->length
               << std::dec << dendl;

      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
    }
    // insert newly written extent to GC
    wctx->extents_to_gc.union_insert(offset, length);
    dout(20) << __func__ << " inserting (last) for GC "
             << std::hex << offset << "~" << length
             << std::dec << dendl;
  }
  // new blob.
  BlobRef b = c->new_blob();
  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
              min_alloc_size != block_size, // use the 'unused' bitmap only
                                            // when the alloc granularity
                                            // differs from the disk one
              true);
  return;
}
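
// Summary of the small-write strategies tried above, cheapest first:
//  1. direct/deferred write into already-allocated-but-unused chunks of a
//     mutable blob (no read, no allocation);
//  2. chunk-aligned read-modify-write into allocated space, submitted as a
//     deferred write;
//  3. reuse of a nearby mutable blob via can_reuse_blob();
//  4. fall through to a brand new blob, allocated later in _do_alloc_write().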
void BlueStore::_do_write_big(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " target_blob_size 0x" << wctx->target_blob_size << std::dec
           << " compress " << (int)wctx->compress
           << dendl;
  logger->inc(l_bluestore_write_big);
  logger->inc(l_bluestore_write_big_bytes, length);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  while (length > 0) {
    bool new_blob = false;
    uint32_t l = std::min(max_bsize, length);
    BlobRef b;
    uint32_t b_off = 0;

    //attempting to reuse existing blob
    if (!wctx->compress) {
      // look for an existing mutable blob we can reuse
      auto begin = o->extent_map.extent_map.begin();
      auto end = o->extent_map.extent_map.end();
      auto ep = o->extent_map.seek_lextent(offset);
      auto prev_ep = ep;
      if (prev_ep != begin) {
        --prev_ep;
      } else {
        prev_ep = end; // to avoid this extent check as it's a duplicate
      }
      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
      // search suitable extent in both forward and reverse direction in
      // [offset - target_max_blob_size, offset + target_max_blob_size] range
      // then check if blob can be reused via can_reuse_blob func.
      bool any_change;
      do {
        any_change = false;
        if (ep != end && ep->logical_offset < offset + max_bsize) {
          if (offset >= ep->blob_start() &&
              ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                       offset - ep->blob_start(),
                                       &l)) {
            b = ep->blob;
            b_off = offset - ep->blob_start();
            prev_ep = end; // to avoid check below
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
          } else {
            ++ep;
            any_change = true;
          }
        }

        if (prev_ep != end && prev_ep->logical_offset >= min_off) {
          if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
                                            offset - prev_ep->blob_start(),
                                            &l)) {
            b = prev_ep->blob;
            b_off = offset - prev_ep->blob_start();
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
          } else if (prev_ep != begin) {
            --prev_ep;
            any_change = true;
          } else {
            prev_ep = end; // to avoid useless first extent re-check
          }
        }
      } while (b == nullptr && any_change);
    }
    if (b == nullptr) {
      b = c->new_blob();
      b_off = 0;
      new_blob = true;
    }

    bufferlist t;
    blp.copy(l, t);
    wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
    offset += l;
    length -= l;
    logger->inc(l_bluestore_write_big_blobs);
  }
}
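
// Note that, like _do_write_small(), this only stages data and extent
// updates in the WriteContext; the actual space allocation and device I/O
// happen later in _do_alloc_write().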
int BlueStore::_do_alloc_write(
  TransContext *txc,
  CollectionRef coll,
  OnodeRef o,
  WriteContext *wctx)
{
  dout(20) << __func__ << " txc " << txc
           << " " << wctx->writes.size() << " blobs"
           << dendl;
  if (wctx->writes.empty()) {
    return 0;
  }

  CompressorRef c;
  double crr = 0;
  if (wctx->compress) {
    c = select_option(
      "compression_algorithm",
      compressor,
      [&]() {
        string val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
          CompressorRef cp = compressor;
          if (!cp || cp->get_type_name() != val) {
            cp = Compressor::create(cct, val);
            if (!cp) {
              if (_set_compression_alert(false, val.c_str())) {
                derr << __func__ << " unable to initialize " << val.c_str()
                     << " compressor" << dendl;
              }
            }
          }
          return boost::optional<CompressorRef>(cp);
        }
        return boost::optional<CompressorRef>();
      }
    );

    crr = select_option(
      "compression_required_ratio",
      cct->_conf->bluestore_compression_required_ratio,
      [&]() {
        double val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
          return boost::optional<double>(val);
        }
        return boost::optional<double>();
      }
    );
  }

  // checksum
  int64_t csum = csum_type.load();
  csum = select_option(
    "csum_type",
    csum,
    [&]() {
      int64_t val;
      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
        return boost::optional<int64_t>(val);
      }
      return boost::optional<int64_t>();
    }
  );
13861 auto max_bsize
= std::max(wctx
->target_blob_size
, min_alloc_size
);
13862 for (auto& wi
: wctx
->writes
) {
13863 if (c
&& wi
.blob_length
> min_alloc_size
) {
13864 auto start
= mono_clock::now();
13867 ceph_assert(wi
.b_off
== 0);
13868 ceph_assert(wi
.blob_length
== wi
.bl
.length());
13870 // FIXME: memory alignment here is bad
13872 int r
= c
->compress(wi
.bl
, t
);
13873 uint64_t want_len_raw
= wi
.blob_length
* crr
;
13874 uint64_t want_len
= p2roundup(want_len_raw
, min_alloc_size
);
13875 bool rejected
= false;
13876 uint64_t compressed_len
= t
.length();
13877 // do an approximate (fast) estimation for resulting blob size
13878 // that doesn't take header overhead into account
13879 uint64_t result_len
= p2roundup(compressed_len
, min_alloc_size
);
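      // Illustrative example (hypothetical numbers): blob_length 0x10000,
      // crr 0.875 and min_alloc_size 0x1000 give want_len 0xe000.  A
      // compressed payload of 0x9c00 rounds up to result_len 0xa000, which is
      // <= want_len, so the compressed copy is kept; 0xe400 would round to
      // 0xf000 and be rejected below.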
      if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
        bluestore_compression_header_t chdr;
        chdr.type = c->get_type();
        chdr.length = t.length();
        encode(chdr, wi.compressed_bl);
        wi.compressed_bl.claim_append(t);

        compressed_len = wi.compressed_bl.length();
        result_len = p2roundup(compressed_len, min_alloc_size);
        if (result_len <= want_len && result_len < wi.blob_length) {
          // Cool. We compressed at least as much as we were hoping to.
          // pad out to min_alloc_size
          wi.compressed_bl.append_zero(result_len - compressed_len);
          wi.compressed_len = compressed_len;
          wi.compressed = true;
          logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
          dout(20) << __func__ << std::hex << "  compressed 0x"
                   << wi.blob_length
                   << " -> 0x" << compressed_len << " => 0x" << result_len
                   << " with " << c->get_type()
                   << std::dec << dendl;
          txc->statfs_delta.compressed() += compressed_len;
          txc->statfs_delta.compressed_original() += wi.blob_length;
          txc->statfs_delta.compressed_allocated() += result_len;
          logger->inc(l_bluestore_compress_success_count);
          need += result_len;
        } else {
          rejected = true;
        }
      } else if (r != 0) {
        dout(5) << __func__ << std::hex << "  0x" << wi.blob_length
                << " bytes compressed using " << c->get_type_name()
                << " failed with errcode = " << r
                << ", leaving uncompressed"
                << std::dec << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
        need += wi.blob_length;
      } else {
        rejected = true;
      }

      if (rejected) {
        dout(20) << __func__ << std::hex << "  0x" << wi.blob_length
                 << " compressed to 0x" << compressed_len
                 << " -> 0x" << result_len
                 << " with " << c->get_type()
                 << ", which is more than required 0x" << want_len_raw
                 << " -> 0x" << want_len
                 << ", leaving uncompressed"
                 << std::dec << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
        need += wi.blob_length;
      }
      log_latency("compress@_do_alloc_write",
                  l_bluestore_compress_lat,
                  mono_clock::now() - start,
                  cct->_conf->bluestore_log_op_age);
    } else {
      need += wi.blob_length;
    }
  }
  PExtentVector prealloc;
  prealloc.reserve(2 * wctx->writes.size());
  int64_t prealloc_left = 0;
  prealloc_left = alloc->allocate(
    need, min_alloc_size, need,
    0, &prealloc);
  if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
    derr << __func__ << " failed to allocate 0x" << std::hex << need
         << " allocated 0x" << (prealloc_left < 0 ? 0 : prealloc_left)
         << " min_alloc_size 0x" << min_alloc_size
         << " available 0x" << alloc->get_free()
         << std::dec << dendl;
    if (prealloc.size()) {
      alloc->release(prealloc);
    }
    return -ENOSPC;
  }
  _collect_allocation_stats(need, min_alloc_size, prealloc.size());

  dout(20) << __func__ << " prealloc " << prealloc << dendl;
  auto prealloc_pos = prealloc.begin();
  for (auto& wi : wctx->writes) {
    BlobRef b = wi.b;
    bluestore_blob_t& dblob = b->dirty_blob();
    uint64_t b_off = wi.b_off;
    bufferlist *l = &wi.bl;
    uint64_t final_length = wi.blob_length;
    uint64_t csum_length = wi.blob_length;
    if (wi.compressed) {
      final_length = wi.compressed_bl.length();
      csum_length = final_length;
      unsigned csum_order = ctz(csum_length);
      l = &wi.compressed_bl;
      dblob.set_compressed(wi.blob_length, wi.compressed_len);
      if (csum != Checksummer::CSUM_NONE) {
        dout(20) << __func__
                 << " initialize csum setting for compressed blob " << *b
                 << " csum_type " << Checksummer::get_csum_type_string(csum)
                 << " csum_order " << csum_order
                 << " csum_length 0x" << std::hex << csum_length
                 << " blob_length 0x" << wi.blob_length
                 << " compressed_length 0x" << wi.compressed_len << std::dec
                 << dendl;
        dblob.init_csum(csum, csum_order, csum_length);
      }
    } else if (wi.new_blob) {
      unsigned csum_order;
      // initialize newly created blob only
      ceph_assert(dblob.is_mutable());
      if (l->length() != wi.blob_length) {
        // hrm, maybe we could do better here, but let's not bother.
        dout(20) << __func__ << " forcing csum_order to block_size_order "
                 << block_size_order << dendl;
        csum_order = block_size_order;
      } else {
        csum_order = std::min(wctx->csum_order, ctz(l->length()));
      }
      // try to align blob with max_blob_size to improve
      // its reuse ratio, e.g. in case of reverse write
      uint32_t suggested_boff =
        (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
      if ((suggested_boff % (1 << csum_order)) == 0 &&
          suggested_boff + final_length <= max_bsize &&
          suggested_boff > b_off) {
        dout(20) << __func__ << " forcing blob_offset to 0x"
                 << std::hex << suggested_boff << std::dec << dendl;
        ceph_assert(suggested_boff >= b_off);
        csum_length += suggested_boff - b_off;
        b_off = suggested_boff;
      }
      if (csum != Checksummer::CSUM_NONE) {
        dout(20) << __func__ << " initialize csum setting for new blob " << *b
                 << " csum_type " << Checksummer::get_csum_type_string(csum)
                 << " csum_order " << csum_order
                 << " csum_length 0x" << std::hex << csum_length << std::dec
                 << dendl;
        dblob.init_csum(csum, csum_order, csum_length);
      }
    }

    PExtentVector extents;
    int64_t left = final_length;
    while (left > 0) {
      ceph_assert(prealloc_left > 0);
      if (prealloc_pos->length <= left) {
        prealloc_left -= prealloc_pos->length;
        left -= prealloc_pos->length;
        txc->statfs_delta.allocated() += prealloc_pos->length;
        extents.push_back(*prealloc_pos);
        ++prealloc_pos;
      } else {
        extents.emplace_back(prealloc_pos->offset, left);
        prealloc_pos->offset += left;
        prealloc_pos->length -= left;
        prealloc_left -= left;
        txc->statfs_delta.allocated() += left;
        left = 0;
        break;
      }
    }
    for (auto& p : extents) {
      txc->allocated.insert(p.offset, p.length);
    }
    dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);

    dout(20) << __func__ << " blob " << *b << dendl;
    if (dblob.has_csum()) {
      dblob.calc_csum(b_off, *l);
    }

    if (wi.mark_unused) {
      ceph_assert(!dblob.is_compressed());
      auto b_end = b_off + wi.bl.length();
      if (b_off) {
        dblob.add_unused(0, b_off);
      }
      uint64_t llen = dblob.get_logical_length();
      if (b_end < llen) {
        dblob.add_unused(b_end, llen - b_end);
      }
    }

    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
                                           b_off + (wi.b_off0 - wi.b_off),
                                           wi.length0,
                                           wi.b,
                                           nullptr);
    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
    txc->statfs_delta.stored() += le->length;
    dout(20) << __func__ << "  lex " << *le << dendl;
    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
                        wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

    // queue io
    if (!g_conf()->bluestore_debug_omit_block_device_write) {
      if (l->length() <= prefer_deferred_size.load()) {
        dout(20) << __func__ << " deferring small 0x" << std::hex
                 << l->length() << std::dec << " write via deferred" << dendl;
        bluestore_deferred_op_t *op = _get_deferred_op(txc);
        op->op = bluestore_deferred_op_t::OP_WRITE;
        int r = b->get_blob().map(
          b_off, l->length(),
          [&](uint64_t offset, uint64_t length) {
            op->extents.emplace_back(bluestore_pextent_t(offset, length));
            return 0;
          });
        ceph_assert(r == 0);
        op->data = *l;
        logger->inc(l_bluestore_write_small_deferred);
      } else {
        b->get_blob().map_bl(
          b_off, *l,
          [&](uint64_t offset, bufferlist& t) {
            bdev->aio_write(offset, t, &txc->ioc, false);
          });
        logger->inc(l_bluestore_write_small_new);
      }
    }
  }
  ceph_assert(prealloc_pos == prealloc.end());
  ceph_assert(prealloc_left == 0);
  return 0;
}
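
// Writes whose payload is at most prefer_deferred_size are journaled in the
// KV store as OP_WRITE deferred ops and applied to the device later; larger
// payloads go straight to the device via aio_write().  The threshold is read
// with .load() because it is a runtime-changeable config value.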
void BlueStore::_wctx_finish(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  WriteContext *wctx,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  auto oep = wctx->old_extents.begin();
  while (oep != wctx->old_extents.end()) {
    auto &lo = *oep;
    oep = wctx->old_extents.erase(oep);
    dout(20) << __func__ << " lex_old " << lo.e << dendl;
    BlobRef b = lo.e.blob;
    const bluestore_blob_t& blob = b->get_blob();
    if (blob.is_compressed()) {
      if (lo.blob_empty) {
        txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
      }
      txc->statfs_delta.compressed_original() -= lo.e.length;
    }
    auto& r = lo.r;
    txc->statfs_delta.stored() -= lo.e.length;
    if (!r.empty()) {
      dout(20) << __func__ << "  blob release " << r << dendl;
      if (blob.is_shared()) {
        PExtentVector final;
        c->load_shared_blob(b->shared_blob);
        bool unshare = false;
        bool* unshare_ptr =
          !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
        for (auto e : r) {
          b->shared_blob->put_ref(
            e.offset, e.length, &final,
            unshare_ptr);
        }
        if (unshare) {
          ceph_assert(maybe_unshared_blobs);
          maybe_unshared_blobs->insert(b->shared_blob.get());
        }
        dout(20) << __func__ << "  shared_blob release " << final
                 << " from " << *b->shared_blob << dendl;
        txc->write_shared_blob(b->shared_blob);
        r.clear();
        r.swap(final);
      }
    }
    // we can't invalidate our logical extents as we drop them because
    // other lextents (either in our onode or others) may still
    // reference them.  but we can throw out anything that is no
    // longer allocated.  Note that this will leave behind edge bits
    // that are no longer referenced but not deallocated (until they
    // age out of the cache naturally).
    b->discard_unallocated(c.get());
    for (auto e : r) {
      dout(20) << __func__ << "  release " << e << dendl;
      txc->released.insert(e.offset, e.length);
      txc->statfs_delta.allocated() -= e.length;
      if (blob.is_compressed()) {
        txc->statfs_delta.compressed_allocated() -= e.length;
      }
    }

    if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
      dout(20) << __func__ << " spanning_blob_map removing empty " << *b
               << dendl;
      o->extent_map.spanning_blob_map.erase(b->id);
    }
  }
}
void BlueStore::_do_write_data(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  WriteContext *wctx)
{
  uint64_t end = offset + length;
  bufferlist::iterator p = bl.begin();

  if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
      (length != min_alloc_size)) {
    // we fall within the same block
    _do_write_small(txc, c, o, offset, length, p, wctx);
  } else {
    uint64_t head_offset, head_length;
    uint64_t middle_offset, middle_length;
    uint64_t tail_offset, tail_length;

    head_offset = offset;
    head_length = p2nphase(offset, min_alloc_size);

    tail_offset = p2align(end, min_alloc_size);
    tail_length = p2phase(end, min_alloc_size);

    middle_offset = head_offset + head_length;
    middle_length = length - head_length - tail_length;

    if (head_length) {
      _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
    }

    if (middle_length) {
      _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
    }

    if (tail_length) {
      _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
    }
  }
}
void BlueStore::_choose_write_options(
  CollectionRef& c,
  OnodeRef o,
  uint32_t fadvise_flags,
  WriteContext *wctx)
{
  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered write" << dendl;
    wctx->buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_write &&
             (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                               CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered write" << dendl;
    wctx->buffered = true;
  }

  // apply basic csum block size
  wctx->csum_order = block_size_order;

  // compression parameters
  unsigned alloc_hints = o->onode.alloc_hint_flags;
  auto cm = select_option(
    "compression_mode",
    comp_mode.load(),
    [&]() {
      string val;
      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
        return boost::optional<Compressor::CompressionMode>(
          Compressor::get_comp_mode_type(val));
      }
      return boost::optional<Compressor::CompressionMode>();
    }
  );

  wctx->compress = (cm != Compressor::COMP_NONE) &&
    ((cm == Compressor::COMP_FORCE) ||
     (cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));

  if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
      (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
                      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {

    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;

    if (o->onode.expected_write_size) {
      wctx->csum_order = std::max(min_alloc_size_order,
                                  (uint8_t)ctz(o->onode.expected_write_size));
    } else {
      wctx->csum_order = min_alloc_size_order;
    }

    if (wctx->compress) {
      wctx->target_blob_size = select_option(
        "compression_max_blob_size",
        comp_max_blob_size.load(),
        [&]() {
          int64_t val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
            return boost::optional<uint64_t>((uint64_t)val);
          }
          return boost::optional<uint64_t>();
        }
      );
    }
  } else {
    if (wctx->compress) {
      wctx->target_blob_size = select_option(
        "compression_min_blob_size",
        comp_min_blob_size.load(),
        [&]() {
          int64_t val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
            return boost::optional<uint64_t>((uint64_t)val);
          }
          return boost::optional<uint64_t>();
        }
      );
    }
  }

  uint64_t max_bsize = max_blob_size.load();
  if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
    wctx->target_blob_size = max_bsize;
  }

  // set the min blob size floor at 2x the min_alloc_size, or else we
  // won't be able to allocate a smaller extent for the compressed
  // data.
  if (wctx->compress &&
      wctx->target_blob_size < min_alloc_size * 2) {
    wctx->target_blob_size = min_alloc_size * 2;
  }

  dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
           << " target_blob_size 0x" << std::hex << wctx->target_blob_size
           << " compress=" << (int)wctx->compress
           << " buffered=" << (int)wctx->buffered
           << std::dec << dendl;
}
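
// In short: WILLNEED (or the bluestore_default_buffered_write default) turns
// on buffered writes; the compression mode (none/passive/aggressive/force) is
// combined with the client's (IN)COMPRESSIBLE alloc hints; and a
// sequential-read, append-only/immutable hint raises the preferred blob and
// csum sizes.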
int BlueStore::_do_gc(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  const WriteContext& wctx,
  uint64_t *dirty_start,
  uint64_t *dirty_end)
{
  bool dirty_range_updated = false;
  WriteContext wctx_gc;
  wctx_gc.fork(wctx); // make a clone for garbage collection

  auto & extents_to_collect = wctx.extents_to_gc;
  for (auto it = extents_to_collect.begin();
       it != extents_to_collect.end();
       ++it) {
    bufferlist bl;
    auto offset = (*it).first;
    auto length = (*it).second;
    dout(20) << __func__ << " processing " << std::hex
             << offset << "~" << length << std::dec
             << dendl;
    int r = _do_read(c.get(), o, offset, length, bl, 0);
    ceph_assert(r == (int)length);

    _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
    logger->inc(l_bluestore_gc_merged, length);

    if (*dirty_start > offset) {
      *dirty_start = offset;
      dirty_range_updated = true;
    }

    if (*dirty_end < offset + length) {
      *dirty_end = offset + length;
      dirty_range_updated = true;
    }
  }
  if (dirty_range_updated) {
    o->extent_map.fault_range(db, *dirty_start, *dirty_end);
  }

  dout(30) << __func__ << " alloc write" << dendl;
  int r = _do_alloc_write(txc, c, o, &wctx_gc);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    return r;
  }

  _wctx_finish(txc, c, o, &wctx_gc);
  return 0;
}
int BlueStore::_do_write(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  uint32_t fadvise_flags)
{
  int r = 0;

  dout(20) << __func__
           << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length
           << " - have 0x" << o->onode.size
           << " (" << std::dec << o->onode.size << ")"
           << " bytes"
           << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
           << dendl;
  _dump_onode<30>(cct, *o);

  if (length == 0) {
    return 0;
  }

  uint64_t end = offset + length;

  GarbageCollector gc(c->store->cct);
  int64_t benefit = 0;
  auto dirty_start = offset;
  auto dirty_end = end;

  WriteContext wctx;
  _choose_write_options(c, o, fadvise_flags, &wctx);
  o->extent_map.fault_range(db, offset, length);
  _do_write_data(txc, c, o, offset, length, bl, &wctx);
  r = _do_alloc_write(txc, c, o, &wctx);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    goto out;
  }

  if (wctx.extents_to_gc.empty() ||
      wctx.extents_to_gc.range_start() > offset ||
      wctx.extents_to_gc.range_end() < offset + length) {
    benefit = gc.estimate(offset,
                          length,
                          o->extent_map,
                          wctx.old_extents,
                          min_alloc_size);
  }

  // NB: _wctx_finish() will empty old_extents
  // so we must do gc estimation before that
  _wctx_finish(txc, c, o, &wctx);
  if (end > o->onode.size) {
    dout(20) << __func__ << " extending size to 0x" << std::hex << end
             << std::dec << dendl;
    o->onode.size = end;
  }

  if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
    wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
    dout(20) << __func__
             << " perform garbage collection for compressed extents, "
             << "expected benefit = " << benefit << " AUs" << dendl;
  }
  if (!wctx.extents_to_gc.empty()) {
    dout(20) << __func__ << " perform garbage collection" << dendl;

    r = _do_gc(txc, c, o,
               wctx,
               &dirty_start, &dirty_end);
    if (r < 0) {
      derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
           << dendl;
      goto out;
    }
    dout(20) << __func__ << " gc range is " << std::hex << dirty_start
             << "~" << dirty_end - dirty_start << std::dec << dendl;
  }
  o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
  o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);

  r = 0;

 out:
  return r;
}
int BlueStore::_write(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& o,
                      uint64_t offset, size_t length,
                      bufferlist& bl,
                      uint32_t fadvise_flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_zero(TransContext *txc,
                     CollectionRef& c,
                     OnodeRef& o,
                     uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_zero(txc, c, o, offset, length);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_do_zero(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;

  _dump_onode<30>(cct, *o);

  WriteContext wctx;
  o->extent_map.fault_range(db, offset, length);
  o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
  o->extent_map.dirty_range(offset, length);
  _wctx_finish(txc, c, o, &wctx);

  if (length > 0 && offset + length > o->onode.size) {
    o->onode.size = offset + length;
    dout(20) << __func__ << " extending size to " << offset + length
             << dendl;
  }
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
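
// Note that zeroing never writes zeros to the device: punch_hole() just drops
// the logical extents over the range (space is released via _wctx_finish) and
// subsequent reads of the hole return zeros; only the onode metadata changes.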
void BlueStore::_do_truncate(
  TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec << dendl;

  _dump_onode<30>(cct, *o);

  if (offset == o->onode.size)
    return;

  if (offset < o->onode.size) {
    WriteContext wctx;
    uint64_t length = o->onode.size - offset;
    o->extent_map.fault_range(db, offset, length);
    o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
    o->extent_map.dirty_range(offset, length);
    _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);

    // if we have shards past EOF, ask for a reshard
    if (!o->onode.extent_map_shards.empty() &&
        o->onode.extent_map_shards.back().offset >= offset) {
      dout(10) << __func__ << "  request reshard past EOF" << dendl;
      if (offset) {
        o->extent_map.request_reshard(offset - 1, offset + length);
      } else {
        o->extent_map.request_reshard(0, length);
      }
    }
  }

  o->onode.size = offset;

  txc->write_onode(o);
}
int BlueStore::_truncate(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << dendl;
  int r = 0;
  if (offset >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _do_truncate(txc, c, o, offset);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_do_remove(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o)
{
  set<SharedBlob*> maybe_unshared_blobs;
  bool is_gen = !o->oid.is_no_gen();
  _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
  }
  o->exists = false;
  string key;
  for (auto &s : o->extent_map.shards) {
    dout(20) << __func__ << "  removing shard 0x" << std::hex
             << s.shard_info->offset << std::dec << dendl;
    generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
      [&](const string& final_key) {
        txc->t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }
  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
  txc->note_removed_object(o);
  o->extent_map.clear();
  o->onode = bluestore_onode_t();
  _debug_obj_on_delete(o->oid);

  if (!is_gen || maybe_unshared_blobs.empty()) {
    return 0;
  }

  // see if we can unshare blobs still referenced by the head
  dout(10) << __func__ << " gen and maybe_unshared_blobs "
           << maybe_unshared_blobs << dendl;
  ghobject_t nogen = o->oid;
  nogen.generation = ghobject_t::NO_GEN;
  OnodeRef h = c->onode_map.lookup(nogen);

  if (!h || !h->exists) {
    return 0;
  }

  dout(20) << __func__ << " checking for unshareable blobs on " << h
           << " " << h->oid << dendl;
  map<SharedBlob*,bluestore_extent_ref_map_t> expect;
  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        sb->loaded &&
        maybe_unshared_blobs.count(sb)) {
      if (b.is_compressed()) {
        expect[sb].get(0, b.get_ondisk_length());
      } else {
        b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
            expect[sb].get(off, len);
            return 0;
          });
      }
    }
  }

  vector<SharedBlob*> unshared_blobs;
  unshared_blobs.reserve(maybe_unshared_blobs.size());
  for (auto& p : expect) {
    dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
    if (p.first->persistent->ref_map == p.second) {
      SharedBlob *sb = p.first;
      dout(20) << __func__ << "  unsharing " << *sb << dendl;
      unshared_blobs.push_back(sb);
      txc->unshare_blob(sb);
      uint64_t sbid = c->make_blob_unshared(sb);
      string key;
      get_shared_blob_key(sbid, &key);
      txc->t->rmkey(PREFIX_SHARED_BLOB, key);
    }
  }

  if (unshared_blobs.empty()) {
    return 0;
  }

  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        std::find(unshared_blobs.begin(), unshared_blobs.end(),
                  sb) != unshared_blobs.end()) {
      dout(20) << __func__ << "  unsharing " << e << dendl;
      bluestore_blob_t& blob = e.blob->dirty_blob();
      blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
      h->extent_map.dirty_range(e.logical_offset, 1);
    }
  }
  txc->write_onode(h);

  return 0;
}
int BlueStore::_remove(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " onode " << o.get()
           << " txc " << txc << dendl;

  auto start_time = mono_clock::now();
  int r = _do_remove(txc, c, o);
  log_latency_fn(
    __func__,
    l_bluestore_remove_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_op_age,
    [&](const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid = " << c->cid
           << " oid = " << o->oid;
      return ostr.str();
    }
  );

  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::_setattr(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        const string& name,
                        bufferptr& val)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << dendl;
  int r = 0;

  if (val.is_partial()) {
    auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
                                                       val.length());
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  } else {
    auto& b = o->onode.attrs[name.c_str()] = val;
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << " = " << r << dendl;
  return r;
}
int BlueStore::_setattrs(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         const map<string,bufferptr>& aset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << dendl;
  int r = 0;

  for (map<string,bufferptr>::const_iterator p = aset.begin();
       p != aset.end(); ++p) {
    if (p->second.is_partial()) {
      auto& b = o->onode.attrs[p->first.c_str()] =
        bufferptr(p->second.c_str(), p->second.length());
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
    } else {
      auto& b = o->onode.attrs[p->first.c_str()] = p->second;
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
    }
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattr(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o,
                       const string& name)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << dendl;
  int r = 0;
  auto it = o->onode.attrs.find(name.c_str());
  if (it == o->onode.attrs.end())
    goto out;

  o->onode.attrs.erase(it);
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattrs(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;

  if (o->onode.attrs.empty())
    goto out;

  o->onode.attrs.clear();
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
{
  const string& omap_prefix = o->get_omap_prefix();
  string prefix, tail;
  o->get_omap_header(&prefix);
  o->get_omap_tail(&tail);
  txc->t->rm_range_keys(omap_prefix, prefix, tail);
  txc->t->rmkey(omap_prefix, tail);
  dout(20) << __func__ << " remove range start: "
           << pretty_binary_string(prefix) << " end: "
           << pretty_binary_string(tail) << dendl;
}
int BlueStore::_omap_clear(TransContext *txc,
                           CollectionRef& c,
                           OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
    o->onode.clear_omap_flag();
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_setkeys(TransContext *txc,
                             CollectionRef& c,
                             OnodeRef& o,
                             bufferlist &bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  auto p = bl.cbegin();
  __u32 num;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags();
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  string final_key;
  o->get_omap_key(string(), &final_key);
  size_t base_key_len = final_key.size();
  decode(num, p);
  while (num--) {
    string key;
    bufferlist value;
    decode(key, p);
    decode(value, p);
    final_key.resize(base_key_len); // keep prefix
    final_key += key;
    dout(20) << __func__ << "  " << pretty_binary_string(final_key)
             << " <- " << key << dendl;
    txc->t->set(prefix, final_key, value);
  }
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
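
// Every omap row for an onode shares a fixed binary prefix derived from the
// onode (get_omap_key() with an empty string yields just that prefix); the
// resize(base_key_len) above rewinds final_key to that prefix so each user
// key can be appended in turn without rebuilding the prefix from scratch.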
int BlueStore::_omap_setheader(TransContext *txc,
                               CollectionRef& c,
                               OnodeRef& o,
                               bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  string key;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags();
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  o->get_omap_header(&key);
  txc->t->set(prefix, key, bl);
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkeys(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& o,
                            bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  auto p = bl.cbegin();
  __u32 num;
  string final_key;

  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    decode(num, p);
    while (num--) {
      string key;
      decode(key, p);
      final_key.resize(base_key_len); // keep prefix
      final_key += key;
      dout(20) << __func__ << "  rm " << pretty_binary_string(final_key)
               << " <- " << key << dendl;
      txc->t->rmkey(prefix, final_key);
    }
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkey_range(TransContext *txc,
                                 CollectionRef& c,
                                 OnodeRef& o,
                                 const string& first, const string& last)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  string key_first, key_last;
  int r = 0;
  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->flush();
    o->get_omap_key(first, &key_first);
    o->get_omap_key(last, &key_last);
    txc->t->rm_range_keys(prefix, key_first, key_last);
    dout(20) << __func__ << " remove range start: "
             << pretty_binary_string(key_first) << " end: "
             << pretty_binary_string(key_last) << dendl;
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_set_alloc_hint(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& o,
  uint64_t expected_object_size,
  uint64_t expected_write_size,
  uint32_t flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << dendl;
  int r = 0;
  o->onode.expected_object_size = expected_object_size;
  o->onode.expected_write_size = expected_write_size;
  o->onode.alloc_hint_flags = flags;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << " = " << r << dendl;
  return r;
}
int BlueStore::_clone(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& oldo,
                      OnodeRef& newo)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << dendl;
  int r = 0;
  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
    derr << __func__ << " mismatched hash on " << oldo->oid
         << " and " << newo->oid << dendl;
    return -EINVAL;
  }

  _assign_nid(txc, newo);

  // clone data
  oldo->flush();
  _do_truncate(txc, c, newo, 0);
  if (cct->_conf->bluestore_clone_cow) {
    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
  }

  // clone attrs
  newo->onode.attrs = oldo->onode.attrs;

  // clone omap
  if (newo->onode.has_omap()) {
    dout(20) << __func__ << " clearing old omap data" << dendl;
    newo->flush();
    _do_omap_clear(txc, newo);
    newo->onode.clear_omap_flag();
  }
  if (oldo->onode.has_omap()) {
    dout(20) << __func__ << " copying omap data" << dendl;
    if (newo->oid.is_pgmeta()) {
      newo->onode.set_omap_flags_pgmeta();
    } else {
      newo->onode.set_omap_flags();
    }
    const string& prefix = newo->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    oldo->get_omap_header(&head);
    oldo->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << "  reached tail" << dendl;
        break;
      } else {
        dout(30) << __func__ << "  got header/data "
                 << pretty_binary_string(it->key()) << dendl;
        string key;
        newo->rewrite_omap_key(it->key(), &key);
        txc->t->set(prefix, key, it->value());
      }
      it->next();
    }
    string new_tail;
    bufferlist new_tail_value;
    newo->get_omap_tail(&new_tail);
    txc->t->set(prefix, new_tail, new_tail_value);
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_do_clone_range(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& oldo,
  OnodeRef& newo,
  uint64_t srcoff,
  uint64_t length,
  uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid
           << " 0x" << std::hex << srcoff << "~" << length << " -> "
           << " 0x" << dstoff << "~" << length << std::dec << dendl;
  oldo->extent_map.fault_range(db, srcoff, length);
  newo->extent_map.fault_range(db, dstoff, length);
  _dump_onode<30>(cct, *oldo);
  _dump_onode<30>(cct, *newo);

  oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
  _dump_onode<30>(cct, *oldo);
  _dump_onode<30>(cct, *newo);
  return 0;
}
int BlueStore::_clone_range(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& oldo,
                            OnodeRef& newo,
                            uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec << dendl;
  int r = 0;

  if (srcoff + length >= OBJECT_MAX_SIZE ||
      dstoff + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
    goto out;
  }
  if (srcoff + length > oldo->onode.size) {
    r = -EINVAL;
    goto out;
  }

  _assign_nid(txc, newo);

  if (length > 0) {
    if (cct->_conf->bluestore_clone_cow) {
      _do_zero(txc, c, newo, dstoff, length);
      _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
    } else {
      bufferlist bl;
      r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
      if (r < 0)
        goto out;
      r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
      if (r < 0)
        goto out;
    }
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_rename(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& oldo,
                       OnodeRef& newo,
                       const ghobject_t& new_oid)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << new_oid << dendl;
  int r;
  ghobject_t old_oid = oldo->oid;
  mempool::bluestore_cache_meta::string new_okey;

  if (newo) {
    if (newo->exists) {
      r = -EEXIST;
      goto out;
    }
    ceph_assert(txc->onodes.count(newo) == 0);
  }

  txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());

  // rewrite shards
  {
    oldo->extent_map.fault_range(db, 0, oldo->onode.size);
    get_object_key(cct, new_oid, &new_okey);
    string key;
    for (auto &s : oldo->extent_map.shards) {
      generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset,
                                          &key,
        [&](const string& final_key) {
          txc->t->rmkey(PREFIX_OBJ, final_key);
        }
      );
      s.dirty = true;
    }
  }

  newo = oldo;
  txc->write_onode(newo);

  // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
  // Onode in the old slot
  c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
  r = 0;

  // hold a ref to new Onode in old name position, to ensure we don't drop
  // it from the cache before this txc commits (or else someone may come along
  // and read newo's metadata via the old name).
  txc->note_modified_object(oldo);

 out:
  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
           << new_oid << " = " << r << dendl;
  return r;
}
int BlueStore::_create_collection(
  TransContext *txc,
  const coll_t &cid,
  unsigned bits,
  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
  int r;
  bufferlist bl;

  {
    std::unique_lock l(coll_lock);
    if (*c) {
      r = -EEXIST;
      goto out;
    }
    auto p = new_coll_map.find(cid);
    ceph_assert(p != new_coll_map.end());
    *c = p->second;
    (*c)->cnode.bits = bits;
    coll_map[cid] = *c;
    new_coll_map.erase(p);
  }
  encode((*c)->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(cid), bl);
  r = 0;

 out:
  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
  return r;
}
int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
                                  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << dendl;
  int r;

  (*c)->flush_all_but_last();
  {
    std::unique_lock l(coll_lock);
    if (!*c) {
      r = -ENOENT;
      goto out;
    }
    size_t nonexistent_count = 0;
    ceph_assert((*c)->exists);
    if ((*c)->onode_map.map_any([&](Onode* o) {
          if (o->exists) {
            dout(1) << __func__ << " " << o->oid << " " << o
                    << " exists in onode_map" << dendl;
            return true;
          }
          ++nonexistent_count;
          return false;
        })) {
      r = -ENOTEMPTY;
      goto out;
    }

    vector<ghobject_t> ls;
    ghobject_t next;
    // Enumerate onodes in db, up to nonexistent_count + 1
    // then check if all of them are marked as non-existent.
    // Bypass the check if (next != ghobject_t::get_max())
    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
                         nonexistent_count + 1, false, &ls, &next);
    if (r >= 0) {
      // If true, the collection has more objects than nonexistent_count,
      // so bypass the check.
      bool exists = (!next.is_max());
      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
        dout(10) << __func__ << " oid " << *it << dendl;
        auto onode = (*c)->onode_map.lookup(*it);
        exists = !onode || onode->exists;
        if (exists) {
          dout(1) << __func__ << " " << *it
                  << " exists in db, "
                  << (!onode ? "not present in ram" : "present in ram")
                  << dendl;
        }
      }
      if (!exists) {
        _do_remove_collection(txc, c);
        r = 0;
      } else {
        dout(10) << __func__ << " " << cid
                 << " is non-empty" << dendl;
        r = -ENOTEMPTY;
      }
    }
  }

 out:
  dout(10) << __func__ << " " << cid << " = " << r << dendl;
  return r;
}
void BlueStore::_do_remove_collection(TransContext *txc,
                                      CollectionRef *c)
{
  coll_map.erase((*c)->cid);
  txc->removed_collections.push_back(*c);
  (*c)->exists = false;
  _osr_register_zombie((*c)->osr.get());
  txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
  c->reset();
}
int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  std::unique_lock l(c->lock);
  std::unique_lock l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer.  this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split.  leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // the destination should initially be empty.
  ceph_assert(d->onode_map.empty());
  ceph_assert(d->shared_blob_set.empty());
  ceph_assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits.  note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  ceph_assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}
int BlueStore::_merge_collection(
  TransContext *txc,
  CollectionRef *c,
  CollectionRef& d,
  unsigned bits)
{
  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
           << " bits " << bits << dendl;
  std::unique_lock l((*c)->lock);
  std::unique_lock l2(d->lock);
  int r;

  coll_t cid = (*c)->cid;

  // flush all previous deferred writes on the source collection to ensure
  // that all deferred writes complete before we merge as the target
  // collection's sequencer may need to order new ops after those writes.
  _osr_drain((*c)->osr.get());

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split.  leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // adjust bits.  note that this will be redundant for all but the first
  // merge call for the parent/target.
  d->cnode.bits = bits;

  // behavior depends on target (d) bits, so this after that is updated.
  (*c)->split_cache(d.get());

  // remove source collection
  {
    std::unique_lock l3(coll_lock);
    _do_remove_collection(txc, c);
  }

  r = 0;

  bufferlist bl;
  encode(d->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(d->cid), bl);

  dout(10) << __func__ << " " << cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}
void BlueStore::log_latency(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  const char* info) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << info
            << dendl;
  }
}
void BlueStore::log_latency_fn(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  std::function<string (const ceph::timespan& lat)> fn) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
            << ", latency = " << l
            << fn(l)
            << dendl;
  }
}
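// Typical use of the fn variant (sketch; the counter, config option, and
// lambda body below are illustrative, not taken from a real call site):
//
//   log_latency_fn(
//     __func__,
//     l_bluestore_commit_lat,
//     lat,
//     cct->_conf->bluestore_log_op_age,
//     [&](const ceph::timespan& t) {
//       return ", txc = " + stringify(txc);
//     });
//
// so the slow-op warning can carry per-op context in addition to the latency.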
#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  pending_kv_ios += txc.ios;
  if (txc.deferred_txn) {
    pending_deferred_ios += txc.ios;
  }

  uint64_t started = 0;
  uint64_t completed = 0;
  if (should_trace(&started, &completed)) {
    txc.tracing = true;
    uint64_t rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate;

    db.get_property(
      "rocksdb.base-level",
      &rocksdb_base_level);
    db.get_property(
      "rocksdb.estimate-pending-compaction-bytes",
      &rocksdb_estimate_pending_compaction_bytes);
    db.get_property(
      "rocksdb.cur-size-all-mem-tables",
      &rocksdb_cur_size_all_mem_tables);
    db.get_property(
      "rocksdb.compaction-pending",
      &rocksdb_compaction_pending);
    db.get_property(
      "rocksdb.mem-table-flush-pending",
      &rocksdb_mem_table_flush_pending);
    db.get_property(
      "rocksdb.num-running-compactions",
      &rocksdb_num_running_compactions);
    db.get_property(
      "rocksdb.num-running-flushes",
      &rocksdb_num_running_flushes);
    db.get_property(
      "rocksdb.actual-delayed-write-rate",
      &rocksdb_actual_delayed_write_rate);

    tracepoint(
      bluestore,
      transaction_initial_state,
      txc.osr->get_sequencer_id(),
      txc.seq,
      throttle_bytes.get_current(),
      throttle_deferred_bytes.get_current(),
      pending_kv_ios,
      pending_deferred_ios,
      started,
      completed,
      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));

    tracepoint(
      bluestore,
      transaction_initial_state_rocksdb,
      txc.osr->get_sequencer_id(),
      txc.seq,
      rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate);
  }
}
#endif
mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
  TransContext &txc, PerfCounters *logger, int state)
{
  mono_clock::time_point now = mono_clock::now();
  mono_clock::duration lat = now - txc.last_stamp;
  logger->tinc(state, lat);
#if defined(WITH_LTTNG)
  if (txc.tracing &&
      state >= l_bluestore_state_prepare_lat &&
      state <= l_bluestore_state_done_lat) {
    OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
    tracepoint(
      bluestore,
      transaction_state_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      state,
      ceph::to_seconds<double>(lat));
  }
#endif
  txc.last_stamp = now;
  return lat;
}
bool BlueStore::BlueStoreThrottle::try_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  throttle_bytes.get(txc.cost);

  if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
    emit_initial_tracepoint(db, txc, start_throttle_acquire);
    return true;
  } else {
    return false;
  }
}

void BlueStore::BlueStoreThrottle::finish_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  ceph_assert(txc.deferred_txn);
  throttle_deferred_bytes.get(txc.cost);
  emit_initial_tracepoint(db, txc, start_throttle_acquire);
}
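// Expected call pattern (inferred from the code above): try_start_transaction()
// always charges throttle_bytes but only try-acquires the deferred-byte
// throttle; a false return means the deferred throttle is full, and the caller
// is expected to block/queue until it can call finish_start_transaction(),
// which performs the blocking get() and then emits the same initial tracepoint.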
#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(mono_clock::now() - txc.start));
  }
}
#endif

#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(lat));
  }
}
#endif
// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";
int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}
void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
                const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    std::max<size_t>(value_size,
                     key_hist[prefix][key_slab].val_map[value_slab].max_len);
}
void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}
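// The resulting formatter output is shaped roughly like this (values are
// invented for illustration):
//
//   "rocksdb_value_distribution": { "[0,64)": 120, "[64,128)": 40, ... },
//   "rocksdb_key_value_histogram": {
//     "prefix": "O",
//     "key_hist": {
//       "[32,64)": 75, "max_len": 61,
//       "value_hist": { "[0,64)": 70, "max_len": 48 }
//     }
//   }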
// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_pgmeta_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  auto start = coarse_mono_clock::now();

  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = std::max(max_key_size, key_size);
    max_value_size = std::max(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PGMETA_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
      num_pgmeta_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  ceph::timespan duration = coarse_mono_clock::now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}
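// Note: this walks every key in the store via a whole-space iterator, so it
// can take a long time on a large DB.  It is normally reached through an
// admin-socket command (e.g. the OSD's "calc_objectstore_db_histogram").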
void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}
// For external callers.
// We use a best-effort policy here, e.g., we don't care if some pinned
// onodes/data still remain in the cache after this command completes.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}
void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}
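// Worked example (numbers are illustrative; callers compute the pads): for a
// 4096-byte block size, a 100-byte write at offset 10 gets head_pad=10 and
// tail_pad = 4096 - 110 = 3986, turning `padded` into a single aligned
// 4096-byte block with zeros around the payload.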
void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
               << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
           << " (" << onode_part << " bytes onode + "
           << blob_part << " bytes spanning blobs + "
           << extent_part << " bytes inline extents)"
           << dendl;

  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}
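// The encode above is two-phase: a denc "bound" pass computes a worst-case
// size so get_contiguous_appender(bound, true) can reserve a single flat
// buffer, then the real encode writes into it without reallocation.  The
// logical-offset deltas taken between the steps produce the onode/blob/extent
// byte breakdown logged above.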
void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (!first) {
        s0 += ", ";
      }
      first = false;
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}
void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
                                          size_t extents)
{
  alloc_stats_count++;
  alloc_stats_fragments += extents;
  alloc_stats_size += need;
}
void BlueStore::_record_allocation_stats()
{
  // don't care about data consistency,
  // fields can be partially modified while making the tuple
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
          << probe_count << ":"
          << " cnt: " << std::get<0>(t0)
          << " frags: " << std::get<1>(t0)
          << " size: " << std::get<2>(t0)
          << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
            << base + (probe_count % base) << ": "
            << std::get<0>(t)
            << ", " << std::get<1>(t)
            << ", " << std::get<2>(t)
            << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  auto prev = probe_count++;
  auto mask = (1 << alloc_stats_history.size()) - 1;
  probe_count &= mask;

  for (size_t i = cbits(prev ^ probe_count) - 1; i > 0; --i) {
    alloc_stats_history[i] = alloc_stats_history[i - 1];
  }
  alloc_stats_history[0].swap(t0);
}
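// Worked example of the rotation (illustrative): with a 5-slot history the
// mask is (1 << 5) - 1 = 31, so probe_count wraps mod 32.  Moving from
// prev=3 (0b011) to probe_count=4 (0b100) gives cbits(3 ^ 4) - 1 = 2 shifts:
// history[2] <- history[1], history[1] <- history[0], and the fresh tuple is
// swapped into history[0].  Lower slots therefore hold more recent
// aggregates, matching the -1, -2, -4, ... labels printed above.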
// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't call for the second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
                  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
        collections_reduced.push_back(std::move(collections_bfs[pos]));
        objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}
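// Worked example (illustrative): with granularity=0x1000, the extent
// {offset=0x12800, len=0x1000} yields pos=0x12 and
// end_pos = 1 + (0x12800 + 0x1000 - 1)/0x1000 = 0x14, so the bloom filters
// at positions 0x12 and 0x13 are inspected; only positions that actually saw
// inserts survive into the reduced vectors.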
bool BlueStoreRepairer::remove_key(KeyValueDB *db,
                                   const string& prefix,
                                   const string& key)
{
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);
  return true;
}
void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
{
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append("1");
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}
bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }

  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}
bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
                                   const string& key,
                                   const store_statfs_t& new_statfs)
{
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}
bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
                                   FreelistManager* fm,
                                   uint64_t offset, uint64_t len)
{
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}
bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
                                       FreelistManager* fm,
                                       uint64_t offset, uint64_t len)
{
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}
bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
{
  // this is just a stub to count num of repairs properly;
  // the actual repair happens in BlueStore::_close_db_and_around()
  // while doing _sync_bluefs_and_fm
  ++out_of_sync_flag;
  ++to_repair_cnt;
  return true;
}
KeyValueDB::Transaction BlueStoreRepairer::fix_spanning_blobs(KeyValueDB* db)
{
  if (!fix_onode_txn) {
    fix_onode_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  return fix_onode_txn;
}
bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
  }
  return true;
}
unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_onode_txn) {
    db->submit_transaction_sync(fix_onode_txn);
    fix_onode_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }
  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}
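// Typical repair flow (sketch; call sites and arguments are illustrative):
// the fsck path queues fixes on the repairer while it scans, then commits
// them all at once:
//
//   BlueStoreRepairer repairer;
//   ...
//   repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY, expected_statfs);
//   ...
//   unsigned repaired = repairer.apply(db);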
// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
        db_avail4slow,
        max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
        res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}
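// Worked example (made-up sizes): db_total = 10 GiB, observed LOG+WAL+DB
// maxima on the DB device sum to 6 GiB, and 1 GiB of DB-level data has
// historically spilled to the slow device, so max_db_use = 7 GiB and
// avail = min(db_avail4slow, 10 - 7 = 3 GiB).  If that headroom still
// exceeds the SLOW bytes already resident on the DB device, new SLOW files
// are steered to BDEV_DB rather than BDEV_SLOW.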
void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
{
  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
}
void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to match up
    // with bluestore.  the slow device is always the second one (when a
    // dedicated block.db device is present and used at bdev 0).  the wal
    // device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    }
    else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}
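// Example: "db" (and any other short name) maps to LEVEL_DB, "db.slow" to
// LEVEL_SLOW, and "db.wal" to LEVEL_WAL; the level is smuggled through the
// void* hint and decoded back in select_prefer_bdev().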
void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();
  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
       << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
       << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
       << ", db_avail:" << db_avail4slow << std::endl
       << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names { {
    // ... (column header strings elided)
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}

// =======================================================