// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
#include <sys/types.h>

#include <boost/container/flat_set.hpp>
#include "boost/algorithm/string.hpp"

#include "include/cpp-btree/btree_set.h"

#include "bluestore_common.h"
#include "BlueStore.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "include/str_map.h"
#include "include/util.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "common/RWLock.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#include "common/blkdev.h"
#include "common/numa.h"
#if defined(WITH_LTTNG)
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/bluestore.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif
#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);
// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
			      bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
			      bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
			      bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
			      bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
			      bluestore_cache_other);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
			      bluestore_txc);
// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_PGMETA_OMAP = "P";  // u64 + keyname -> value (for meta coll)
const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t

const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192
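// Illustrative layout sketch (an assumption drawn from the two comments
// above, not a quote from the original source):
//   [0, 4K)   bdev label          (BDEV_LABEL_BLOCK_SIZE)
//   [4K, 8K)  bluefs superblock   (second block of the device)
//   [8K, ...) usable space; allocation begins at SUPER_RESERVED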
#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS      4
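// Illustrative sketch (not from the original source) of how a reader can
// unpack the flag bits packed into the low bits of a blobid;
// ExtentMap::decode_some() is the authoritative implementation.
//   uint64_t blobid = ...;  // as read from an encoded extent
//   bool contiguous = blobid & BLOBID_FLAG_CONTIGUOUS;
//   bool zero_off   = blobid & BLOBID_FLAG_ZEROOFFSET;
//   bool same_len   = blobid & BLOBID_FLAG_SAMELENGTH;
//   bool spanning   = blobid & BLOBID_FLAG_SPANNING;
//   uint64_t id     = blobid >> BLOBID_SHIFT_BITS;  // spanning or local id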
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'

/*
 * extent shard key
 *
 * object prefix key
 * u32
 * 'x'
 */
#define EXTENT_SHARD_KEY_SUFFIX 'x'
/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 */
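// Worked example (illustrative, not from the original source): the string
// "a#b" is emitted as "a#23b!".  '#' (0x23) is <= '#', so it becomes '#'
// plus two hex digits; 'a' and 'b' pass through unchanged; '!' terminates.
// A literal '!' (0x21) would likewise be escaped as "#21", which is what
// makes '!' safe to use as the terminator.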
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') {
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}
inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}
static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
	return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
	return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr - buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr - buff);
  }
  return p - orig_p;
}
// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i = 0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
	(mode == HEX && in.length() - i >= 4 &&
	 ((in[i] < 32 || (unsigned char)in[i] > 126) ||
	  (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
	  (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
	  (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
	out.append(in.c_str() + from, i - from);
	out.push_back('\'');
      }
      if (mode != HEX) {
	out.append("0x");
	mode = HEX;
      }
      if (in.length() - i >= 4) {
	// print a whole u32 at once
	snprintf(buf, sizeof(buf), "%08x",
		 (uint32_t)(((unsigned char)in[i] << 24) |
			    ((unsigned char)in[i+1] << 16) |
			    ((unsigned char)in[i+2] << 8) |
			    ((unsigned char)in[i+3] << 0)));
	i += 3;
      } else {
	snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
	out.push_back('\'');
	mode = STRING;
	from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}
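// Illustrative example (not from the original source): the bytes
// {0x01, 0x02, 0x03, 0x04, 'a', 'b'} render as 0x01020304'ab' -- binary
// runs are shown as hex (whole u32s where possible) and printable runs are
// quoted.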
template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}
static void get_coll_key_range(const coll_t& cid, int bits,
			       string *temp_start, string *temp_end,
			       string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    _key_encode_shard(pgid.shard, start);
    *temp_start = *start;

    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    _key_encode_u32(reverse_hash, start);
    _key_encode_u32(reverse_hash, temp_start);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    _key_encode_u32(end_hash, end);
    _key_encode_u32(end_hash, temp_end);
  } else {
    _key_encode_shard(shard_id_t::NO_SHARD, start);
    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
    *end = *start;
    _key_encode_u32(0, start);
    _key_encode_u32(0xffffffff, end);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }
}
static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}
template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < 1 + 8 + 4)
    return -1;
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something is wrong.
    return -8;
  }

  return 0;
}
template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = 1 + 8 + 4 +
		   (oid.hobj.nspace.length() * 3 + 1) +
		   (oid.hobj.get_key().length() * 3 + 1) +
		   1 + // for '<', '=', or '>'
		   (oid.hobj.oid.name.length() * 3 + 1) +
		   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      ceph_assert(r == 0 && t == oid);
    }
  }
}
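// Illustrative property (not from the original source): for two objects a
// and b, byte-wise comparison of their encoded keys matches ghobject_t
// ordering -- exactly what the shard/pool biasing, bit-reversed hash, and
// escaped strings above are arranged to guarantee:
//   S ka, kb;
//   get_object_key(cct, a, &ka);
//   get_object_key(cct, b, &kb);
//   // (a < b) == (ka < kb)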
// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
				 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}
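// Resulting layout (illustrative): <onode key bytes><u32 offset>'x'.  Shard
// keys therefore sort directly after their onode key, and the trailing 'x'
// vs 'o' distinguishes the two key types from the last byte alone.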
static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  ceph_assert(key->size() > sizeof(uint32_t) + 1);
  ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}
template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    ceph_assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}
int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  ceph_assert(key.size() > sizeof(uint32_t) + 1);
  ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}
static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}

static void get_pool_stat_key(int64_t pool_id, string *key)
{
  key->clear();
  _key_encode_u64(pool_id, key);
}

static int get_key_pool_stat(const string& key, uint64_t* pool_id)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, pool_id);
  return 0;
}
template <int LogLevelV>
void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(LogLevelV) << __func__ << "  shard " << *s.shard_info
		    << (s.loaded ? " (loaded)" : "")
		    << (s.dirty ? " (dirty)" : "")
		    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(LogLevelV) << __func__ << "  " << e << dendl;
    ceph_assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
	v.push_back(blob.get_csum_item(i));
      dout(LogLevelV) << __func__ << "      csum: " << std::hex << v
		      << std::dec << dendl;
    }
    std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(LogLevelV) << __func__ << "       0x" << std::hex << i.first
		      << "~" << i.second->length << std::dec
		      << " " << *i.second << dendl;
    }
  }
}
template <int LogLevelV>
void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
{
  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
    return;
  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
		  << " nid " << o.onode.nid
		  << " size 0x" << std::hex << o.onode.size
		  << " (" << std::dec << o.onode.size << ")"
		  << " expected_object_size " << o.onode.expected_object_size
		  << " expected_write_size " << o.onode.expected_write_size
		  << " in " << o.onode.extent_map_shards.size() << " shards"
		  << ", " << o.extent_map.spanning_blob_map.size()
		  << " spanning blobs"
		  << dendl;
  for (auto p = o.onode.attrs.begin();
       p != o.onode.attrs.end();
       ++p) {
    dout(LogLevelV) << __func__ << "  attr " << p->first
		    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map<LogLevelV>(cct, o.extent_map);
}
template <int LogLevelV>
void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
{
  dout(LogLevelV) << __func__ << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}
// merge operators

struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    ceph_assert(llen == rlen);
    ceph_assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const ceph_le64* lv = (const ceph_le64*)ldata;
    const ceph_le64* rv = (const ceph_le64*)rdata;
    ceph_le64* nv = &(ceph_le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  const char *name() const override {
    return "int64_array";
  }
};
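// Illustrative merge semantics (not from the original source): operands are
// summed element-wise as little-endian int64s, so statfs deltas accumulate
// in RocksDB without a read-modify-write cycle, e.g.
//   existing value: [10, 20]   merge operand: [1, -5]   result: [11, 15]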
// Buffer

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}
// GarbageCollector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  ceph_assert(start_offset <= start_touch_offset &&
	      end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
	   << lookup_start_offset << ", " << lookup_end_offset
	   << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
	 it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
	     << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
	     << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
	it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
	if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
	  --blob_info_counted->expected_allocations; // don't need to allocate
						     // new AU for compressed
						     // data since another
						     // collocated uncompressed
						     // blob already exists
	  dout(30) << __func__ << " --expected:"
		   << alloc_unit_start << dendl;
	}
	used_alloc_unit = alloc_unit_end;
	blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {
      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
	affected_blobs.emplace(
	  b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
	(used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
	       << bi.expected_allocations << " end_au:"
	       << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      ceph_assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
	       << " unref 0x" << std::hex << it->length
	       << " referenced = 0x" << bi.referenced_bytes
	       << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
	bi.first_lextent = it;
	bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
	// don't need to allocate new AU for compressed data since another
	// collocated uncompressed blob already exists
	--blob_info_counted->expected_allocations;
	dout(30) << __func__ << " --expected_allocations:"
		 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
	round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
	       << " expected4release=" << blob_expected_for_release
	       << " expected_allocations=" << bi.expected_allocations
	       << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
	if (bi.collect_candidate) {
	  auto it = bi.first_lextent;
	  bool bExit = false;
	  do {
	    if (it->blob.get() == b) {
	      extents_to_collect.insert(it->logical_offset, it->length);
	    }
	    bExit = it == bi.last_lextent;
	    ++it;
	  } while (!bExit);
	}
	expected_for_release += blob_expected_for_release;
	expected_allocations += bi.expected_allocations;
      }
    }
  }
}
int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t >();
  blob_info_counted = nullptr;

  uint64_t gc_start_offset = start_offset;
  uint64_t gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {
      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
	dout(30) << __func__ << " affected_blob:" << *b
		 << " unref 0x" << std::hex << o << "~" << l
		 << std::dec << dendl;
	affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
	   << gc_start_offset << ", " << gc_end_offset
	   << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
			       gc_start_offset,
			       gc_end_offset,
			       start_offset,
			       end_offset,
			       min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
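// Worked example (illustrative, assuming min_alloc_size = 64K): a fully
// dereferenced compressed blob occupying 256K on disk yields
// blob_expected_for_release = round_up_to(256K, 64K) / 64K = 4 allocation
// units; if rewriting its protrusive data is expected to consume 1 new AU
// (expected_allocations = 1), the net benefit is 4 - 1 = 3, and the blob is
// collected once that meets bluestore_gc_enable_blob_threshold.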
// LruOnodeCacheShard
struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Onode,
    boost::intrusive::member_hook<
      BlueStore::Onode,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Onode::lru_item> > list_t;

  list_t lru;

  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}

  void _add(BlueStore::Onode* o, int level) override
  {
    if (o->put_cache()) {
      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
    } else {
      ++num_pinned;
    }
    ++num; // we count both pinned and unpinned entries
    dout(20) << __func__ << " " << this << " " << o->oid
	     << " added, num=" << num << dendl;
  }
  void _rm(BlueStore::Onode* o) override
  {
    if (o->pop_cache()) {
      lru.erase(lru.iterator_to(*o));
    } else {
      ceph_assert(num_pinned);
      --num_pinned;
    }
    ceph_assert(num);
    --num;
    dout(20) << __func__ << " " << this << " " << o->oid
	     << " removed, num=" << num << dendl;
  }
  void _pin(BlueStore::Onode* o) override
  {
    lru.erase(lru.iterator_to(*o));
    ++num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid
	     << " pinned" << dendl;
  }
  void _unpin(BlueStore::Onode* o) override
  {
    lru.push_front(*o);
    ceph_assert(num_pinned);
    --num_pinned;
    dout(20) << __func__ << " " << this << " " << o->oid
	     << " unpinned" << dendl;
  }
  void _trim_to(uint64_t new_size) override
  {
    if (new_size >= lru.size()) {
      return; // don't even try
    }
    uint64_t n = lru.size() - new_size;
    auto p = lru.end();
    ceph_assert(p != lru.begin());
    --p;
    ceph_assert(num >= n);
    num -= n;
    while (n-- > 0) {
      BlueStore::Onode *o = &*p;
      dout(20) << __func__ << "  rm " << o->oid << " "
	       << o->nref << " " << o->cached << " " << o->pinned << dendl;
      if (p != lru.begin()) {
	lru.erase(p--);
      } else {
	ceph_assert(n == 0);
	lru.erase(p);
      }
      auto pinned = !o->pop_cache();
      ceph_assert(!pinned);
      o->c->onode_map._remove(o->oid);
    }
  }
  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
  {
    if (to == this) {
      return;
    }
    ceph_assert(o->cached);
    ceph_assert(o->pinned);
    ceph_assert(num);
    ceph_assert(num_pinned);
    --num_pinned;
    --num;
    ++to->num_pinned;
    ++to->num;
  }
  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
  {
    *onodes += num;
    *pinned_onodes += num_pinned;
  }
};
// OnodeCacheShard
BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BlueStore::OnodeCacheShard *c = nullptr;
  // Currently we only implement an LRU cache for onodes
  c = new LruOnodeCacheShard(cct);
  c->logger = logger;
  return c;
}
// LruBufferCacheShard
struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;

  list_t lru;

  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
    if (near) {
      auto q = lru.iterator_to(*near);
      lru.insert(q, *b);
    } else if (level > 0) {
      lru.push_front(*b);
    } else {
      lru.push_back(*b);
    }
    buffer_bytes += b->length;
    num = lru.size();
  }
  void _rm(BlueStore::Buffer *b) override {
    ceph_assert(buffer_bytes >= b->length);
    buffer_bytes -= b->length;
    auto q = lru.iterator_to(*b);
    lru.erase(q);
    num = lru.size();
  }
  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
    src->_rm(b);
    _add(b, 0, nullptr);
  }
  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
    ceph_assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
  }
  void _touch(BlueStore::Buffer *b) override {
    auto p = lru.iterator_to(*b);
    lru.erase(p);
    lru.push_front(*b);
    num = lru.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    while (buffer_bytes > max) {
      auto i = lru.rbegin();
      if (i == lru.rend()) {
	// stop if lru is now empty
	break;
      }

      BlueStore::Buffer *b = &*i;
      ceph_assert(b->is_clean());
      dout(20) << __func__ << " rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
    num = lru.size();
  }

  void add_stats(uint64_t *extents,
		 uint64_t *blobs,
		 uint64_t *buffers,
		 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }
#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = lru.begin(); i != lru.end(); ++i) {
      s += i->length;
    }
    if (s != buffer_bytes) {
      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
	   << dendl;
      for (auto i = lru.begin(); i != lru.end(); ++i) {
	derr << __func__ << " " << *i << dendl;
      }
      ceph_assert(s == buffer_bytes);
    }
    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
	     << " ok" << dendl;
  }
#endif
};
// TwoQBufferCacheShard

struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
  typedef boost::intrusive::list<
    BlueStore::Buffer,
    boost::intrusive::member_hook<
      BlueStore::Buffer,
      boost::intrusive::list_member_hook<>,
      &BlueStore::Buffer::lru_item> > list_t;
  list_t hot;      ///< "Am" hot buffers
  list_t warm_in;  ///< "A1in" newly warm buffers
  list_t warm_out; ///< "A1out" empty buffers we've evicted
  uint64_t buffer_bytes = 0; ///< bytes

  enum {
    BUFFER_NEW = 0,
    BUFFER_WARM_IN,  ///< in warm_in
    BUFFER_WARM_OUT, ///< in warm_out
    BUFFER_HOT,      ///< in hot
    BUFFER_TYPE_MAX
  };

  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type

public:
  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}

  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
  {
    dout(20) << __func__ << " level " << level << " near " << near
	     << " on " << *b
	     << " which has cache_private " << b->cache_private << dendl;
    if (near) {
      b->cache_private = near->cache_private;
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
	warm_in.insert(warm_in.iterator_to(*near), *b);
	break;
      case BUFFER_WARM_OUT:
	ceph_assert(b->is_empty());
	warm_out.insert(warm_out.iterator_to(*near), *b);
	break;
      case BUFFER_HOT:
	hot.insert(hot.iterator_to(*near), *b);
	break;
      default:
	ceph_abort_msg("bad cache_private");
      }
    } else if (b->cache_private == BUFFER_NEW) {
      b->cache_private = BUFFER_WARM_IN;
      if (level > 0) {
	warm_in.push_front(*b);
      } else {
	// take caller hint to start at the back of the warm queue
	warm_in.push_back(*b);
      }
    } else {
      // we got a hint from discard
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
	// stay in warm_in.  move to front, even though 2Q doesn't actually
	// do this.
	dout(20) << __func__ << " move to front of warm " << *b << dendl;
	warm_in.push_front(*b);
	break;
      case BUFFER_WARM_OUT:
	b->cache_private = BUFFER_HOT;
	// move to hot.  fall-thru
      case BUFFER_HOT:
	dout(20) << __func__ << " move to front of hot " << *b << dendl;
	hot.push_front(*b);
	break;
      default:
	ceph_abort_msg("bad cache_private");
      }
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _rm(BlueStore::Buffer *b) override
  {
    dout(20) << __func__ << " " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      ceph_assert(list_bytes[b->cache_private] >= b->length);
      list_bytes[b->cache_private] -= b->length;
    }
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      warm_in.erase(warm_in.iterator_to(*b));
      break;
    case BUFFER_WARM_OUT:
      warm_out.erase(warm_out.iterator_to(*b));
      break;
    case BUFFER_HOT:
      hot.erase(hot.iterator_to(*b));
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    num = hot.size() + warm_in.size();
  }

  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
  {
    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
    src->_rm(b);

    // preserve which list we're on (even if we can't preserve the order!)
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      ceph_assert(!b->is_empty());
      warm_in.push_back(*b);
      break;
    case BUFFER_WARM_OUT:
      ceph_assert(b->is_empty());
      warm_out.push_back(*b);
      break;
    case BUFFER_HOT:
      ceph_assert(!b->is_empty());
      hot.push_back(*b);
      break;
    default:
      ceph_abort_msg("bad cache_private");
    }
    if (!b->is_empty()) {
      buffer_bytes += b->length;
      list_bytes[b->cache_private] += b->length;
    }
    num = hot.size() + warm_in.size();
  }

  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
  {
    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
    if (!b->is_empty()) {
      ceph_assert((int64_t)buffer_bytes + delta >= 0);
      buffer_bytes += delta;
      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
      list_bytes[b->cache_private] += delta;
    }
  }

  void _touch(BlueStore::Buffer *b) override {
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // do nothing (somewhat counter-intuitively!)
      break;
    case BUFFER_WARM_OUT:
      // move from warm_out to hot LRU
      ceph_abort_msg("this happens via discard hint");
      break;
    case BUFFER_HOT:
      // move to front of hot LRU
      hot.erase(hot.iterator_to(*b));
      hot.push_front(*b);
      break;
    }
    num = hot.size() + warm_in.size();
    _audit("_touch_buffer end");
  }

  void _trim_to(uint64_t max) override
  {
    if (buffer_bytes > max) {
      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
      uint64_t khot = max - kin;

      // pre-calculate kout based on average buffer size too,
      // which is typical (the warm_in and hot lists may change later)
      uint64_t kout = 0;
      uint64_t buffer_num = hot.size() + warm_in.size();
      if (buffer_num) {
	uint64_t avg_size = buffer_bytes / buffer_num;
	ceph_assert(avg_size);
	uint64_t calculated_num = max / avg_size;
	kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
      }

      if (list_bytes[BUFFER_HOT] < khot) {
	// hot is small, give slack to warm_in
	kin += khot - list_bytes[BUFFER_HOT];
      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
	// warm_in is small, give slack to hot
	khot += kin - list_bytes[BUFFER_WARM_IN];
      }

      // adjust warm_in list
      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
      uint64_t evicted = 0;

      while (to_evict_bytes > 0) {
	auto p = warm_in.rbegin();
	if (p == warm_in.rend()) {
	  // stop if warm_in list is now empty
	  break;
	}

	BlueStore::Buffer *b = &*p;
	ceph_assert(b->is_clean());
	dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
	ceph_assert(buffer_bytes >= b->length);
	buffer_bytes -= b->length;
	ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
	list_bytes[BUFFER_WARM_IN] -= b->length;
	to_evict_bytes -= b->length;
	evicted += b->length;
	b->state = BlueStore::Buffer::STATE_EMPTY;
	b->data.clear();
	warm_in.erase(warm_in.iterator_to(*b));
	warm_out.push_front(*b);
	b->cache_private = BUFFER_WARM_OUT;
      }

      if (evicted > 0) {
	dout(20) << __func__ << " evicted " << byte_u_t(evicted)
		 << " from warm_in list, done evicting warm_in buffers"
		 << dendl;
      }

      // adjust hot list
      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
      evicted = 0;

      while (to_evict_bytes > 0) {
	auto p = hot.rbegin();
	if (p == hot.rend()) {
	  // stop if hot list is now empty
	  break;
	}

	BlueStore::Buffer *b = &*p;
	dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
	ceph_assert(b->is_clean());
	// adjust evict size before buffer goes invalid
	to_evict_bytes -= b->length;
	evicted += b->length;
	b->space->_rm_buffer(this, b);
      }

      if (evicted > 0) {
	dout(20) << __func__ << " evicted " << byte_u_t(evicted)
		 << " from hot list, done evicting hot buffers"
		 << dendl;
      }

      // adjust warm out list too, if necessary
      int64_t n = warm_out.size() - kout;
      while (n-- > 0) {
	BlueStore::Buffer *b = &*warm_out.rbegin();
	ceph_assert(b->is_empty());
	dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
	b->space->_rm_buffer(this, b);
      }
    }
    num = hot.size() + warm_in.size();
  }

  void add_stats(uint64_t *extents,
		 uint64_t *blobs,
		 uint64_t *buffers,
		 uint64_t *bytes) override {
    *extents += num_extents;
    *blobs += num_blobs;
    *buffers += num;
    *bytes += buffer_bytes;
  }

#ifdef DEBUG_CACHE
  void _audit(const char *when) override
  {
    dout(10) << __func__ << " " << when << " start" << dendl;
    uint64_t s = 0;
    for (auto i = hot.begin(); i != hot.end(); ++i) {
      s += i->length;
    }

    uint64_t hot_bytes = s;
    if (hot_bytes != list_bytes[BUFFER_HOT]) {
      derr << __func__ << " hot_list_bytes "
	   << list_bytes[BUFFER_HOT]
	   << " != actual " << hot_bytes
	   << dendl;
      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
    }

    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
      s += i->length;
    }

    uint64_t warm_in_bytes = s - hot_bytes;
    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
      derr << __func__ << " warm_in_list_bytes "
	   << list_bytes[BUFFER_WARM_IN]
	   << " != actual " << warm_in_bytes
	   << dendl;
      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
    }

    if (s != buffer_bytes) {
      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
	   << dendl;
      ceph_assert(s == buffer_bytes);
    }

    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
	     << " ok" << dendl;
  }
#endif
};
// BufferCacheShard

BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
  CephContext* cct,
  string type,
  PerfCounters *logger)
{
  BufferCacheShard *c = nullptr;
  if (type == "lru")
    c = new LruBufferCacheShard(cct);
  else if (type == "2q")
    c = new TwoQBufferCacheShard(cct);
  else
    ceph_abort_msg("unrecognized cache type");
  c->logger = logger;
  return c;
}
// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}
int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
			<< std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
	// drop middle (split)
	uint32_t tail = b->end() - end;
	if (b->data.length()) {
	  bufferlist bl;
	  bl.substr_of(b->data, b->length - tail, tail);
	  Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
	  nb->maybe_rebuild();
	  _add_buffer(cache, nb, 0, b);
	} else {
	  _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
		      0, b);
	}
	if (!b->is_writing()) {
	  cache->_adjust_size(b, front - (int64_t)b->length);
	}
	b->truncate(front);
	b->maybe_rebuild();
	cache->_audit("discard end 1");
	break;
      } else {
	// drop tail
	if (!b->is_writing()) {
	  cache->_adjust_size(b, front - (int64_t)b->length);
	}
	b->truncate(front);
	b->maybe_rebuild();
	++i;
	continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
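// Summary of the overlap cases handled above (illustrative):
//   buffer spans the whole range       -> split: truncate head, re-add tail
//   buffer overlaps the front only     -> truncate its tail
//   buffer entirely inside the range   -> remove it outright
//   buffer overlaps the back only      -> re-add only the surviving tail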
void BlueStore::BufferSpace::read(
  BufferCacheShard* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard l(cache->lock);
    for (auto i = _data_lower_bound(offset);
	 i != buffer_map.end() && offset < end && i->first < end;
	 ++i) {
      Buffer *b = i->second.get();
      ceph_assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
	val = b->is_writing();
      else
	val = b->is_writing() || b->is_clean();
      if (val) {
	if (b->offset < offset) {
	  uint32_t skip = offset - b->offset;
	  uint32_t l = min(length, b->length - skip);
	  res[offset].substr_of(b->data, skip, l);
	  res_intervals.insert(offset, l);
	  offset += l;
	  length -= l;
	  if (!b->is_writing()) {
	    cache->_touch(b);
	  }
	  continue;
	}
	if (b->offset > offset) {
	  uint32_t gap = b->offset - offset;
	  if (length <= gap) {
	    break;
	  }
	  offset += gap;
	  length -= gap;
	}
	if (!b->is_writing()) {
	  cache->_touch(b);
	}
	if (b->length > length) {
	  res[offset].substr_of(b->data, 0, length);
	  res_intervals.insert(offset, length);
	  break;
	} else {
	  res[offset].append(b->data);
	  res_intervals.insert(offset, b->length);
	  if (b->length == length)
	    break;
	  offset += b->length;
	  length -= b->length;
	}
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  ceph_assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}
void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    ceph_assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }
  cache->_audit("finish_write end");
}
void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
	bufferlist bl;
	bl.substr_of(p->second->data, left, right);
	r._add_buffer(cache,
		      new Buffer(&r, p->second->state, p->second->seq, 0, bl),
		      0, p->second.get());
      } else {
	r._add_buffer(cache,
		      new Buffer(&r, p->second->state, p->second->seq, 0, right),
		      0, p->second.get());
      }
      cache->_adjust_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    ceph_assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
				      p->second->offset - pos, p->second->data),
		    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
				      p->second->offset - pos, p->second->length),
		    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  ceph_assert(writing.empty());
}
// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
  OnodeRef o)
{
  std::lock_guard l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
			  << " raced, returning existing " << p->second
			  << dendl;
    return p->second;
  }
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add(o.get(), 1);
  return o;
}
void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
{
  ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
  onode_map.erase(oid);
}
BlueStore::OnodeSpace::lookup(const ghobject_t
& oid
)
1650 ldout(cache
->cct
, 30) << __func__
<< dendl
;
1655 std::lock_guard
l(cache
->lock
);
1656 ceph::unordered_map
<ghobject_t
,OnodeRef
>::iterator p
= onode_map
.find(oid
);
1657 if (p
== onode_map
.end()) {
1658 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " miss" << dendl
;
1660 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " hit " << p
->second
1661 << " " << p
->second
->nref
1662 << " " << p
->second
->cached
1663 << " " << p
->second
->pinned
1665 // This will pin onode and implicitly touch the cache when Onode
1666 // eventually will become unpinned
1668 ceph_assert(!o
->cached
|| o
->pinned
);
1675 cache
->logger
->inc(l_bluestore_onode_hits
);
1677 cache
->logger
->inc(l_bluestore_onode_misses
);
1682 void BlueStore::OnodeSpace::clear()
1684 std::lock_guard
l(cache
->lock
);
1685 ldout(cache
->cct
, 10) << __func__
<< " " << onode_map
.size()<< dendl
;
1686 for (auto &p
: onode_map
) {
1687 cache
->_rm(p
.second
.get());
1692 bool BlueStore::OnodeSpace::empty()
1694 std::lock_guard
l(cache
->lock
);
1695 return onode_map
.empty();
1698 void BlueStore::OnodeSpace::rename(
1700 const ghobject_t
& old_oid
,
1701 const ghobject_t
& new_oid
,
1702 const mempool::bluestore_cache_other::string
& new_okey
)
1704 std::lock_guard
l(cache
->lock
);
1705 ldout(cache
->cct
, 30) << __func__
<< " " << old_oid
<< " -> " << new_oid
1707 ceph::unordered_map
<ghobject_t
,OnodeRef
>::iterator po
, pn
;
1708 po
= onode_map
.find(old_oid
);
1709 pn
= onode_map
.find(new_oid
);
1710 ceph_assert(po
!= pn
);
1712 ceph_assert(po
!= onode_map
.end());
1713 if (pn
!= onode_map
.end()) {
1714 ldout(cache
->cct
, 30) << __func__
<< " removing target " << pn
->second
1716 cache
->_rm(pn
->second
.get());
1717 onode_map
.erase(pn
);
1719 OnodeRef o
= po
->second
;
1721 // install a non-existent onode at old location
1722 oldo
.reset(new Onode(o
->c
, old_oid
, o
->key
));
1724 cache
->_add(oldo
.get(), 1);
1725 // add at new position and fix oid, key.
1726 // This will pin 'o' and implicitly touch cache
1727 // when it will eventually become unpinned
1728 onode_map
.insert(make_pair(new_oid
, o
));
1729 ceph_assert(o
->pinned
);
bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
{
  std::lock_guard l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second)) {
      return true;
    }
  }
  return false;
}
= 30>
1749 void BlueStore::OnodeSpace::dump(CephContext
*cct
)
1751 for (auto& i
: onode_map
) {
1752 ldout(cct
, LogLevelV
) << i
.first
<< " : " << i
.second
1753 << " " << i
.second
->nref
1754 << " " << i
.second
->cached
1755 << " " << i
.second
->pinned
// SharedBlob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
#undef dout_context
#define dout_context coll->store->cct

void BlueStore::SharedBlob::dump(Formatter* f) const
{
  f->dump_bool("loaded", loaded);
  if (loaded) {
    persistent->dump(f);
  } else {
    f->dump_unsigned("sbid_unloaded", sbid_unloaded);
  }
}

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;
  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  ceph_assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}
BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}
void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    dout(20) << __func__ << " " << this
	     << " removing self from set " << get_parent()
	     << dendl;
  again:
    auto coll_snap = coll;
    if (coll_snap) {
      std::lock_guard l(coll_snap->cache->lock);
      if (coll_snap != coll) {
	goto again;
      }
      if (!coll_snap->shared_blob_set.remove(this, true)) {
	// race with lookup
	return;
      }
      bc._clear(coll_snap->cache);
      coll_snap->cache->rm_blob();
    }
    delete this;
  }
}
void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  ceph_assert(persistent);
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
				    PExtentVector *r,
				    bool *unshare)
{
  ceph_assert(persistent);
  persistent->ref_map.put(offset, length, r,
			  unshare && !*unshare ? unshare : nullptr);
}
void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    BufferCacheShard *cache = coll->cache;
    std::lock_guard l(cache->lock);
    if (coll->cache != cache) {
      dout(20) << __func__
	       << " raced with sb cache update, was " << cache
	       << ", now " << coll->cache << ", retrying"
	       << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}
// SharedBlobSet

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

template <int LogLevelV = 30>
void BlueStore::SharedBlobSet::dump(CephContext *cct)
{
  std::lock_guard l(lock);
  for (auto& i : sb_map) {
    ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
  }
}
// Blob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

void BlueStore::Blob::dump(Formatter* f) const
{
  if (is_spanning()) {
    f->dump_unsigned("spanning_id ", id);
  }
  blob.dump(f);
  if (shared_blob) {
    f->dump_object("shared", *shared_blob);
  }
}

ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
  if (b.shared_blob) {
    out << " " << *b.shared_blob;
  } else {
    out << " (shared_blob=NULL)";
  }
  out << ")";
  return out;
}
void BlueStore::Blob::discard_unallocated(Collection *coll)
{
  if (get_blob().is_shared()) {
    return;
  }
  if (get_blob().is_compressed()) {
    bool discard = false;
    bool all_invalid = true;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
	discard = true;
      } else {
	all_invalid = false;
      }
    }
    ceph_assert(discard == all_invalid); // in case of compressed blob all
					 // or none pextents are invalid.
    if (discard) {
      shared_blob->bc.discard(shared_blob->get_cache(), 0,
			      get_blob().get_logical_length());
    }
  } else {
    size_t pos = 0;
    for (auto e : get_blob().get_extents()) {
      if (!e.is_valid()) {
	dout(20) << __func__ << " 0x" << std::hex << pos
		 << "~" << e.length
		 << std::dec << dendl;
	shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
      }
      pos += e.length;
    }
    if (get_blob().can_prune_tail()) {
      dirty_blob().prune_tail();
      used_in_blob.prune_tail(get_blob().get_ondisk_length());
      dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
    }
  }
}
void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // Caller has to initialize the Blob's logical length prior to incrementing
  // references.  Otherwise one can neither determine the required number of
  // counters for per-au tracking nor obtain min_release_size for
  // single-counter mode.
  ceph_assert(get_blob().get_logical_length() != 0);
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
	     << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}
bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);
  r->clear();
  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}
bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
				     uint32_t target_blob_size,
				     uint32_t b_offset,
				     uint32_t *length0) {
  ceph_assert(min_alloc_size);
  ceph_assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently for the sake of simplicity we omit blob reuse if data is
  // unaligned with csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than current blob len
  target_blob_size = std::max(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data totally stands out of the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = std::max(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }
    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
			    get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}
void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
	   << " start " << *this << dendl;
  ceph_assert(blob.can_split());
  ceph_assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
	   << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
	   << " and " << *r << dendl;
}
#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::const_iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
	get_ref(
	  coll,
	  r.first,
	  r.second.refs * r.second.length);
      }
    }
  }
}
#endif
// Extent

void BlueStore::Extent::dump(Formatter* f) const
{
  f->dump_unsigned("logical_offset", logical_offset);
  f->dump_unsigned("length", length);
  f->dump_unsigned("blob_offset", blob_offset);
  f->dump_object("blob", *blob);
}

ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
	     << ": 0x" << e.blob_offset << "~" << e.length << std::dec
	     << " " << *e.blob;
}
// OldExtent
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
						   uint32_t lo,
						   uint32_t o,
						   uint32_t l,
						   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = b->get_referenced_bytes() == 0;
  return oe;
}
// ExtentMap

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
#undef dout_context
#define dout_context onode->c->store->cct

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}
void BlueStore::ExtentMap::dump(Formatter* f) const
{
  f->open_array_section("extents");

  for (auto& e : extent_map) {
      f->dump_object("extent", e);
  }
  f->close_section();
}
void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
  CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
  uint64_t& length, uint64_t& dstoff) {
  auto cct = onode->c->store->cct;
  bool inject_21040 =
    cct->_conf->bluestore_debug_inject_bug21040;
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto& e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }

  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << "  src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
	c->make_blob_shared(b->_assign_blobid(txc), e.blob);
	if (!inject_21040 && !src_dirty) {
	  src_dirty = true;
	  dirty_range_begin = e.logical_offset;
	} else if (inject_21040 &&
		   dirty_range_begin == 0 && dirty_range_end == 0) {
	  dirty_range_begin = e.logical_offset;
	}
	ceph_assert(e.logical_end() > 0);
	// -1 to exclude next potential shard
	dirty_range_end = e.logical_end() - 1;
      } else {
	c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
	if (p.is_valid()) {
	  e.blob->shared_blob->get_ref(p.offset, p.length);
	}
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << "    new " << *cb << dendl;
    }

    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }

    Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
			    e.blob_offset + skip_front,
			    e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
	txc->statfs_delta.compressed() +=
	  cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << "  dst " << *ne << dendl;
    ++n;
  }
  if ((!inject_21040 && src_dirty) ||
      (inject_21040 && dirty_range_end > dirty_range_begin)) {
    oldo->extent_map.dirty_range(dirty_range_begin,
				 dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
}
void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
      ceph_assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << " inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          derr << __func__ << " encode_some needs reshard" << dendl;
          ceph_assert(!force);
        }
        size_t len = bl.length();

        dout(20) << __func__ << " shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf()->bluestore_extent_map_shard_min_size) {
            ceph_assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}
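
// Sizing sketch for update() above (config values illustrative): if
// bluestore_extent_map_shard_max_size were 1200 and ..._shard_min_size
// were 150, a shard encoding to 1300 bytes requests a reshard over its
// own range, while a 100-byte middle shard asks to be merged with the
// smaller of its neighbours; endoff + 1 deliberately reaches past the
// next shard's start so the requested range covers both shards.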
bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (spanning_blob_map.count(bid) == 0)
    return bid;

  // Find next unused bid;
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  auto cct = onode->c->store->cct; // used by dout
  _dump_onode<0>(cct, *onode);
  ceph_abort_msg("no available blob id");
}
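
// Allocation sketch: spanning blob ids are normally dense, so
// rbegin()->first + 1 is usually free. Only on collision does the code
// probe pseudo-randomly, wrapping bid back to 0 on overflow, and it
// aborts (after dumping the onode) only once a full cycle returns to
// begin_bid without finding a free id.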
void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // we may need to fault in a larger interval later; we must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / std::max(1u, extents);
  dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << " new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << " new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << " new " << new_shard_info << dendl;
  dout(20) << __func__ << " old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    ceph_assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << " fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);

    bool was_too_many_blobs_check = false;
    auto too_many_blobs_threshold =
      g_conf()->bluestore_debug_too_many_blobs_threshold;
    auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
    decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;

    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        ceph_assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << " shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }

      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning. We prefer to cut the blob if we can. Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << " splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << " adding spanning " << *b << dendl;
            if (!was_too_many_blobs_check &&
                too_many_blobs_threshold &&
                spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
              was_too_many_blobs_check = true;
              for (size_t i = 0; i < dumped_onodes.size(); ++i) {
                if (dumped_onodes[i].first == onode->oid) {
                  oid_slot = &dumped_onodes[i];
                  break;
                }
                if (!oldest_slot || (oldest_slot &&
                    dumped_onodes[i].second < oldest_slot->second)) {
                  oldest_slot = &dumped_onodes[i];
                }
              }
            }
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
        }
      }
    }
    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
      (oid_slot &&
       (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
    if (do_dump) {
      dout(0) << __func__
              << " spanning blob count exceeds threshold, "
              << spanning_blob_map.size() << " spanning blobs"
              << dendl;
      _dump_onode<0>(cct, *onode);
      if (oid_slot) {
        oid_slot->second = mono_clock::now();
      } else {
        ceph_assert(oldest_slot);
        oldest_slot->first = onode->oid;
        oldest_slot->second = mono_clock::now();
      }
    }
  }

  clear_needs_reshard();
}
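
// Boundary-placement sketch for reshard() above (numbers illustrative):
// with target = 500, slop = 500 * 0.2 = 100 and extent_avg = 50, a new
// shard boundary is opened once estimate + 50 exceeds 500 -- or 600 when
// the candidate boundary would land inside a blob (would_span), since a
// boundary that cuts a blob forces a split or a spanning blob later.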
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    ceph_assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1; // so it is always non-zero
        include_blob = true;
        blobid = 0; // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}
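
// Encoding sketch (values illustrative): an extent that starts exactly at
// the previous extent's logical end, has blob_offset == 0, the same
// length as its predecessor, and references an already-encoded local
// blob packs everything into a single varint:
//   blobid = (last_encoded_id << BLOBID_SHIFT_BITS)
//          | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET
//          | BLOBID_FLAG_SAMELENGTH
// and emits no offset/blob_offset/length varints at all; decode_some()
// below reconstructs them from its running 'pos' and 'prev_len'.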
unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  ceph_assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  ceph_assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << " getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        ceph_assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  ceph_assert(n == num);
  return num;
}
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.
  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.
  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}

void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::const_iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  ceph_assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}

void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}
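
// Note on the bound estimate above: denc_varint((uint32_t)0, key_size)
// charges the worst-case varint footprint of a single spanning blob id
// into key_size, and bound_encode_spanning_blobs() then pre-adds
// size() * key_size before asking each blob for its own bound_encode()
// contribution.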
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  ceph_assert(last >= start);
  string key;
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            ceph_assert(r >= 0);
          }
        }
      );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset
               << " for range 0x" << offset << "~" << length << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      ceph_assert(p->dirty == false);
      ceph_assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}
void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  if (length == 0) {
    length = 1;
  }
  auto last = seek_shard(offset + length - 1);
  if (start < 0)
    return;

  ceph_assert(last >= start);
  while (start <= last) {
    ceph_assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      derr << __func__ << " on write 0x" << std::hex << offset
           << "~" << length << " shard 0x" << p->shard_info->offset
           << std::dec << " is not loaded, can't mark dirty" << dendl;
      ceph_abort_msg("can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}
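
// Behaviour sketch for the seek helpers above: seek_lextent(offset)
// returns the first extent whose logical_end() is strictly greater than
// offset -- the extent containing offset if one exists, otherwise the
// next extent to its right -- so has_any_lextents() only needs to check
// that the returned extent starts before offset + length.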
int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  ceph_assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      ceph_assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  return removed;
}
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        ceph_assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}
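
// Worked example (illustrative numbers): punching 0x1000~0x1000 out of a
// lone extent 0x0~0x3000 takes the "split and deref middle" branch above:
// the extent keeps its front 0x0~0x1000, an OldExtent for the middle
// 0x1000~0x1000 goes onto old_extents, and add() re-inserts the tail at
// logical offset 0x2000 with blob_offset advanced by front + length.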
BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need to have completely initialized Blob to increment its ref counters.
  ceph_assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent from putting reused blob into
  // old_extents list if we overwrite the blob totally.
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << " split " << *ep << dendl;
      dout(30) << __func__ << " to " << *ne << dendl;
    } else {
      // switch blob
      ceph_assert(ep->blob_offset >= blob_offset);
      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << " adjusted " << *ep << dendl;
    }
  }
  return rb;
}
// =======================================================
// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

// A tricky thing about Onode's ref counter is that we do an additional
// increment when a newly pinned instance is detected, and -1 on unpin.
// This prevents a conflict with a delete call (when nref == 0). The
// latter might happen while a thread is in unpin() (and e.g. waiting for
// lock acquisition), since nref has already been decremented; another
// 'putting' thread on the instance will then release it.

void BlueStore::Onode::get() {
  ++nref;
  c->get_onode_cache()->pin(this, [&]() {
    bool was_pinned = pinned;
    pinned = nref >= 2;
    // additional increment for newly pinned instance
    bool r = !was_pinned && pinned;
    if (r) {
      ++nref;
    }
    return r;
  });
}

void BlueStore::Onode::put() {
  int n = --nref;
  c->get_onode_cache()->unpin(this, [&]() {
    bool was_pinned = pinned;
    pinned = pinned && nref > 2; // intentionally use > not >= as we have
                                 // +1 due to pinned state
    bool r = was_pinned && !pinned;
    // additional decrement for newly unpinned instance
    if (r) {
      --nref;
    }
    return r;
  });
  if (n == 0) {
    delete this;
  }
}
BlueStore::Onode* BlueStore::Onode::decode(
  CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& v)
{
  Onode* on = new Onode(c.get(), oid, key);
  on->exists = true;
  auto p = v.front().begin_deep();
  on->onode.decode(p);
  for (auto& i : on->onode.attrs) {
    i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  }

  // initialize extent_map
  on->extent_map.decode_spanning_blobs(p);
  if (on->onode.extent_map_shards.empty()) {
    denc(on->extent_map.inline_bl, p);
    on->extent_map.decode_some(on->extent_map.inline_bl);
    on->extent_map.inline_bl.reassign_to_mempool(
      mempool::mempool_bluestore_cache_other);
  } else {
    on->extent_map.init_shards(false, false);
  }
  return on;
}
void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    std::unique_lock l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}

void BlueStore::Onode::dump(Formatter* f) const
{
  onode.dump(f);
  extent_map.dump(f);
}
const string& BlueStore::Onode::get_omap_prefix()
{
  if (onode.is_pgmeta_omap()) {
    return PREFIX_PGMETA_OMAP;
  }
  if (onode.is_perpool_omap()) {
    return PREFIX_PERPOOL_OMAP;
  }
  return PREFIX_OMAP;
}

void BlueStore::Onode::get_omap_header(string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('-');
}

void BlueStore::Onode::get_omap_key(const string& key, string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('.');
  out->append(key);
}

void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

void BlueStore::Onode::get_omap_tail(string *out)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    _key_encode_u64(c->pool(), out);
  }
  _key_encode_u64(onode.nid, out);
  out->push_back('~');
}

void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
{
  if (onode.is_perpool_omap() && !onode.is_pgmeta_omap()) {
    *user_key = key.substr(sizeof(uint64_t)*2 + 1);
  } else {
    *user_key = key.substr(sizeof(uint64_t) + 1);
  }
}
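
// Key layout sketch for the helpers above (per-pool omap case, sizes in
// bytes): <pool: u64> <nid: u64> '.' <user key>, which is why
// decode_omap_key() strips sizeof(uint64_t)*2 + 1 leading bytes. The
// header and tail sentinels end in '-' and '~' respectively; since
// '-' < '.' < '~' in byte order, they bracket every user key of this
// object in the key space.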
// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  ceph_assert((loffs % min_alloc_size) == 0);
  ceph_assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
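
// Worked example (illustrative numbers): with min_alloc_size = 0x1000, a
// queued write to the same blob at logical_offset 0x1800 with length0
// 0x400 rounds out to [0x1000, 0x2000); a candidate range loffs = 0x0,
// loffs_end = 0x2000 overlaps it, so has_conflict() returns true and the
// two writes must not be issued against the same pextent independently.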
// =======================================================
// DeferredBatch

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
#undef dout_context
#define dout_context cct

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  ceph_assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}
void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << " keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      ceph_assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << " keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      ceph_assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    ceph_assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << " truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << " drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    ceph_assert(i->second >= 0);
    p = iomap.erase(p);
  }
}
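
// Worked example (illustrative numbers): a queued deferred IO at
// 0x0~0x3000 hit by _discard(cct, 0x1000, 0x1000) keeps a 0x1000-byte
// head under its original offset, re-inserts a 0x1000-byte tail under
// iomap[0x2000], and debits seq_bytes for its sequence by the 0x1000
// bytes dropped in the middle.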
void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0; // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    ceph_assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  ceph_assert(sb == seq_bytes);
}
// =======================================================
// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
  : CollectionImpl(store_->cct, cid),
    store(store_),
    cache(bc),
    exists(true),
    onode_map(oc),
    commit_queue(nullptr)
{
}

bool BlueStore::Collection::flush_commit(Context *c)
{
  return osr->flush_commit(c);
}

void BlueStore::Collection::flush()
{
  osr->flush();
}

void BlueStore::Collection::flush_all_but_last()
{
  osr->flush_all_but_last();
}
void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  ceph_assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}
void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {
    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      ceph_abort_msg("uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    auto p = v.cbegin();
    decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}
void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  ceph_assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}

uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  ceph_assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create,
  bool is_createop)
{
  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = -ENOENT;
  Onode *on;
  if (!is_createop) {
    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  }
  if (v.length() == 0) {
    ceph_assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
        !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    ceph_assert(r >= 0);
    on = Onode::decode(this, oid, key, v);
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  // lock (one or both) cache shards
  std::lock(cache->lock, dest->cache->lock);
  std::lock_guard l(cache->lock, std::adopt_lock);
  std::lock_guard l2(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  ceph_assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    OnodeRef o = p->second;
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
                            << dendl;
      ++p;
    } else {
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      // ensuring that nref is always >= 2 and hence onode is pinned and
      // physically out of cache during the transition
      OnodeRef o_pin = o;
      ceph_assert(o->pinned);

      p = onode_map.onode_map.erase(p);
      dest->onode_map.onode_map[o->oid] = o;
      if (get_onode_cache() != dest->get_onode_cache()) {
        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
      }
      o->c = dest;

      // move over shared blobs and buffers. cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << " moving " << *i.second
                                    << dendl;
              dest->cache->_move(cache, i.second.get());
            }
          }
        }
      }
    }
  }
  dest->cache->_trim();
}
// =======================================================
// MempoolThread

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
#undef dout_context
#define dout_context store->cct

void *BlueStore::MempoolThread::entry()
{
  std::unique_lock l{lock};

  uint32_t prev_config_change = store->config_changed.load();
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t target = store->osd_memory_target;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;

  // When setting the maximum amount of memory to use for cache, first
  // assume some base amount of memory for the OSD and then fudge in
  // some overhead for fragmentation that scales with cache usage.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  binned_kv_cache = store->db->get_priority_cache();
  if (store->cache_autotune && binned_kv_cache != nullptr) {
    pcm = std::make_shared<PriorityCache::Manager>(
      store->cct, min, max, target, true);
    pcm->insert("kv", binned_kv_cache, true);
    pcm->insert("meta", meta_cache, true);
    pcm->insert("data", data_cache, true);
  }

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();
  utime_t next_deferred_force_submit = ceph_clock_now();
  utime_t alloc_stats_dump_clock = ceph_clock_now();

  bool interval_stats_trim = false;
  while (!stop) {
    // Update pcm cache settings if related configuration was changed
    uint32_t cur_config_change = store->config_changed.load();
    if (cur_config_change != prev_config_change) {
      _update_cache_settings();
      prev_config_change = cur_config_change;
    }

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;
    double max_defer_interval = store->max_defer_interval;

    double alloc_stats_dump_interval =
      store->cct->_conf->bluestore_alloc_stats_dump_interval;

    if (alloc_stats_dump_interval > 0 &&
        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
      store->_record_allocation_stats();
      alloc_stats_dump_clock = ceph_clock_now();
    }
    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      _adjust_cache_settings();

      // Log events at 5 instead of 20 when balance happens.
      interval_stats_trim = true;

      if (pcm != nullptr) {
        pcm->balance();
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && pcm != nullptr) {
        pcm->tune_memory();
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    if (max_defer_interval > 0 &&
        next_deferred_force_submit < ceph_clock_now()) {
      if (store->get_deferred_last_submitted() + max_defer_interval <
          ceph_clock_now()) {
        store->deferred_try_submit();
      }
      next_deferred_force_submit = ceph_clock_now();
      next_deferred_force_submit += max_defer_interval/3;
    }

    // Now Resize the shards
    _resize_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();
    auto wait = ceph::make_timespan(
      store->cct->_conf->bluestore_cache_trim_interval);
    cond.wait_for(l, wait);
  }
  // do final dump
  store->_record_allocation_stats();
  stop = false;
  pcm = nullptr;
  return NULL;
}
void BlueStore::MempoolThread::_adjust_cache_settings()
{
  if (binned_kv_cache != nullptr) {
    binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
  }
  meta_cache->set_cache_ratio(store->cache_meta_ratio);
  data_cache->set_cache_ratio(store->cache_data_ratio);
}

void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
  size_t onode_shards = store->onode_cache_shards.size();
  size_t buffer_shards = store->buffer_cache_shards.size();
  int64_t kv_used = store->db->get_cache_usage();
  int64_t meta_used = meta_cache->_get_used_bytes();
  int64_t data_used = data_cache->_get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
    static_cast<int64_t>(store->cache_kv_ratio * cache_size);
  int64_t meta_alloc =
    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
  int64_t data_alloc =
    static_cast<int64_t>(store->cache_data_ratio * cache_size);

  if (pcm != nullptr && binned_kv_cache != nullptr) {
    cache_size = pcm->get_tuned_mem();
    kv_alloc = binned_kv_cache->get_committed_size();
    meta_alloc = meta_cache->get_committed_size();
    data_alloc = data_cache->get_committed_size();
  }

  if (interval_stats) {
    dout(5) << __func__ << " cache_size: " << cache_size
            << " kv_alloc: " << kv_alloc
            << " kv_used: " << kv_used
            << " meta_alloc: " << meta_alloc
            << " meta_used: " << meta_used
            << " data_alloc: " << data_alloc
            << " data_used: " << data_used << dendl;
  } else {
    dout(20) << __func__ << " cache_size: " << cache_size
             << " kv_alloc: " << kv_alloc
             << " kv_used: " << kv_used
             << " meta_alloc: " << meta_alloc
             << " meta_used: " << meta_used
             << " data_alloc: " << data_alloc
             << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
    (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);

  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
           << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->onode_cache_shards) {
    i->set_max(max_shard_onodes);
  }
  for (auto i : store->buffer_cache_shards) {
    i->set_max(max_shard_buffer);
  }
}
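
// Worked example (illustrative numbers): with meta_alloc = 1 GiB spread
// over onode_shards = 8 and get_bytes_per_onode() ~= 4 KiB, each onode
// cache shard is capped at (1 GiB / 8) / 4 KiB = 32768 onodes;
// data_alloc is divided evenly across the buffer shards the same way.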
void BlueStore::MempoolThread::_update_cache_settings()
{
  // Nothing to do if pcm is not used.
  if (pcm == nullptr) {
    return;
  }

  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  uint64_t min = store->osd_memory_cache_min;
  uint64_t max = min;
  double fragmentation = store->osd_memory_expected_fragmentation;

  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  // set pcm cache levels
  pcm->set_target_memory(target);
  pcm->set_min_memory(min);
  pcm->set_max_memory(max);

  dout(5) << __func__ << " updated pcm target: " << target
          << " pcm min: " << min
          << " pcm max: " << max
          << dendl;
}
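
// Worked example (illustrative numbers): with osd_memory_target = 4 GiB,
// osd_memory_expected_fragmentation = 0.15, osd_memory_base = 768 MiB and
// osd_memory_cache_min = 128 MiB: ltarget = 0.85 * 4096 MiB ~= 3482 MiB,
// which exceeds base + min (896 MiB), so max = 3482 - 768 ~= 2714 MiB is
// handed to the PriorityCache manager.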
// =======================================================
// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  std::shared_lock l(c->lock);
  if (o->onode.has_omap()) {
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
  }
}

string BlueStore::OmapIteratorImpl::_stringify() const
{
  stringstream s;
  s << " omap_iterator(cid = " << c->cid
    << ", oid = " << o->oid << ")";
  return s.str();
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_seek_to_first_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_upper_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", after = " + after +
        _stringify();
    }
  );
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    string key;
    o->get_omap_key(to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  c->store->log_latency_fn(
    __func__,
    l_bluestore_omap_lower_bound_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age,
    [&] (const ceph::timespan& lat) {
      return ", to = " + to +
        _stringify();
    }
  );
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  std::shared_lock l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second < tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next()
{
  int r = -1;
  std::shared_lock l(c->lock);
  auto start1 = mono_clock::now();
  if (o->onode.has_omap()) {
    it->next();
    r = 0;
  }
  c->store->log_latency(
    __func__,
    l_bluestore_omap_next_lat,
    mono_clock::now() - start1,
    c->store->cct->_conf->bluestore_log_omap_iterator_age);

  return r;
}

string BlueStore::OmapIteratorImpl::key()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  o->decode_omap_key(db_key, &user_key);

  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  std::shared_lock l(c->lock);
  ceph_assert(it->valid());
  return it->value();
}
// =====================================
// BlueStore

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "
#undef dout_context
#define dout_context cct

static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

static void discard_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  store->handle_discard(*tmp);
}

void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << dendl;
  ceph_assert(alloc);
  alloc->release(to_release);
}
BlueStore::BlueStore(CephContext *cct, const string& path)
  : BlueStore(cct, path, 0) {}

BlueStore::BlueStore(CephContext *cct,
  const string& path,
  uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle(cct),
    finisher(cct, "commit_finisher", "cfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf.add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  cct->_conf.remove_observer(this);
  _shutdown_logger();
  ceph_assert(!mounted);
  ceph_assert(db == NULL);
  ceph_assert(bluefs == NULL);
  ceph_assert(fsid_fd < 0);
  ceph_assert(path_fd < 0);
  for (auto i : onode_cache_shards) {
    delete i;
  }
  for (auto i : buffer_cache_shards) {
    delete i;
  }
  onode_cache_shards.clear();
  buffer_cache_shards.clear();
}
const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    "osd_memory_target",
    "osd_memory_target_cgroup_limit_ratio",
    "osd_memory_base",
    "osd_memory_cache_min",
    "osd_memory_expected_fragmentation",
    "bluestore_cache_autotune",
    "bluestore_cache_autotune_interval",
    "bluestore_warn_on_legacy_statfs",
    "bluestore_warn_on_no_per_pool_omap",
    "bluestore_max_defer_interval",
    NULL
  };
  return KEYS;
}
void BlueStore::handle_conf_change(const ConfigProxy& conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_warn_on_legacy_statfs")) {
    _check_legacy_statfs_alert();
  }
  if (changed.count("bluestore_warn_on_no_per_pool_omap")) {
    _check_no_per_pool_omap_alert();
  }

  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes") ||
      changed.count("bluestore_throttle_deferred_bytes") ||
      changed.count("bluestore_throttle_trace_rate")) {
    throttle.reset_throttle(conf);
  }
  if (changed.count("bluestore_max_defer_interval")) {
    if (bdev) {
      _set_max_defer_interval();
    }
  }
  if (changed.count("osd_memory_target") ||
      changed.count("osd_memory_base") ||
      changed.count("osd_memory_cache_min") ||
      changed.count("osd_memory_expected_fragmentation")) {
    _update_osd_memory_options();
  }
}
void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    _clear_compression_alert();
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
    string s("unknown mode: ");
    s += cct->_conf->bluestore_compression_mode;
    _set_compression_alert(true, s.c_str());
  }

  compressor = nullptr;

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    if (_use_rotational_settings()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    if (_use_rotational_settings()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str()
           << " compressor" << dendl;
      _set_compression_alert(false, alg_name.c_str());
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << " min_blob " << comp_min_blob_size
           << " max_blob " << comp_max_blob_size
           << dendl;
}
4411 void BlueStore::_set_csum()
4413 csum_type
= Checksummer::CSUM_NONE
;
4414 int t
= Checksummer::get_csum_string_type(cct
->_conf
->bluestore_csum_type
);
4415 if (t
> Checksummer::CSUM_NONE
)
4418 dout(10) << __func__
<< " csum_type "
4419 << Checksummer::get_csum_type_string(csum_type
)
4423 void BlueStore::_set_throttle_params()
4425 if (cct
->_conf
->bluestore_throttle_cost_per_io
) {
4426 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io
;
4429 if (_use_rotational_settings()) {
4430 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io_hdd
;
4432 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io_ssd
;
4436 dout(10) << __func__
<< " throttle_cost_per_io " << throttle_cost_per_io
4439 void BlueStore::_set_blob_size()
4441 if (cct
->_conf
->bluestore_max_blob_size
) {
4442 max_blob_size
= cct
->_conf
->bluestore_max_blob_size
;
4445 if (_use_rotational_settings()) {
4446 max_blob_size
= cct
->_conf
->bluestore_max_blob_size_hdd
;
4448 max_blob_size
= cct
->_conf
->bluestore_max_blob_size_ssd
;
4451 dout(10) << __func__
<< " max_blob_size 0x" << std::hex
<< max_blob_size
4452 << std::dec
<< dendl
;
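
// refresh the osd_memory_* knobs; the values are snapshotted here and
// (presumably) picked up by the cache autotuning machinery on its next
// resize interval rather than taking effect immediately.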
void BlueStore::_update_osd_memory_options()
{
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");

  dout(10) << __func__
           << " osd_memory_target " << osd_memory_target
           << " osd_memory_base " << osd_memory_base
           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
           << " osd_memory_cache_min " << osd_memory_cache_min
           << dendl;
}

int BlueStore::_set_cache_sizes()
{
  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
  cache_autotune_interval =
    cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
    cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
    cct->_conf.get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (_use_rotational_settings()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio =
    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}
int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  ceph_assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}
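
// perf counters are built once at startup via PerfCountersBuilder and
// registered with the context-wide collection; _shutdown_logger() is the
// matching teardown.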
void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
                 "Average kv_thread flush latency",
                 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
                 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
                 "Average kv_sync thread latency",
                 "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
                 "Average kv_finalize thread latency",
                 "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
                 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
                 "Average aio_wait state latency",
                 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
                 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
                 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
                 "Average kv_commiting state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
                 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
                 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
                 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
                 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
                 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
                 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
                 "Average submit throttle latency",
                 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
                 "Average submit latency",
                 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
                 "Average commit latency",
                 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
                 "Average read latency",
                 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
                 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
                 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
                 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
                 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
                 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
                    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
                    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
                    "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
                    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
                    "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
                    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
            "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
            "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
            "Sum for stored compressed bytes",
            "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
            "Sum for bytes allocated for compressed data",
            "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
            "Sum for original bytes that were compressed",
            "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
            "Number of onodes in cache");
  b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
            "Number of pinned onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
                    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
                    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
                    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
                    "bluestore_onode_shard_misses",
                    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
            "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
            "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
            "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
            "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
                    "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
                    "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
                    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
                    "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
                    "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluestore_write_small_unused,
                    "bluestore_write_small_unused",
                    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_deferred,
                    "bluestore_write_small_deferred",
                    "Small overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
                    "bluestore_write_small_pre_read",
                    "Small writes that required we read some data (possibly "
                    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
                    "Small write into new (sparse) blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
                    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
                    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
                    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
                    "Sum for extents that have been merged due to garbage "
                    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
                    "Read EIO errors propagated to high level callers");
  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
                    "Read operations that required at least one retry due to failed checksum validation");
  b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
                 "Average omap iterator seek_to_first call latency");
  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
                 "Average omap iterator upper_bound call latency");
  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
                 "Average omap iterator lower_bound call latency");
  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
                 "Average omap iterator next call latency");
  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
                 "Average collection listing latency");
  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;
  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.data_stored);
    logger->set(l_bluestore_compressed, store_statfs.data_compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
  logger = nullptr;
}

int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}

int BlueStore::_open_path()
{
  // sanity check(s)
  ceph_assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}
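
// the device label occupies the first BDEV_LABEL_BLOCK_SIZE (4k) bytes:
// the encoded bluestore_bdev_label_t, a crc32c over those bytes, then zero
// padding out to the full block.  _read_bdev_label() verifies the crc
// before trusting any field.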
int BlueStore::_write_bdev_label(CephContext *cct,
                                 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
    goto out;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
out:
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}

int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  auto p = bl.cbegin();
  try {
    decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}

int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
               << " and fsid " << fsid << " check bypassed" << dendl;
    } else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
           << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}
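
// allocation/deferred-write knobs.  prefer_deferred_size is the threshold
// below which writes take the deferred (journaled) path instead of being
// written directly; deferred_batch_ops controls how many deferred IOs are
// batched per submit.  both fall back to hdd/ssd variants as above.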
void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    ceph_assert(bdev);
    if (_use_rotational_settings()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
           << std::dec << " order " << (int)min_alloc_size_order
           << " max_alloc_size 0x" << std::hex << max_alloc_size
           << " prefer_deferred_size 0x" << prefer_deferred_size
           << std::dec
           << " deferred_batch_ops " << deferred_batch_ops
           << dendl;
}

int BlueStore::_open_bdev(bool create)
{
  ceph_assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (create && cct->_conf->bdev_enable_discard) {
    bdev->discard(0, bdev->get_size());
  }

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  ceph_assert(block_size == 1u << block_size_order);
  _set_max_defer_interval();
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0)
    goto fail_close;
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}

void BlueStore::_validate_bdev()
{
  ceph_assert(bdev);
  ceph_assert(min_alloc_size); // _get_ondisk_reserved depends on that
  uint64_t dev_size = bdev->get_size();
  if (dev_size <
      _get_ondisk_reserved() + cct->_conf->bluestore_bluefs_min) {
    dout(1) << __func__ << " main device size " << byte_u_t(dev_size)
            << " is too small, disable bluestore_bluefs_min for now"
            << dendl;
    ceph_assert(dev_size >= _get_ondisk_reserved());

    int r = cct->_conf.set_val("bluestore_bluefs_min", "0");
    ceph_assert(r == 0);
  }
}

void BlueStore::_close_bdev()
{
  ceph_assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}
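
// the freelist manager persists the free/allocated state of the main
// device under PREFIX_ALLOC in the kv store.  in create mode (t != null)
// it is initialized from scratch: the on-disk reserved region is marked
// allocated, and bluestore_debug_prefill can pre-fragment the freespace
// for testing.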
int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
  int r;
  bluestore_bdev_label_t label;

  ceph_assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
  ceph_assert(fm);
  if (t) {
    // create mode. initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    ceph_assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
    fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);

    // allocate superblock reserved space. note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs_extents.
    auto reserved = _get_ondisk_reserved();
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_bluefs) {
      ceph_assert(bluefs_extents.num_intervals() == 1);
      interval_set<uint64_t>::iterator p = bluefs_extents.begin();
      reserved = round_up_to(p.get_start() + p.get_len(), min_alloc_size);
      dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
               << " for bluefs" << dendl;
    }

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
              << cct->_conf->bluestore_debug_prefill << " with max free extent "
              << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = p2roundup(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
        uint64_t l = (rand() % max_b + 1) * min_alloc_size;
        if (start + l > end) {
          l = end - start;
          l = p2align(l, min_alloc_size);
        }
        ceph_assert(start + l <= end);

        uint64_t u = 1 + (uint64_t)(r * (double)l);
        u = p2roundup(u, min_alloc_size);
        if (start + l + u > end) {
          u = end - (start + l);
          // trim to align so we don't overflow again
          u = p2align(u, min_alloc_size);
          stop = true;
        }
        ceph_assert(start + l + u <= end);

        dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
                 << " use 0x" << u << std::dec << dendl;

        if (u == 0) {
          // break if u has been trimmed to nothing
          break;
        }

        fm->allocate(start + l, u, t);
        start += l + u;
      }
    }
    r = _write_out_fm_meta(0, false, &label);
    ceph_assert(r == 0);
  } else {
    string p = path + "/block";
    r = _read_bdev_label(cct, p, &label);
    if (r < 0) {
      derr << __func__ << " freelist init failed, error reading bdev label: " << cpp_strerror(r) << dendl;
      delete fm;
      fm = NULL;
      return r;
    }
  }
  r = fm->init(label, db, read_only);
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  // if the space size tracked by the freelist manager is higher than the
  // actual device size, one can hit an out-of-space allocation which will
  // result in data loss and/or assertions.
  // Probably the user altered the device size somehow.
  // The only fix for now is to redeploy the OSD.
  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
    ostringstream ss;
    ss << "slow device size mismatch detected, "
       << " fm size(" << fm->get_size()
       << ") > slow device size(" << bdev->get_size()
       << "), Please stop using this OSD as it might cause data loss.";
    _set_disk_size_mismatch_alert(ss.str());
  }
  return 0;
}

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  ceph_assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}

int BlueStore::_write_out_fm_meta(uint64_t target_size,
                                  bool update_root_size,
                                  bluestore_bdev_label_t* res_label)
{
  string p = path + "/block";

  std::vector<std::pair<string, string>> fm_meta;
  fm->get_meta(target_size, &fm_meta);

  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0)
    return r;

  for (auto& m : fm_meta) {
    label.meta[m.first] = m.second;
  }
  if (update_root_size) {
    label.size = target_size;
  }
  r = _write_bdev_label(cct, p, label);
  if (res_label) {
    *res_label = label;
  }
  return r;
}
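
// the in-memory allocator is rebuilt on every open by replaying the
// freelist: each free extent is added, then any space owned by bluefs is
// removed again, since bluefs tracks its own allocations separately.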
int BlueStore::_open_alloc()
{
  ceph_assert(alloc == NULL);
  ceph_assert(bdev->get_size());

  if (bluefs) {
    bluefs_extents.clear();
    auto r = bluefs->get_block_extents(bluefs_layout.shared_bdev,
                                       &bluefs_extents);
    if (r < 0) {
      lderr(cct) << __func__ << " failed to retrieve bluefs_extents: "
                 << cpp_strerror(r) << dendl;
      return r;
    }
    dout(10) << __func__ << " bluefs extents 0x"
             << std::hex << bluefs_extents << std::dec
             << dendl;
  }

  alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
                            bdev->get_size(),
                            min_alloc_size, "block");
  if (!alloc) {
    lderr(cct) << __func__ << " Allocator::unknown alloc type "
               << cct->_conf->bluestore_allocator
               << dendl;
    return -EINVAL;
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(db, &offset, &length)) {
    alloc->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();

  // also mark bluefs space as allocated
  for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
    alloc->init_rm_free(e.get_start(), e.get_len());
  }

  dout(1) << __func__ << " loaded " << byte_u_t(bytes)
          << " in " << num << " extents"
          << " available " << byte_u_t(alloc->get_free())
          << dendl;
  return 0;
}

void BlueStore::_close_alloc()
{
  ceph_assert(bdev);
  bdev->discard_drain();

  ceph_assert(alloc);
  alloc->shutdown();
  delete alloc;
  alloc = NULL;
  bluefs_extents.clear();
}

int BlueStore::_open_fsid(bool create)
{
  ceph_assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}
int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}

int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}

int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
         << " (is another ceph-osd still running?)"
         << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
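
// media-type probes.  note that is_rotational() may be called before
// mount(); in that case it briefly opens path/fsid/bdev just to query the
// block device, then tears everything down again.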
bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
            << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}

bool BlueStore::_use_rotational_settings()
{
  if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
    return true;
  }
  if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
    return false;
  }
  return bdev->is_rotational();
}

bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist).  only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}
int BlueStore::_minimal_open_bluefs(bool create)
{
  int r;
  bluefs = new BlueFS(cct);

  string bfn;
  struct stat st;

  bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(
      BlueFS::BDEV_DB, bfn,
      create && cct->_conf->bdev_enable_discard);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
      r = _check_or_set_bdev_label(
        bfn,
        bluefs->get_block_device_size(BlueFS::BDEV_DB),
        "bluefs db", create);
      if (r < 0) {
        derr << __func__
             << " check block device(" << bfn << ") label returned: "
             << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }
    if (create) {
      bluefs->add_block_extent(
        BlueFS::BDEV_DB,
        SUPER_RESERVED,
        bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
    }
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  } else {
    r = -errno;
    if (::lstat(bfn.c_str(), &st) == -1) {
      r = 0;
      bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    } else {
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }

  // shared device
  bfn = path + "/block";
  // never trim here
  r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
                               true /* shared with bluestore */);
  if (r < 0) {
    derr << __func__ << " add block device(" << bfn << ") returned: "
         << cpp_strerror(r) << dendl;
    goto free_bluefs;
  }
  if (create) {
    // note: we always leave the first SUPER_RESERVED (8k) of the device unused
    uint64_t initial =
      bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
                          cct->_conf->bluestore_bluefs_gift_ratio);
    initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
    uint64_t alloc_size = cct->_conf->bluefs_shared_alloc_size;
    if (alloc_size % min_alloc_size) {
      derr << __func__ << " bluefs_shared_alloc_size 0x" << std::hex
           << alloc_size << " is not a multiple of "
           << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
      r = -EINVAL;
      goto free_bluefs;
    }
    // align to bluefs's alloc_size
    initial = p2roundup(initial, alloc_size);
    // put bluefs in the middle of the device in case it is an HDD
    uint64_t start = p2align((bdev->get_size() - initial) / 2, alloc_size);
    //avoiding superblock overwrite
    start = std::max(alloc_size, start);
    ceph_assert(start >=_get_ondisk_reserved());

    bluefs->add_block_extent(bluefs_layout.shared_bdev, start, initial);
    bluefs_extents.insert(start, initial);
    ++out_of_sync_fm;
  }

  bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0) {
    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
                                 create && cct->_conf->bdev_enable_discard);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }

    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
      r = _check_or_set_bdev_label(
        bfn,
        bluefs->get_block_device_size(BlueFS::BDEV_WAL),
        "bluefs wal", create);
      if (r < 0) {
        derr << __func__ << " check block device(" << bfn
             << ") label returned: " << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }
    }

    if (create) {
      bluefs->add_block_extent(
        BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
        bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
        BDEV_LABEL_BLOCK_SIZE);
    }
    bluefs_layout.dedicated_wal = true;
  } else {
    r = 0;
    if (::lstat(bfn.c_str(), &st) != -1) {
      r = -errno;
      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
  }
  return 0;

free_bluefs:
  ceph_assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}

int BlueStore::_open_bluefs(bool create)
{
  int r = _minimal_open_bluefs(create);
  if (r < 0) {
    return r;
  }
  RocksDBBlueFSVolumeSelector* vselector = nullptr;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {

    string options = cct->_conf->bluestore_rocksdb_options;

    rocksdb::Options rocks_opts;
    int r = RocksDBStore::ParseOptionsFromStringStatic(
      cct,
      options,
      rocks_opts,
      nullptr);
    if (r < 0) {
      return r;
    }

    double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
    vselector =
      new RocksDBBlueFSVolumeSelector(
        bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
        bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
        bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
        1024 * 1024 * 1024, //FIXME: set expected l0 size here
        rocks_opts.max_bytes_for_level_base,
        rocks_opts.max_bytes_for_level_multiplier,
        reserved_factor,
        cct->_conf->bluestore_volume_selection_reserved,
        cct->_conf->bluestore_volume_selection_policy != "rocksdb_original");
  }
  if (create) {
    bluefs->mkfs(fsid, bluefs_layout);
  }
  bluefs->set_volume_selector(vselector);
  r = bluefs->mount();
  if (r < 0) {
    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
  }
  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
  return r;
}

void BlueStore::_close_bluefs(bool cold_close)
{
  bluefs->umount(cold_close);
  _minimal_close_bluefs();
}

void BlueStore::_minimal_close_bluefs()
{
  delete bluefs;
  bluefs = NULL;
}
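
// db open sequencing: _open_db_and_around() first opens the db read-only
// so the freelist and allocator can be initialized (bluefs procedures may
// need them), then reopens read-write if requested.  _is_bluefs() decides
// based on the 'bluefs' meta key (or the config option at create time).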
int BlueStore::_is_bluefs(bool create, bool* ret)
{
  if (create) {
    *ret = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    int r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      *ret = true;
    } else if (s == "0") {
      *ret = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
           << dendl;
      return -EIO;
    }
  }
  return 0;
}

/*
 * opens both DB and dependent super_meta, FreelistManager and allocator
 * in the proper order
 */
int BlueStore::_open_db_and_around(bool read_only)
{
  int r;
  bool do_bluefs = false;
  _is_bluefs(false, &do_bluefs); // ignore err code
  if (do_bluefs) {
    // open in read-only first to read FM list and init allocator
    // as they might be needed for some BlueFS procedures
    r = _open_db(false, false, true);
    if (r < 0)
      return r;

    r = _open_super_meta();
    if (r < 0)
      goto out_db;

    r = _open_fm(nullptr, true);
    if (r < 0)
      goto out_db;

    r = _open_alloc();
    if (r < 0)
      goto out_fm;

    // now open in R/W mode
    if (!read_only) {
      _close_db(true);

      r = _open_db(false, false, false);
      if (r < 0) {
        _close_alloc();
        _close_fm();
        return r;
      }
    }
  } else {
    r = _open_db(false, false);
    if (r < 0)
      return r;

    r = _open_super_meta();
    if (r < 0)
      goto out_db;

    r = _open_fm(nullptr, false);
    if (r < 0)
      goto out_db;

    r = _open_alloc();
    if (r < 0)
      goto out_fm;
  }
  return 0;

 out_fm:
  _close_fm();
 out_db:
  _close_db(read_only);
  return r;
}

void BlueStore::_close_db_and_around(bool read_only)
{
  if (bluefs) {
    if (!read_only && out_of_sync_fm.fetch_and(0)) {
      _sync_bluefs_and_fm();
    }
    _close_db(read_only);
    while (!read_only && out_of_sync_fm.fetch_and(0)) {
      // if seen some allocations during close - repeat open_db, sync fm, close
      dout(0) << __func__ << " syncing FreelistManager" << dendl;
      int r = _open_db(false, false, false);
      if (r < 0) {
        derr << __func__
             << " unable to open db, FreelistManager is probably out of sync"
             << dendl;
        break;
      }
      _sync_bluefs_and_fm();
      _close_db(false);
    }
    _close_alloc();
    _close_fm();
  } else {
    _close_alloc();
    _close_fm();
    _close_db(read_only);
  }
}

// updates legacy bluefs related recs in DB to a state valid for
// downgrades from nautilus.
void BlueStore::_sync_bluefs_and_fm()
{
  if (cct->_conf->bluestore_bluefs_db_compatibility) {
    bufferlist bl;
    encode(bluefs_extents, bl);
    dout(20) << __func__ << " bluefs_extents at KV is now 0x"
             << std::hex << bluefs_extents << std::dec
             << dendl;
    KeyValueDB::Transaction synct = db->get_transaction();
    synct->set(PREFIX_SUPER, "bluefs_extents", bl);
    synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);

    // Nice thing is that we don't need to update FreelistManager here.
    // It always has corresponding bits set to 'Free' for both Nautilus+ and
    // pre-Nautilus releases.
    // So once we get an extent to bluefs_extents this means it's
    // been free in allocator and hence it's free in FM too.

    db->submit_transaction_sync(synct);
  }
}
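
// _open_db: creates/opens the kv store (rocksdb), optionally on top of
// bluefs via BlueRocksEnv.  note that "separate_wal_dir" is forced for new
// deployments and dropped again when an existing store has no .wal
// directory, and db_paths is only populated when metadata may spill from
// block.db onto the slow device.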
int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
{
  int r;
  ceph_assert(!db);
  ceph_assert(!(create && read_only));
  string fn = path + "/db";
  string options;
  stringstream err;
  std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  string kv_backend;
  std::vector<KeyValueDB::ColumnFamily> cfs;

  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  r = _is_bluefs(create, &do_bluefs);
  if (r < 0) {
    return r;
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;

  map<string,string> kv_options;
  // force separate wal dir for all new deployments.
  kv_options["separate_wal_dir"] = 1;
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }

    r = _open_bluefs(create);
    if (r < 0) {
      return r;
    }

    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env* a = new BlueRocksEnv(bluefs);
      rocksdb::Env* b = rocksdb::Env::Default();
      if (create) {
        string cmd = "rm -rf " + path + "/db " +
          path + "/db.slow " +
          path + "/db.wal";
        int r = system(cmd.c_str());
        (void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }
    bluefs->set_slow_device_expander(this);
    BlueFSVolumeSelector::paths paths;
    bluefs->get_vselector_paths(fn, paths);

    if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
      // we have both block.db and block; tell rocksdb!
      // note: the second (last) size value doesn't really matter
      ostringstream db_paths;
      bool first = true;
      for (auto& p : paths) {
        if (!first) {
          db_paths << " ";
        }
        first = false;
        db_paths << p.first << "," << p.second;
      }
      kv_options["db_paths"] = db_paths.str();
      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
    }

    if (create) {
      for (auto& p : paths) {
        env->CreateDir(p.first);
      }
      // Selectors don't provide wal path so far hence create explicitly
      env->CreateDir(fn + ".wal");
    } else {
      std::vector<std::string> res;
      // check for dir presence
      auto r = env->GetChildren(fn + ".wal", &res);
      if (r.IsNotFound()) {
        kv_options.erase("separate_wal_dir");
      }
    }
  } else {
    string walfn = path + "/db.wal";

    if (create) {
      int r = ::mkdir(fn.c_str(), 0755);
      if (r < 0)
        r = -errno;
      if (r < 0 && r != -EEXIST) {
        derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
             << dendl;
        return r;
      }

      // wal_dir, too!
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
        r = -errno;
      if (r < 0 && r != -EEXIST) {
        derr << __func__ << " failed to create " << walfn
             << ": " << cpp_strerror(r)
             << dendl;
        return r;
      }
    } else {
      struct stat st;
      r = ::stat(walfn.c_str(), &st);
      if (r < 0 && errno == ENOENT) {
        kv_options.erase("separate_wal_dir");
      }
    }
  }

  db = KeyValueDB::create(cct,
                          kv_backend,
                          fn,
                          kv_options,
                          static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      _close_bluefs(read_only);
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);

  if (kv_backend == "rocksdb") {
    options = cct->_conf->bluestore_rocksdb_options;

    map<string,string> cf_map;
    cct->_conf.with_val<string>("bluestore_rocksdb_cfs",
                                get_str_map,
                                &cf_map,
                                " \t");
    for (auto& i : cf_map) {
      dout(10) << "column family " << i.first << ": " << i.second << dendl;
      cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));
    }
  }

  db->init(options);
  if (to_repair_db)
    return 0;
  if (create) {
    if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
      r = db->create_and_open(err, cfs);
    } else {
      r = db->create_and_open(err);
    }
  } else {
    // we pass in cf list here, but it is only used if the db already has
    // column families created.
    r = read_only ?
      db->open_read_only(err, cfs) :
      db->open(err, cfs);
  }
  if (r) {
    derr << __func__ << " error opening db: " << err.str() << dendl;
    _close_db(read_only);
    return -EIO;
  }

  dout(1) << __func__ << " opened " << kv_backend
          << " path " << fn << " options " << options << dendl;
  return 0;
}

void BlueStore::_close_db(bool cold_close)
{
  ceph_assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    _close_bluefs(cold_close);
  }
}

void BlueStore::_dump_alloc_on_failure()
{
  auto dump_interval =
    cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
  if (dump_interval > 0 &&
      next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
    alloc->dump();
    next_dump_on_bluefs_alloc_failure = ceph_clock_now();
    next_dump_on_bluefs_alloc_failure += dump_interval;
  }
}
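
// gifting path: carve extents out of the bluestore allocator and hand
// them to bluefs.  allocation proceeds in <=1GiB chunks (hard cap so a
// single request fits in 32 bits) until 'size' is satisfied or the
// allocator runs dry; failing to reach min_size releases everything and
// returns -ENOSPC.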
int BlueStore::allocate_bluefs_freespace(
  uint64_t min_size,
  uint64_t size,
  PExtentVector* extents_out)
{
  ceph_assert(min_size <= size);
  if (size) {
    // round up to alloc size
    uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
    min_size = p2roundup(min_size, alloc_size);
    size = p2roundup(size, alloc_size);

    PExtentVector extents_local;
    PExtentVector* extents = extents_out ? extents_out : &extents_local;

    uint64_t gift;
    uint64_t allocated = 0;
    int64_t alloc_len;
    auto need = size;
    auto extent_count0 = extents->size();
    do {
      // hard cap to fit into 32 bits
      gift = std::min<uint64_t>(size, 1ull << 30);
      dout(10) << __func__ << " gifting " << gift
               << " (" << byte_u_t(gift) << ")" << dendl;

      alloc_len = alloc->allocate(gift, alloc_size, 0, 0, extents);
      if (alloc_len > 0) {
        allocated += alloc_len;
        size -= alloc_len;
      }

      if (alloc_len < 0 ||
          (alloc_len < (int64_t)gift && (min_size > allocated))) {
        derr << __func__
             << " failed to allocate on 0x" << std::hex << gift
             << " min_size 0x" << min_size
             << " > allocated total 0x" << allocated
             << " bluefs_shared_alloc_size 0x" << alloc_size
             << " allocated 0x" << (alloc_len < 0 ? 0 : alloc_len)
             << " available 0x " << alloc->get_free()
             << std::dec << dendl;

        _dump_alloc_on_failure();
        alloc->release(*extents);
        extents->clear();
        return -ENOSPC;
      }
    } while (size && alloc_len > 0);
    _collect_allocation_stats(need, alloc_size, extents->size() - extent_count0);

    for (auto& e : *extents) {
      dout(5) << __func__ << " gifting " << e << " to bluefs" << dendl;
      bluefs_extents.insert(e.offset, e.length);
      ++out_of_sync_fm;
      // apply to bluefs if not requested from outside
      if (!extents_out) {
        bluefs->add_block_extent(bluefs_layout.shared_bdev, e.offset, e.length);
      }
    }
  }
  return 0;
}

uint64_t BlueStore::available_freespace(uint64_t alloc_size)
{
  uint64_t total = 0;
  auto iterated_allocation = [&](uint64_t off, uint64_t len) {
    //only count in size that is alloc_size aligned
    uint64_t dist_to_alignment;
    uint64_t offset_in_block = off & (alloc_size - 1);
    if (offset_in_block == 0)
      dist_to_alignment = 0;
    else
      dist_to_alignment = alloc_size - offset_in_block;
    if (dist_to_alignment >= len)
      return;
    len -= dist_to_alignment;
    total += p2align(len, alloc_size);
  };
  alloc->dump(iterated_allocation);
  return total;
}
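
// balance heuristic: a positive return is bytes to gift to bluefs, a
// negative one is bytes to reclaim, driven by bluefs' share of the total
// free space.  rough worked example (illustrative numbers, not defaults):
// with 100G total free of which 1G sits in bluefs, bluefs_ratio = 0.01;
// if bluestore_bluefs_min_ratio is 0.02 and gift_ratio is 0.02, we'd gift
// about 0.02 * 100G = 2G (capped at half of bluestore's own free space).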
int64_t BlueStore::_get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total)
{
  float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;

  uint64_t my_free = alloc->get_free();
  uint64_t total = bdev->get_size();
  float my_free_ratio = (float)my_free / (float)total;

  uint64_t total_free = bluefs_free + my_free;

  float bluefs_ratio = (float)bluefs_free / (float)total_free;

  dout(10) << __func__
           << " bluefs " << byte_u_t(bluefs_free)
           << " free (" << bluefs_free_ratio
           << ") bluestore " << byte_u_t(my_free)
           << " free (" << my_free_ratio
           << "), bluefs_ratio " << bluefs_ratio
           << dendl;

  uint64_t gift = 0;
  uint64_t reclaim = 0;
  if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
    gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
    if (gift >= my_free)
      gift = my_free / 2;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
             << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
             << ", should gift " << byte_u_t(gift) << dendl;
  } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
    reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
    if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
      reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
    if (reclaim >= bluefs_free)
      reclaim = bluefs_free / 2;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
             << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
             << ", should reclaim " << byte_u_t(reclaim) << dendl;
  }

  // don't take over too much of the freespace
  uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
  if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
      cct->_conf->bluestore_bluefs_min < free_cap) {
    uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
    dout(10) << __func__ << " bluefs_total " << bluefs_total
             << " < min " << cct->_conf->bluestore_bluefs_min
             << ", should gift " << byte_u_t(g) << dendl;
    if (g > gift) {
      gift = g;
      reclaim = 0;
    }
  }
  uint64_t min_free =
    cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
  if (bluefs_free < min_free &&
      min_free < free_cap) {
    uint64_t g = min_free - bluefs_free;
    dout(10) << __func__ << " bluefs_free " << bluefs_free
             << " < min " << min_free
             << ", should gift " << byte_u_t(g) << dendl;
    if (g > gift) {
      gift = g;
      reclaim = 0;
    }
  }
  uint64_t max_free =
    cct->_conf.get_val<Option::size_t>("bluestore_bluefs_max_free");
  if (bluefs_free > max_free) {
    dout(10) << __func__ << " bluefs_free " << bluefs_free
             << " > max " << max_free
             << ", stop gifting for now" << dendl;
    gift = 0;
  }
  ceph_assert((int64_t)gift >= 0);
  ceph_assert((int64_t)reclaim >= 0);
  return gift > 0 ? (int64_t)gift : -(int64_t)reclaim;
}

int BlueStore::_balance_bluefs_freespace()
{
  int ret = 0;
  ceph_assert(bluefs);

  vector<pair<uint64_t,uint64_t>> bluefs_usage;  // <free, total> ...
  bluefs->get_usage(&bluefs_usage);
  ceph_assert(bluefs_usage.size() > bluefs_layout.shared_bdev);

  bool clear_alert = true;
  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
    auto& p = bluefs_usage[bluefs_layout.shared_bdev];
    if (p.first != p.second) {
      auto& db = bluefs_usage[BlueFS::BDEV_DB];
      ostringstream ss;
      ss << "spilled over " << byte_u_t(p.second - p.first)
         << " metadata from 'db' device (" << byte_u_t(db.second - db.first)
         << " used of " << byte_u_t(db.second) << ") to slow device";
      _set_spillover_alert(ss.str());
      clear_alert = false;
    }
  }
  if (clear_alert) {
    _clear_spillover_alert();
  }

  // fixme: look at primary bdev only for now
  int64_t delta = _get_bluefs_size_delta(
    bluefs_usage[bluefs_layout.shared_bdev].first,
    bluefs_usage[bluefs_layout.shared_bdev].second);

  // reclaim from bluefs?
  if (delta < 0) {
    // round up to alloc size
    uint64_t alloc_size = bluefs->get_alloc_size(bluefs_layout.shared_bdev);
    auto reclaim = p2roundup(uint64_t(-delta), alloc_size);

    // hard cap to fit into 32 bits
    reclaim = std::min<uint64_t>(reclaim, 1ull << 30);
    dout(10) << __func__ << " reclaiming " << reclaim
             << " (" << byte_u_t(reclaim) << ")" << dendl;

    while (reclaim > 0) {
      // NOTE: this will block and do IO.
      PExtentVector extents;
      int r = bluefs->reclaim_blocks(bluefs_layout.shared_bdev, reclaim,
                                     &extents);
      if (r < 0) {
        derr << __func__ << " failed to reclaim space from bluefs"
             << dendl;
        break;
      }
      for (auto e : extents) {
        ++out_of_sync_fm;
        bluefs_extents.erase(e.offset, e.length);
        bluefs_extents_reclaiming.insert(e.offset, e.length);
        reclaim -= e.length;
      }
    }
    ret = 1;
  }

  return ret;
}
int BlueStore::_open_collections()
{
  dout(10) << __func__ << dendl;
  collections_had_errors = false;
  ceph_assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      auto c = ceph::make_ref<Collection>(
          this,
          onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
          buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
          cid);
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      try {
        decode(c->cnode, p);
      } catch (buffer::error& e) {
        derr << __func__ << " failed to decode cnode, key:"
             << pretty_binary_string(it->key()) << dendl;
        return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c
               << " " << c->cnode << dendl;
      _osr_attach(c.get());
      coll_map[cid] = c;
    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      collections_had_errors = true;
    }
  }
  return 0;
}

void BlueStore::_fsck_collections(int64_t* errors)
{
  if (collections_had_errors) {
    dout(10) << __func__ << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
    for (it->upper_bound(string());
         it->valid();
         it->next()) {
      coll_t cid;
      if (!cid.parse(it->key())) {
        derr << __func__ << " unrecognized collection " << it->key() << dendl;
        if (errors) {
          (*errors)++;
        }
      }
    }
  }
}

void BlueStore::_set_per_pool_omap()
{
  per_pool_omap = false;
  bufferlist bl;
  db->get(PREFIX_SUPER, "per_pool_omap", &bl);
  if (bl.length()) {
    per_pool_omap = true;
    dout(10) << __func__ << " per_pool_omap=1" << dendl;
  } else {
    dout(10) << __func__ << " per_pool_omap not present" << dendl;
  }
  _check_no_per_pool_omap_alert();
}

void BlueStore::_open_statfs()
{
  osd_pools.clear();
  vstatfs.reset();

  bufferlist bl;
  int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
  if (r >= 0) {
    per_pool_stat_collection = false;
    if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
      auto it = bl.cbegin();
      vstatfs.decode(it);
      dout(10) << __func__ << " store_statfs is found" << dendl;
    } else {
      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
    }
    _check_legacy_statfs_alert();
  } else {
    per_pool_stat_collection = true;
    dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT);
    for (it->upper_bound(string());
         it->valid();
         it->next()) {

      uint64_t pool_id;
      int r = get_key_pool_stat(it->key(), &pool_id);
      ceph_assert(r == 0);

      bufferlist bl;
      bl = it->value();
      auto p = bl.cbegin();
      auto& st = osd_pools[pool_id];
      try {
        st.decode(p);
        vstatfs += st;

        dout(30) << __func__ << " pool " << pool_id
                 << " statfs " << st << dendl;
      } catch (buffer::error& e) {
        derr << __func__ << " failed to decode pool stats, key:"
             << pretty_binary_string(it->key()) << dendl;
      }
    }
  }
  dout(30) << __func__ << " statfs " << vstatfs << dendl;
}
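
// block/block.db/block.wal setup: either a symlink to an explicit device
// path (with special handling for spdk: targets, where an NVMe transport
// id is written instead) or a regular file resized/preallocated to the
// requested size.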
int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
           << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
           << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
        r = -errno;
        derr << __func__ << " failed to open " << epath << " file: "
             << cpp_strerror(r) << dendl;
        return r;
      }
      // write the Transport ID of the NVMe device
      // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
      // where "0000:02:00.0" is the selector of a PCI device, see
      // the first column of "lspci -mm -n -D"
      string trid{"trtype:PCIe "};
      trid += "traddr:";
      trid += epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, trid.c_str(), trid.size());
      ceph_assert(r == static_cast<int>(trid.size()));
      dout(1) << __func__ << " created " << name << " symlink to "
              << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (size) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
          S_ISREG(st.st_mode) &&   // if it is a regular file
          st.st_size == 0) {       // and is 0 bytes
        r = ::ftruncate(fd, size);
        if (r < 0) {
          derr << __func__ << " failed to resize " << name << " file to "
               << size << ": " << cpp_strerror(r) << dendl;
          VOID_TEMP_FAILURE_RETRY(::close(fd));
          return r;
        }

        if (cct->_conf->bluestore_block_preallocate_file) {
          r = ::ceph_posix_fallocate(fd, 0, size);
          if (r > 0) {
            derr << __func__ << " failed to preallocate " << name << " file to "
                 << size << ": " << cpp_strerror(r) << dendl;
            VOID_TEMP_FAILURE_RETRY(::close(fd));
            return -r;
          }
        }
        dout(1) << __func__ << " resized " << name << " file to "
                << byte_u_t(size) << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
        derr << __func__ << " failed to open " << name << " file: "
             << cpp_strerror(r) << dendl;
        return r;
      }
    }
  }
  return 0;
}
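
// mkfs: one-shot store creation, idempotent once 'mkfs_done' exists.
// rough order: sanity checks -> fsid file -> block symlinks/files -> bdev
// -> min_alloc_size selection -> db -> freelist + super keys -> meta keys
// ('kv_backend', 'bluefs') -> optional fsck.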
int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
         << cct->_conf->osd_max_object_size << " > bluestore max "
         << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
        r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
        if (r < 0) {
          derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
               << dendl;
          return r;
        }
        if (r > 0) {
          derr << __func__ << " fsck found " << r << " errors" << dendl;
          r = -EIO;
        }
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
        derr << __func__ << " expected bluestore, but type is " << type << dendl;
        return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
        return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
           << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
                                   cct->_conf->bluestore_block_size,
                                   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
                                     cct->_conf->bluestore_block_wal_size,
                                     cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
                                     cct->_conf->bluestore_block_db_size,
                                     cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  // choose min_alloc_size
  if (cct->_conf->bluestore_min_alloc_size) {
    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
  } else {
    ceph_assert(bdev);
    if (bdev->is_rotational()) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
    } else {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
    }
  }
  _validate_bdev();

  // make sure min_alloc_size is power of 2 aligned.
  if (!isp2(min_alloc_size)) {
    derr << __func__ << " min_alloc_size 0x"
         << std::hex << min_alloc_size << std::dec
         << " is not power of 2 aligned!"
         << dendl;
    r = -EINVAL;
    goto out_close_bdev;
  }

  r = _open_db(true, false);
  if (r < 0)
    goto out_close_bdev;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    r = _open_fm(t, true);
    if (r < 0)
      goto out_close_db;
    {
      bufferlist bl;
      encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }
    {
      bufferlist bl;
      encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }
    {
      bufferlist bl;
      bl.append("1");
      t->set(PREFIX_SUPER, "per_pool_omap", bl);
    }
    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_fm;

  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
  if (r < 0)
    goto out_close_fm;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_fm;
    }
  }

  if (out_of_sync_fm.fetch_and(0)) {
    _sync_bluefs_and_fm();
  }

 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db(false);
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }

  if (r == 0) {
    // indicate success by writing the 'mkfs_done' file
    r = write_meta("mkfs_done", "yes");
  }

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }
  return r;
}
6559 int BlueStore::_mount_for_bluefs()
6561 int r
= _open_path();
6562 ceph_assert(r
== 0);
6563 r
= _open_fsid(false);
6564 ceph_assert(r
== 0);
6565 r
= _read_fsid(&fsid
);
6566 ceph_assert(r
== 0);
6568 ceph_assert(r
== 0);
6569 r
= _open_bluefs(false);
6570 ceph_assert(r
== 0);
6574 void BlueStore::_umount_for_bluefs()
6576 _close_bluefs(false);
int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _mount_for_bluefs();

  int reserved = 0;
  if (id == BlueFS::BDEV_NEWWAL) {
    string p = path + "/block.wal";
    r = _setup_block_symlink_or_file("block.wal", dev_path,
				     cct->_conf->bluestore_block_wal_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
				 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }

    reserved = BDEV_LABEL_BLOCK_SIZE;
    bluefs_layout.dedicated_wal = true;
  } else if (id == BlueFS::BDEV_NEWDB) {
    string p = path + "/block.db";
    r = _setup_block_symlink_or_file("block.db", dev_path,
				     cct->_conf->bluestore_block_db_size,
				     true);
    ceph_assert(r == 0);

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
				 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	p,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
    reserved = SUPER_RESERVED;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;
  }

  // ...
  bluefs->add_block_extent(
    id,
    reserved,
    bluefs->get_block_device_size(id) - reserved, true);

  r = bluefs->prepare_new_device(id, bluefs_layout);
  ceph_assert(r == 0);

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }

  _umount_for_bluefs();
  return r;
}
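
// Note: the 'reserved' values above mirror the on-disk layout assumptions:
// a dedicated WAL/DB device only carries a bdev label in its first block
// (BDEV_LABEL_BLOCK_SIZE), while a device holding the BlueStore superblock
// reserves SUPER_RESERVED bytes (label plus bluefs superblock) before any
// BlueFS extent may start.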
int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
  int id)
{
  dout(10) << __func__ << " id:" << id << dendl;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  int r = _mount_for_bluefs();

  // require bluestore_bluefs_min_free to be free at target device!
  uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
  for (auto src_id : devs_source) {
    used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
  }
  uint64_t target_free = bluefs->get_free(id);
  if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
    // will need to remount full BlueStore instance to allocate more space
    _umount_for_bluefs();

    r = mount();
    ceph_assert(r == 0);
    dout(1) << __func__
	    << " Allocating more space at slow device for BlueFS: +"
	    << used_space - target_free << " bytes" << dendl;
    r = allocate_bluefs_freespace(
      used_space - target_free,
      used_space - target_free,
      nullptr);
    if (r < 0) {
      derr << __func__
	   << " can't migrate, unable to allocate extra space: "
	   << used_space - target_free << " at target:" << id
	   << dendl;
      return -ENOSPC;
    }

    r = _mount_for_bluefs();
    ceph_assert(r == 0);
  } else if (target_free < used_space) {
    derr << __func__
	 << " can't migrate, free space at target: " << target_free
	 << " is less than required space: " << used_space
	 << dendl;
    return -ENOSPC;
  }
  if (devs_source.count(BlueFS::BDEV_DB)) {
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    bluefs_layout.dedicated_wal = false;
  }
  r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (devs_source.count(BlueFS::BDEV_DB)) {
    r = unlink(string(path + "/block.db").c_str());
    ceph_assert(r == 0);
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    r = unlink(string(path + "/block.wal").c_str());
    ceph_assert(r == 0);
  }

shutdown:
  _umount_for_bluefs();
  return r;
}
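
// Note: the free-space precondition above amounts to (a sketch, with
// illustrative numbers):
//
//   required = bluestore_bluefs_min_free
//            + sum(total(src) - free(src))   // data to move off sources
//   ok       = free(target) >= required
//
// e.g. with min_free = 1 GiB and 3 GiB of used source space, the target
// must expose at least 4 GiB of free BlueFS space; for BDEV_SLOW the extra
// space is first allocated from the main device via
// allocate_bluefs_freespace().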
int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
  int id,
  const string& dev_path)
{
  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
  int r;
  ceph_assert(path_fd < 0);

  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);

  if (!cct->_conf->bluestore_bluefs) {
    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
    return -EIO;
  }

  r = _mount_for_bluefs();

  int reserved = 0;
  string link_db;
  string link_wal;
  if (devs_source.count(BlueFS::BDEV_DB) &&
      bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
    link_db = path + "/block.db";
    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
    bluefs_layout.dedicated_db = false;
  }
  if (devs_source.count(BlueFS::BDEV_WAL)) {
    link_wal = path + "/block.wal";
    bluefs_layout.dedicated_wal = false;
  }

  size_t target_size;
  string target_name;
  if (id == BlueFS::BDEV_NEWWAL) {
    target_name = "block.wal";
    target_size = cct->_conf->bluestore_block_wal_size;
    bluefs_layout.dedicated_wal = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
				 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
	"bluefs wal",
	true);
      ceph_assert(r == 0);
    }
    reserved = BDEV_LABEL_BLOCK_SIZE;
  } else if (id == BlueFS::BDEV_NEWDB) {
    target_name = "block.db";
    target_size = cct->_conf->bluestore_block_db_size;
    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_layout.dedicated_db = true;

    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
				 cct->_conf->bdev_enable_discard);
    ceph_assert(r == 0);

    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
      r = _check_or_set_bdev_label(
	dev_path,
	bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
	"bluefs db",
	true);
      ceph_assert(r == 0);
    }
    reserved = SUPER_RESERVED;
  }

  // ...
  bluefs->add_block_extent(
    id, reserved, bluefs->get_block_device_size(id) - reserved);

  r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
  if (r < 0) {
    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
    goto shutdown;
  }

  if (!link_db.empty()) {
    r = unlink(link_db.c_str());
    ceph_assert(r == 0);
  }
  if (!link_wal.empty()) {
    r = unlink(link_wal.c_str());
    ceph_assert(r == 0);
  }
  r = _setup_block_symlink_or_file(
    target_name,
    dev_path,
    target_size,
    true);
  ceph_assert(r == 0);
  dout(0) << __func__ << " success" << dendl;

shutdown:
  _umount_for_bluefs();
  return r;
}
string BlueStore::get_device_path(unsigned id)
{
  string res;
  if (id < BlueFS::MAX_BDEV) {
    switch (id) {
    case BlueFS::BDEV_WAL:
      res = path + "/block.wal";
      break;
    case BlueFS::BDEV_DB:
      if (id == bluefs_layout.shared_bdev) {
	res = path + "/block";
      } else {
	res = path + "/block.db";
      }
      break;
    case BlueFS::BDEV_SLOW:
      res = path + "/block";
      break;
    }
  }
  return res;
}
int BlueStore::expand_devices(ostream& out)
{
  int r = cold_open();
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  out << "Expanding DB/WAL..." << std::endl;
  for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
    if (devid == bluefs_layout.shared_bdev) {
      continue;
    }
    uint64_t size = bluefs->get_block_device_size(devid);
    if (size == 0) {
      // no bdev
      continue;
    }

    interval_set<uint64_t> before;
    bluefs->get_block_extents(devid, &before);
    ceph_assert(!before.empty());
    uint64_t end = before.range_end();
    if (end < size) {
      out << devid
	  << " : expanding " << " from 0x" << std::hex
	  << end << " to 0x" << size << std::dec << std::endl;
      bluefs->add_block_extent(devid, end, size - end);
      string p = get_device_path(devid);
      const char* path = p.c_str();
      if (path == nullptr) {
	derr << devid
	     << ": can't find device path " << dendl;
	continue;
      }
      bluestore_bdev_label_t label;
      int r = _read_bdev_label(cct, path, &label);
      if (r < 0) {
	derr << "unable to read label for " << path << ": "
	     << cpp_strerror(r) << dendl;
	continue;
      }
      label.size = size;
      r = _write_bdev_label(cct, path, label);
      if (r < 0) {
	derr << "unable to write label for " << path << ": "
	     << cpp_strerror(r) << dendl;
	continue;
      }
      out << devid
	  << " : size label updated to " << size
	  << std::endl;
    }
  }
  uint64_t size0 = fm->get_size();
  uint64_t size = bdev->get_size();
  if (size0 < size) {
    out << bluefs_layout.shared_bdev
	<< " : expanding " << " from 0x" << std::hex
	<< size0 << " to 0x" << size << std::dec << std::endl;
    _write_out_fm_meta(size, true);
    cold_close();

    // mount in read/write to sync expansion changes
    r = _mount(false);
    ceph_assert(r == 0);
    umount();
  } else {
    cold_close();
  }
  return r;
}
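
// Note: expand_devices() grows each dedicated WAL/DB device by feeding the
// newly visible tail (range_end .. device size) to BlueFS and rewriting the
// size field of the bdev label; the shared/slow device is instead grown by
// updating the freelist metadata and remounting read/write so the expansion
// is persisted.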
int BlueStore::dump_bluefs_sizes(ostream& out)
{
  int r = cold_open();
  ceph_assert(r == 0);
  bluefs->dump_block_extents(out);
  cold_close();
  return r;
}
void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t oold = onode_cache_shards.size();
  size_t bold = buffer_cache_shards.size();
  ceph_assert(num >= oold && num >= bold);
  onode_cache_shards.resize(num);
  buffer_cache_shards.resize(num);
  for (unsigned i = oold; i < num; ++i) {
    onode_cache_shards[i] =
      OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			      logger);
  }
  for (unsigned i = bold; i < num; ++i) {
    buffer_cache_shards[i] =
      BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
			       logger);
  }
}
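
// Note: cache shards can only grow (see the ceph_assert above); existing
// shards may already be referenced by open collections, so shrinking the
// vectors here would invalidate those references.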
int BlueStore::_mount(bool kv_only, bool open_db)
{
  dout(1) << __func__ << " path " << path << dendl;

  _kv_only = kv_only;

  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
    derr << __func__ << " osd_max_object_size "
	 << cct->_conf->osd_max_object_size << " > bluestore max "
	 << OBJECT_MAX_SIZE << dendl;
    return -EINVAL;
  }

  int r = _open_path();
  // ...
  r = _open_fsid(false);
  // ...
  r = _read_fsid(&fsid);
  // ...
  r = _open_bdev(false);
  // ...
  if (open_db) {
    r = _open_db_and_around(false);
  } else {
    // we can bypass db open exclusively in case of kv_only mode
    ceph_assert(kv_only);
    r = _open_db(false, true);
  }
  // ...
  r = _upgrade_super();
  // ...
  r = _open_collections();
  // ...
  r = _reload_logger();
  // ...
  r = _deferred_replay();
  // ...
  mempool_thread.init();

  if ((!per_pool_stat_collection || !per_pool_omap) &&
      cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
    bool was_per_pool_omap = per_pool_omap;

    dout(1) << __func__ << " quick-fix on mount" << dendl;
    _fsck_on_open(FSCK_SHALLOW, true);

    //reread statfs
    //FIXME minor: replace with actual open/close?
    _check_legacy_statfs_alert();

    //set again as hopefully it has been fixed
    if (!was_per_pool_omap) {
      _set_per_pool_omap();
    }
  }

  mounted = true;
  return 0;

  // error unwinding:
  // ...
  _close_db_and_around(false);
  // ...
}
int BlueStore::umount()
{
  ceph_assert(_kv_only || mounted);
  dout(1) << __func__ << dendl;
  // ...
  if (!_kv_only) {
    mempool_thread.shutdown();
    dout(20) << __func__ << " stopping kv thread" << dendl;
    _kv_stop();
    // ...
    dout(20) << __func__ << " closing" << dendl;
  }
  _close_db_and_around(false);
  // ...
  if (cct->_conf->bluestore_fsck_on_umount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }
  return 0;
}
int BlueStore::cold_open()
{
  int r = _open_path();
  // ...
  r = _open_fsid(false);
  // ...
  r = _read_fsid(&fsid);
  // ...
  r = _open_bdev(false);
  // ...
  r = _open_db_and_around(true);
  // ...
  return r;
}

int BlueStore::cold_close()
{
  _close_db_and_around(true);
  // ...
  return 0;
}
// derr wrapper to limit enormous output and avoid log flooding.
// Of limited use where such output is expected for now
#define fsck_derr(err_cnt, threshold) \
  if (err_cnt <= threshold) { \
    bool need_skip_print = err_cnt == threshold; \
    derr

#define fsck_dendl \
    dendl; \
    if (need_skip_print) \
      derr << "more error lines skipped..." << dendl; \
  }
int _fsck_sum_extents(
  const PExtentVector& extents,
  bool compressed,
  store_statfs_t& expected_statfs)
{
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
  }
  return 0;
}
int BlueStore::_fsck_check_extents(
  const coll_t& cid,
  const ghobject_t& oid,
  const PExtentVector& extents,
  bool compressed,
  mempool_dynamic_bitset &used_blocks,
  uint64_t granularity,
  BlueStoreRepairer* repairer,
  store_statfs_t& expected_statfs,
  FSCKDepth depth)
{
  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
  int errors = 0;
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.data_compressed_allocated += e.length;
    }
    if (depth != FSCK_SHALLOW) {
      bool already = false;
      apply_for_bitset_range(
	e.offset, e.length, granularity, used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  if (bs.test(pos)) {
	    if (repairer) {
	      repairer->note_misreference(
		pos * min_alloc_size, min_alloc_size, !already);
	    }
	    if (!already) {
	      derr << "fsck error: " << oid << " extent " << e
		   << " or a subset is already allocated (misreferenced)" << dendl;
	      ++errors;
	      already = true;
	    }
	  }
	  else
	    bs.set(pos);
	});
      if (repairer) {
	repairer->get_space_usage_tracker().set_used(e.offset, e.length, cid, oid);
      }

      if (e.end() > bdev->get_size()) {
	derr << "fsck error: " << oid << " extent " << e
	     << " past end of block device" << dendl;
	++errors;
      }
    }
  }
  return errors;
}
void BlueStore::_fsck_check_pool_statfs(
  BlueStore::per_pool_statfs& expected_pool_statfs,
  int64_t& errors,
  int64_t& warnings,
  BlueStoreRepairer* repairer)
{
  auto it = db->get_iterator(PREFIX_STAT);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
	if (repairer) {
	  ++errors;
	  repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
	  derr << "fsck error: " << "legacy statfs record found, removing"
	       << dendl;
	}
	continue;
      }
      uint64_t pool_id;
      if (get_key_pool_stat(key, &pool_id) < 0) {
	derr << "fsck error: bad key " << key
	     << "in statfs namespace" << dendl;
	if (repairer) {
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	++errors;
	continue;
      }

      volatile_statfs vstatfs;
      bufferlist bl = it->value();
      auto blp = bl.cbegin();
      try {
	vstatfs.decode(blp);
      } catch (buffer::error& e) {
	derr << "fsck error: failed to decode Pool StatFS record"
	     << pretty_binary_string(key) << dendl;
	if (repairer) {
	  dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
		   << pretty_binary_string(key)
		   << "', removing" << dendl;
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	++errors;
	vstatfs.reset();
      }
      auto stat_it = expected_pool_statfs.find(pool_id);
      if (stat_it == expected_pool_statfs.end()) {
	if (vstatfs.is_empty()) {
	  // we don't consider that as an error since empty pool statfs
	  // are left in DB for now
	  dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
		   << std::hex << pool_id << std::dec << dendl;
	  if (repairer) {
	    // but we need to increment error count in case of repair
	    // to have proper counters at the end
	    // (as repairer increments recovery counter anyway).
	    ++errors;
	  }
	} else {
	  derr << "fsck error: found stray Pool StatFS record for pool id 0x"
	       << std::hex << pool_id << std::dec << dendl;
	  ++errors;
	}
	if (repairer) {
	  repairer->remove_key(db, PREFIX_STAT, key);
	}
	continue;
      }
      store_statfs_t statfs;
      vstatfs.publish(&statfs);
      if (!(stat_it->second == statfs)) {
	derr << "fsck error: actual " << statfs
	     << " != expected " << stat_it->second
	     << " for pool "
	     << std::hex << pool_id << std::dec << dendl;
	if (repairer) {
	  repairer->fix_statfs(db, key, stat_it->second);
	}
	++errors;
      }
      expected_pool_statfs.erase(stat_it);
    }
  } // if (it)
  for (auto& s : expected_pool_statfs) {
    if (s.second.is_zero()) {
      // we might lack empty statfs recs in DB
      continue;
    }
    derr << "fsck error: missing Pool StatFS record for pool "
	 << std::hex << s.first << std::dec << dendl;
    if (repairer) {
      string key;
      get_pool_stat_key(s.first, &key);
      repairer->fix_statfs(db, key, s.second);
    }
    ++errors;
  }
  if (!per_pool_stat_collection &&
      repairer) {
    // by virtue of running this method, we correct the top-level
    // error of having global stats
    repairer->inc_repaired();
  }
}
BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
  BlueStore::FSCKDepth depth,
  int64_t pool_id,
  BlueStore::CollectionRef c,
  const ghobject_t& oid,
  const string& key,
  const bufferlist& value,
  mempool::bluestore_fsck::list<string>* expecting_shards,
  map<BlobRef, bluestore_blob_t::unused_t>* referenced,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& num_objects = ctx.num_objects;
  auto& num_extents = ctx.num_extents;
  auto& num_blobs = ctx.num_blobs;
  auto& num_sharded_objects = ctx.num_sharded_objects;
  auto& num_spanning_blobs = ctx.num_spanning_blobs;
  auto used_blocks = ctx.used_blocks;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
    &ctx.expected_pool_statfs[pool_id] :
    &ctx.expected_store_statfs;

  dout(10) << __func__ << " " << oid << dendl;
  OnodeRef o;
  o.reset(Onode::decode(c, oid, key, value));
  ++num_objects;

  num_spanning_blobs += o->extent_map.spanning_blob_map.size();

  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  _dump_onode<30>(cct, *o);
  // shards
  if (!o->extent_map.shards.empty()) {
    ++num_sharded_objects;
    if (depth != FSCK_SHALLOW) {
      ceph_assert(expecting_shards);
      for (auto& s : o->extent_map.shards) {
	dout(20) << __func__ << " shard " << *s.shard_info << dendl;
	expecting_shards->push_back(string());
	get_extent_shard_key(o->key, s.shard_info->offset,
			     &expecting_shards->back());
	if (s.shard_info->offset >= o->onode.size) {
	  derr << "fsck error: " << oid << " shard 0x" << std::hex
	       << s.shard_info->offset << " past EOF at 0x" << o->onode.size
	       << std::dec << dendl;
	  ++errors;
	}
      }
    }
  }

  // lextents to blobs
  uint64_t pos = 0;
  mempool::bluestore_fsck::map<BlobRef,
    bluestore_blob_use_tracker_t> ref_map;
  for (auto& l : o->extent_map.extent_map) {
    dout(20) << __func__ << " " << l << dendl;
    if (l.logical_offset < pos) {
      derr << "fsck error: " << oid << " lextent at 0x"
	   << std::hex << l.logical_offset
	   << " overlaps with the previous, which ends at 0x" << pos
	   << std::dec << dendl;
      ++errors;
    }
    if (depth != FSCK_SHALLOW &&
	o->extent_map.spans_shard(l.logical_offset, l.length)) {
      derr << "fsck error: " << oid << " lextent at 0x"
	   << std::hex << l.logical_offset << "~" << l.length
	   << " spans a shard boundary"
	   << std::dec << dendl;
      ++errors;
    }
    pos = l.logical_offset + l.length;
    res_statfs->data_stored += l.length;
    ceph_assert(l.blob);
    const bluestore_blob_t& blob = l.blob->get_blob();

    auto& ref = ref_map[l.blob];
    if (ref.is_empty()) {
      uint32_t min_release_size = blob.get_release_size(min_alloc_size);
      uint32_t l = blob.get_logical_length();
      ref.init(l, min_release_size);
    }
    ref.get(
      l.blob_offset,
      l.length);
    ++num_extents;
    if (depth != FSCK_SHALLOW &&
	blob.has_unused()) {
      ceph_assert(referenced);
      auto p = referenced->find(l.blob);
      bluestore_blob_t::unused_t* pu;
      if (p == referenced->end()) {
	pu = &(*referenced)[l.blob];
      } else {
	pu = &p->second;
      }
      uint64_t blob_len = blob.get_logical_length();
      ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
      ceph_assert(l.blob_offset + l.length <= blob_len);
      uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
      uint64_t start = l.blob_offset / chunk_size;
      uint64_t end =
	round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
      for (auto i = start; i < end; ++i) {
	(*pu) |= (1u << i);
      }
    }
  } //for (auto& l : o->extent_map.extent_map)

  for (auto& i : ref_map) {
    ++num_blobs;
    const bluestore_blob_t& blob = i.first->get_blob();
    bool equal =
      depth == FSCK_SHALLOW ? true :
      i.first->get_blob_use_tracker().equal(i.second);
    if (!equal) {
      derr << "fsck error: " << oid << " blob " << *i.first
	   << " doesn't match expected ref_map " << i.second << dendl;
      ++errors;
    }
    if (blob.is_compressed()) {
      res_statfs->data_compressed += blob.get_compressed_payload_length();
      res_statfs->data_compressed_original +=
	i.first->get_referenced_bytes();
    }
    if (blob.is_shared()) {
      if (i.first->shared_blob->get_sbid() > blobid_max) {
	derr << "fsck error: " << oid << " blob " << blob
	     << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
	     << blobid_max << dendl;
	++errors;
      }
      else if (i.first->shared_blob->get_sbid() == 0) {
	derr << "fsck error: " << oid << " blob " << blob
	     << " marked as shared but has uninitialized sbid"
	     << dendl;
	++errors;
      }
      // the below lock is optional and provided in multithreading mode only
      if (sb_info_lock) {
	sb_info_lock->lock();
      }
      sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
      ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
      ceph_assert(sbi.pool_id == INT64_MIN ||
		  sbi.pool_id == oid.hobj.get_logical_pool());
      sbi.cid = c->cid;
      sbi.pool_id = oid.hobj.get_logical_pool();
      sbi.sb = i.first->shared_blob;
      sbi.oids.push_back(oid);
      sbi.compressed = blob.is_compressed();
      for (auto e : blob.get_extents()) {
	if (e.is_valid()) {
	  sbi.ref_map.get(e.offset, e.length);
	}
      }
      if (sb_info_lock) {
	sb_info_lock->unlock();
      }
    } else if (depth != FSCK_SHALLOW) {
      ceph_assert(used_blocks);
      errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
				    blob.is_compressed(),
				    *used_blocks,
				    fm->get_alloc_size(),
				    repairer,
				    *res_statfs,
				    depth);
    } else {
      errors += _fsck_sum_extents(
	blob.get_extents(),
	blob.is_compressed(),
	*res_statfs);
    }
  } // for (auto& i : ref_map)

  if (o->onode.has_omap()) {
    _fsck_check_object_omap(depth, o, ctx);
  }

  return o;
}
7555 #include "common/WorkQueue.h"
7557 class ShallowFSCKThreadPool
: public ThreadPool
7560 ShallowFSCKThreadPool(CephContext
* cct_
, std::string nm
, std::string tn
, int n
) :
7561 ThreadPool(cct_
, nm
, tn
, n
) {
7563 void worker(ThreadPool::WorkThread
* wt
) override
{
7566 next_wq
%= work_queues
.size();
7567 WorkQueue_
*wq
= work_queues
[next_wq
++];
7569 void* item
= wq
->_void_dequeue();
7572 TPHandle
tp_handle(cct
, nullptr, wq
->timeout_interval
, wq
->suicide_interval
);
7573 wq
->_void_process(item
, tp_handle
);
7578 template <size_t BatchLen
>
7579 struct FSCKWorkQueue
: public ThreadPool::WorkQueue_
7583 BlueStore::CollectionRef c
;
7589 std::atomic
<size_t> running
= { 0 };
7590 size_t entry_count
= 0;
7591 std::array
<Entry
, BatchLen
> entries
;
7594 int64_t warnings
= 0;
7595 uint64_t num_objects
= 0;
7596 uint64_t num_extents
= 0;
7597 uint64_t num_blobs
= 0;
7598 uint64_t num_sharded_objects
= 0;
7599 uint64_t num_spanning_blobs
= 0;
7600 store_statfs_t expected_store_statfs
;
7601 BlueStore::per_pool_statfs expected_pool_statfs
;
7605 BlueStore
* store
= nullptr;
7607 ceph::mutex
* sb_info_lock
= nullptr;
7608 BlueStore::sb_info_map_t
* sb_info
= nullptr;
7609 BlueStoreRepairer
* repairer
= nullptr;
7611 Batch
* batches
= nullptr;
7612 size_t last_batch_pos
= 0;
7613 bool batch_acquired
= false;
7615 FSCKWorkQueue(std::string n
,
7618 ceph::mutex
* _sb_info_lock
,
7619 BlueStore::sb_info_map_t
& _sb_info
,
7620 BlueStoreRepairer
* _repairer
) :
7621 WorkQueue_(n
, time_t(), time_t()),
7622 batchCount(_batchCount
),
7624 sb_info_lock(_sb_info_lock
),
7628 batches
= new Batch
[batchCount
];
7634 /// Remove all work items from the queue.
7635 void _clear() override
{
7638 /// Check whether there is anything to do.
7639 bool _empty() override
{
7643 /// Get the next work item to process.
7644 void* _void_dequeue() override
{
7645 size_t pos
= rand() % batchCount
;
7648 auto& batch
= batches
[pos
];
7649 if (batch
.running
.fetch_add(1) == 0) {
7650 if (batch
.entry_count
) {
7657 } while (pos
!= pos0
);
7660 /** @brief Process the work item.
7661 * This function will be called several times in parallel
7662 * and must therefore be thread-safe. */
7663 void _void_process(void* item
, TPHandle
& handle
) override
{
7664 Batch
* batch
= (Batch
*)item
;
7666 BlueStore::FSCK_ObjectCtx
ctx(
7672 batch
->num_sharded_objects
,
7673 batch
->num_spanning_blobs
,
7674 nullptr, // used_blocks
7675 nullptr, //used_omap_head
7678 batch
->expected_store_statfs
,
7679 batch
->expected_pool_statfs
,
7682 for (size_t i
= 0; i
< batch
->entry_count
; i
++) {
7683 auto& entry
= batch
->entries
[i
];
7685 store
->fsck_check_objects_shallow(
7686 BlueStore::FSCK_SHALLOW
,
7692 nullptr, // expecting_shards - this will need a protection if passed
7693 nullptr, // referenced
7696 //std::cout << "processed " << batch << std::endl;
7697 batch
->entry_count
= 0;
7700 /** @brief Synchronously finish processing a work item.
7701 * This function is called after _void_process with the global thread pool lock held,
7702 * so at most one copy will execute simultaneously for a given thread pool.
7703 * It can be used for non-thread-safe finalization. */
7704 void _void_process_finish(void*) override
{
7710 BlueStore::CollectionRef c
,
7711 const ghobject_t
& oid
,
7713 const bufferlist
& value
) {
7715 size_t pos0
= last_batch_pos
;
7716 if (!batch_acquired
) {
7718 auto& batch
= batches
[last_batch_pos
];
7719 if (batch
.running
.fetch_add(1) == 0) {
7720 if (batch
.entry_count
< BatchLen
) {
7721 batch_acquired
= true;
7725 batch
.running
.fetch_sub(1);
7727 last_batch_pos
%= batchCount
;
7728 } while (last_batch_pos
!= pos0
);
7730 if (batch_acquired
) {
7731 auto& batch
= batches
[last_batch_pos
];
7732 ceph_assert(batch
.running
);
7733 ceph_assert(batch
.entry_count
< BatchLen
);
7735 auto& entry
= batch
.entries
[batch
.entry_count
];
7736 entry
.pool_id
= pool_id
;
7740 entry
.value
= value
;
7742 ++batch
.entry_count
;
7743 if (batch
.entry_count
== BatchLen
) {
7744 batch_acquired
= false;
7745 batch
.running
.fetch_sub(1);
7747 last_batch_pos
%= batchCount
;
7754 void finalize(ThreadPool
& tp
,
7755 BlueStore::FSCK_ObjectCtx
& ctx
) {
7756 if (batch_acquired
) {
7757 auto& batch
= batches
[last_batch_pos
];
7758 ceph_assert(batch
.running
);
7759 batch
.running
.fetch_sub(1);
7763 for (size_t i
= 0; i
< batchCount
; i
++) {
7764 auto& batch
= batches
[i
];
7766 //process leftovers if any
7767 if (batch
.entry_count
) {
7768 TPHandle
tp_handle(store
->cct
,
7772 ceph_assert(batch
.running
== 0);
7774 batch
.running
++; // just to be on-par with the regular call
7775 _void_process(&batch
, tp_handle
);
7777 ceph_assert(batch
.entry_count
== 0);
7779 ctx
.errors
+= batch
.errors
;
7780 ctx
.warnings
+= batch
.warnings
;
7781 ctx
.num_objects
+= batch
.num_objects
;
7782 ctx
.num_extents
+= batch
.num_extents
;
7783 ctx
.num_blobs
+= batch
.num_blobs
;
7784 ctx
.num_sharded_objects
+= batch
.num_sharded_objects
;
7785 ctx
.num_spanning_blobs
+= batch
.num_spanning_blobs
;
7787 ctx
.expected_store_statfs
.add(batch
.expected_store_statfs
);
7789 for (auto it
= batch
.expected_pool_statfs
.begin();
7790 it
!= batch
.expected_pool_statfs
.end();
7792 ctx
.expected_pool_statfs
[it
->first
].add(it
->second
);
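
// Note: FSCKWorkQueue avoids a queue lock by treating each Batch::running
// counter as a try-lock: fetch_add(1) == 0 means the caller now owns the
// batch, anything else means a producer or consumer already holds it and
// the counter is simply decremented again. A sketch of the idiom, assuming
// only <atomic>:
//
//   std::atomic<size_t> running{0};
//   if (running.fetch_add(1) == 0) {
//     // exclusive access to the batch
//   } else {
//     running.fetch_sub(1);  // back off, try the next batch
//   }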
void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
  OnodeRef& o,
  const BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto& warnings = ctx.warnings;
  auto repairer = ctx.repairer;

  ceph_assert(o->onode.has_omap());
  if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
    if (per_pool_omap) {
      fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	<< "fsck error: " << o->oid
	<< " has omap that is not per-pool or pgmeta"
	<< fsck_dendl;
      ++errors;
    } else {
      const char* w;
      int64_t num;
      if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
	++errors;
	num = errors;
	w = "error";
      } else {
	++warnings;
	num = warnings;
	w = "warning";
      }
      fsck_derr(num, MAX_FSCK_ERROR_LINES)
	<< "fsck " << w << ": " << o->oid
	<< " has omap that is not per-pool or pgmeta"
	<< fsck_dendl;
    }
  }
  if (repairer &&
      !o->onode.is_perpool_omap() &&
      !o->onode.is_pgmeta_omap()) {
    dout(10) << "fsck converting " << o->oid << " omap to per-pool" << dendl;
    bufferlist h;
    map<string, bufferlist> kv;
    int r = _onode_omap_get(o, &h, &kv);
    if (r < 0) {
      derr << " got " << r << " " << cpp_strerror(r) << dendl;
    } else {
      KeyValueDB::Transaction txn = db->get_transaction();
      // remove old keys
      const string& old_omap_prefix = o->get_omap_prefix();
      string old_head, old_tail;
      o->get_omap_header(&old_head);
      o->get_omap_tail(&old_tail);
      txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
      txn->rmkey(old_omap_prefix, old_tail);
      // set flag
      o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP);
      _record_onode(o, txn);
      const string& new_omap_prefix = o->get_omap_prefix();
      // head
      if (h.length()) {
	string new_head;
	o->get_omap_header(&new_head);
	txn->set(new_omap_prefix, new_head, h);
      }
      // tail
      {
	string new_tail;
	o->get_omap_tail(&new_tail);
	bufferlist empty;
	txn->set(new_omap_prefix, new_tail, empty);
      }
      // values
      string final_key;
      o->get_omap_key(string(), &final_key);
      size_t base_key_len = final_key.size();
      for (auto& i : kv) {
	final_key.resize(base_key_len);
	final_key += i.first;
	txn->set(new_omap_prefix, final_key, i.second);
      }
      db->submit_transaction_sync(txn);
      repairer->inc_repaired();
    }
  }
}
void BlueStore::_fsck_check_objects(FSCKDepth depth,
  BlueStore::FSCK_ObjectCtx& ctx)
{
  auto& errors = ctx.errors;
  auto sb_info_lock = ctx.sb_info_lock;
  auto& sb_info = ctx.sb_info;
  auto repairer = ctx.repairer;

  uint64_t_btree_t used_nids;

  size_t processed_myself = 0;

  auto it = db->get_iterator(PREFIX_OBJ);
  mempool::bluestore_fsck::list<string> expecting_shards;
  if (it) {
    const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
    typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
    std::unique_ptr<WQ> wq(
      new WQ(
	"FSCKWorkQueue",
	(thread_count ? : 1) * 32,
	this,
	sb_info_lock,
	sb_info,
	repairer));

    ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);

    thread_pool.add_work_queue(wq.get());
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      //not the best place but let's check anyway
      ceph_assert(sb_info_lock);
      thread_pool.start();
    }

    //fill global if not overriden below
    CollectionRef c;
    int64_t pool_id = -1;
    spg_t pgid;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      dout(30) << __func__ << " key "
	       << pretty_binary_string(it->key()) << dendl;
      if (is_extent_shard_key(it->key())) {
	if (depth == FSCK_SHALLOW) {
	  continue;
	}
	while (!expecting_shards.empty() &&
	       expecting_shards.front() < it->key()) {
	  derr << "fsck error: missing shard key "
	       << pretty_binary_string(expecting_shards.front())
	       << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	}
	if (!expecting_shards.empty() &&
	    expecting_shards.front() == it->key()) {
	  // all good
	  expecting_shards.pop_front();
	  continue;
	}

	uint32_t offset;
	string okey;
	get_key_extent_shard(it->key(), &okey, &offset);
	derr << "fsck error: stray shard 0x" << std::hex << offset
	     << std::dec << dendl;
	if (expecting_shards.empty()) {
	  derr << "fsck error: " << pretty_binary_string(it->key())
	       << " is unexpected" << dendl;
	  ++errors;
	  continue;
	}
	while (expecting_shards.front() > it->key()) {
	  derr << "fsck error: saw " << pretty_binary_string(it->key())
	       << dendl;
	  derr << "fsck error: exp "
	       << pretty_binary_string(expecting_shards.front()) << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	  if (expecting_shards.empty()) {
	    break;
	  }
	}
	continue;
      }

      ghobject_t oid;
      int r = get_key_object(it->key(), &oid);
      if (r < 0) {
	derr << "fsck error: bad object key "
	     << pretty_binary_string(it->key()) << dendl;
	++errors;
	continue;
      }
      if (!c ||
	  oid.shard_id != pgid.shard ||
	  oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
	  !c->contains(oid)) {
	c = nullptr;
	for (auto& p : coll_map) {
	  if (p.second->contains(oid)) {
	    c = p.second;
	    break;
	  }
	}
	if (!c) {
	  derr << "fsck error: stray object " << oid
	       << " not owned by any collection" << dendl;
	  ++errors;
	  continue;
	}
	pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
	dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
		 << dendl;
      }

      if (depth != FSCK_SHALLOW &&
	  !expecting_shards.empty()) {
	for (auto& k : expecting_shards) {
	  derr << "fsck error: missing shard key "
	       << pretty_binary_string(k) << dendl;
	}
	errors += expecting_shards.size();
	expecting_shards.clear();
      }

      bool queued = false;
      if (depth == FSCK_SHALLOW && thread_count > 0) {
	queued = wq->queue(
	  pool_id,
	  c,
	  oid,
	  it->key(),
	  it->value());
      }
      OnodeRef o;
      map<BlobRef, bluestore_blob_t::unused_t> referenced;

      if (!queued) {
	++processed_myself;
	o = fsck_check_objects_shallow(
	  depth,
	  pool_id,
	  c,
	  oid,
	  it->key(),
	  it->value(),
	  &expecting_shards,
	  &referenced,
	  ctx);
      }

      if (depth != FSCK_SHALLOW) {
	ceph_assert(o != nullptr);
	if (o->onode.nid) {
	  if (o->onode.nid > nid_max) {
	    derr << "fsck error: " << oid << " nid " << o->onode.nid
		 << " > nid_max " << nid_max << dendl;
	    ++errors;
	  }
	  if (used_nids.count(o->onode.nid)) {
	    derr << "fsck error: " << oid << " nid " << o->onode.nid
		 << " already in use" << dendl;
	    ++errors;
	    continue; // go for next object
	  }
	  used_nids.insert(o->onode.nid);
	}
	for (auto& i : referenced) {
	  dout(20) << __func__ << " referenced 0x" << std::hex << i.second
		   << std::dec << " for " << *i.first << dendl;
	  const bluestore_blob_t& blob = i.first->get_blob();
	  if (i.second & blob.unused) {
	    derr << "fsck error: " << oid << " blob claims unused 0x"
		 << std::hex << blob.unused
		 << " but extents reference 0x" << i.second << std::dec
		 << " on blob " << *i.first << dendl;
	    ++errors;
	  }
	  if (blob.has_csum()) {
	    uint64_t blob_len = blob.get_logical_length();
	    uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
	    unsigned csum_count = blob.get_csum_count();
	    unsigned csum_chunk_size = blob.get_csum_chunk_size();
	    for (unsigned p = 0; p < csum_count; ++p) {
	      unsigned pos = p * csum_chunk_size;
	      unsigned firstbit = pos / unused_chunk_size;    // [firstbit,lastbit]
	      unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
	      unsigned mask = 1u << firstbit;
	      for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
		mask |= 1u << b;
	      }
	      if ((blob.unused & mask) == mask) {
		// this csum chunk region is marked unused
		if (blob.get_csum_item(p) != 0) {
		  derr << "fsck error: " << oid
		       << " blob claims csum chunk 0x" << std::hex << pos
		       << "~" << csum_chunk_size
		       << " is unused (mask 0x" << mask << " of unused 0x"
		       << blob.unused << ") but csum is non-zero 0x"
		       << blob.get_csum_item(p) << std::dec << " on blob "
		       << *i.first << dendl;
		  ++errors;
		}
	      }
	    }
	  }
	}
	if (o->onode.has_omap()) {
	  ceph_assert(ctx.used_omap_head);
	  if (ctx.used_omap_head->count(o->onode.nid)) {
	    derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
		 << " already in use" << dendl;
	    ++errors;
	  } else {
	    ctx.used_omap_head->insert(o->onode.nid);
	  }
	} // if (o->onode.has_omap())
	if (depth == FSCK_DEEP) {
	  bufferlist bl;
	  uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
	  uint64_t offset = 0;
	  do {
	    uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
	    int r = _do_read(c.get(), o, offset, l, bl,
			     CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
	    if (r < 0) {
	      ++errors;
	      derr << "fsck error: " << oid << std::hex
		   << " error during read: "
		   << " " << offset << "~" << l
		   << " " << cpp_strerror(r) << std::dec
		   << dendl;
	      break;
	    }
	    offset += l;
	  } while (offset < o->onode.size);
	} // deep
      } //if (depth != FSCK_SHALLOW)
    } // for (it->lower_bound(string()); it->valid(); it->next())
    if (depth == FSCK_SHALLOW && thread_count > 0) {
      wq->finalize(thread_pool, ctx);
      if (processed_myself) {
	// may be needs more threads?
	dout(0) << __func__ << " partial offload"
		<< ", done myself " << processed_myself
		<< " of " << ctx.num_objects
		<< "objects, threads " << thread_count
		<< dendl;
      }
    }
  } // if (it)
}
/*
An overview for currently implemented repair logics
performed in fsck in two stages: detection(+preparation) and commit.
Detection stage (in processing order):
  (Issue -> Repair action to schedule)
  - Detect undecodable keys for Shared Blobs -> Remove
  - Detect undecodable records for Shared Blobs -> Remove
    (might trigger missed Shared Blob detection below)
  - Detect stray records for Shared Blobs -> Remove
  - Detect misreferenced pextents -> Fix
    Prepare Bloom-like filter to track cid/oid -> pextent
    Prepare list of extents that are improperly referenced
    Enumerate Onode records that might use 'misreferenced' pextents
    (Bloom-like filter applied to reduce computation)
      Per each questionable Onode enumerate all blobs and identify broken ones
      (i.e. blobs having 'misreferences')
      Rewrite each broken blob data by allocating another extents and
      copying data there
      If blob is shared - unshare it and mark corresponding Shared Blob
      for removal
      Release previously allocated space
  - Detect missed Shared Blobs -> Recreate
  - Detect undecodable deferred transaction -> Remove
  - Detect Freelist Manager's 'false free' entries -> Mark as used
  - Detect Freelist Manager's leaked entries -> Mark as free
  - Detect statfs inconsistency - Update
Commit stage (separate DB commit per each step):
  - Apply leaked FM entries fix
  - Apply 'false free' FM entries fix
  - Apply 'Remove' actions
  - Apply fix for misreference pextents
  - Apply Shared Blob recreate
    (can be merged with the step above if misreferences were detected)
  - Apply StatFS update
*/
int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
{
  dout(1) << __func__
	  << (repair ? " repair" : " check")
	  << (depth == FSCK_DEEP ? " (deep)" :
	      depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
	  << dendl;

  // in deep mode we need R/W write access to be able to replay deferred ops
  bool read_only = !(repair || depth == FSCK_DEEP);

  int r = _open_path();
  // ...
  r = _open_fsid(false);
  // ...
  r = _read_fsid(&fsid);
  // ...
  r = _open_bdev(false);
  // ...
  r = _open_db_and_around(read_only);
  // ...
  if (!read_only) {
    r = _upgrade_super();
    // ...
  }

  r = _open_collections();
  // ...
  mempool_thread.init();

  // we need finisher and kv_{sync,finalize}_thread *just* for replay
  // enable in repair or deep mode modes only
  if (!read_only) {
    r = _deferred_replay();
    // ...
  }

  r = _fsck_on_open(depth, repair);
  // ...
  mempool_thread.shutdown();
  // ...
  _close_db_and_around(false);
  // ...
  return r;
}
int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
{
  dout(1) << __func__
	  << (repair ? " repair" : " check")
	  << (depth == FSCK_DEEP ? " (deep)" :
	      depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
	  << " start" << dendl;
  int64_t errors = 0;
  int64_t warnings = 0;
  unsigned repaired = 0;

  uint64_t_btree_t used_omap_head;
  uint64_t_btree_t used_sbids;

  mempool_dynamic_bitset used_blocks;
  KeyValueDB::Iterator it;
  store_statfs_t expected_store_statfs, actual_statfs;
  per_pool_statfs expected_pool_statfs;

  sb_info_map_t sb_info;

  uint64_t num_objects = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_spanning_blobs = 0;
  uint64_t num_shared_blobs = 0;
  uint64_t num_sharded_objects = 0;
  BlueStoreRepairer repairer;

  utime_t start = ceph_clock_now();

  _fsck_collections(&errors);
  used_blocks.resize(fm->get_alloc_units());
  apply_for_bitset_range(
    0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
      bs.set(pos);
    });
  if (repair) {
    repairer.get_space_usage_tracker().init(
      bdev->get_size(),
      min_alloc_size);
  }

  if (bluefs) {
    if (cct->_conf->bluestore_bluefs_db_compatibility) {
      interval_set<uint64_t> bluefs_extents_db;
      bufferlist bl;
      db->get(PREFIX_SUPER, "bluefs_extents", &bl);
      auto p = bl.cbegin();
      auto prev_errors = errors;
      try {
	decode(bluefs_extents_db, p);
	bluefs_extents_db.union_of(bluefs_extents);
	bluefs_extents_db.subtract(bluefs_extents);
	if (!bluefs_extents_db.empty()) {
	  derr << "fsck error: bluefs_extents inconsistency, "
	       << "downgrade to previous releases might be broken."
	       << dendl;
	  ++errors;
	}
      }
      catch (buffer::error& e) {
	derr << "fsck error: failed to retrieve bluefs_extents from kv" << dendl;
	++errors;
      }
      if (errors != prev_errors && repair) {
	repairer.fix_bluefs_extents(out_of_sync_fm);
      }
    }

    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply_for_bitset_range(
	e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  bs.set(pos);
	});
    }
    int r = bluefs->fsck();
    if (r < 0) {
      return r;
    }
    if (r > 0)
      errors += r;
  }

  if (!per_pool_stat_collection) {
    const char *w;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
      w = "error";
      ++errors;
    } else {
      w = "warning";
      ++warnings;
    }
    derr << "fsck " << w << ": store not yet converted to per-pool stats"
	 << dendl;
  }
  if (!per_pool_omap) {
    const char *w;
    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
      w = "error";
      ++errors;
    } else {
      w = "warning";
      ++warnings;
    }
    derr << "fsck " << w << ": store not yet converted to per-pool omap"
	 << dendl;
  }

  // get expected statfs; reset unaffected fields to be able to compare
  statfs(&actual_statfs);
  actual_statfs.total = 0;
  actual_statfs.internally_reserved = 0;
  actual_statfs.available = 0;
  actual_statfs.internal_metadata = 0;
  actual_statfs.omap_allocated = 0;

  if (g_conf()->bluestore_debug_fsck_abort) {
    dout(1) << __func__ << " debug abort" << dendl;
    goto out_scan;
  }
  dout(1) << __func__ << " walking object keyspace" << dendl;
  {
    ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
    BlueStore::FSCK_ObjectCtx ctx(
      errors,
      warnings,
      num_objects,
      num_extents,
      num_blobs,
      num_sharded_objects,
      num_spanning_blobs,
      &used_blocks,
      &used_omap_head,
      //no need for the below lock when in non-shallow mode as
      // there is no multithreading in this case
      depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
      sb_info,
      expected_store_statfs,
      expected_pool_statfs,
      repair ? &repairer : nullptr);

    _fsck_check_objects(depth, ctx);
  }

  dout(1) << __func__ << " checking shared_blobs" << dendl;
  it = db->get_iterator(PREFIX_SHARED_BLOB);
  if (it) {
    // FIXME minor: perhaps simplify for shallow mode?
    // fill global if not overriden below
    auto expected_statfs = &expected_store_statfs;

    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      uint64_t sbid;
      if (get_key_shared_blob(key, &sbid)) {
	derr << "fsck error: bad key '" << key
	     << "' in shared blob namespace" << dendl;
	if (repair) {
	  repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
	}
	++errors;
	continue;
      }
      auto p = sb_info.find(sbid);
      if (p == sb_info.end()) {
	derr << "fsck error: found stray shared blob data for sbid 0x"
	     << std::hex << sbid << std::dec << dendl;
	if (repair) {
	  repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
	}
	++errors;
      } else {
	++num_shared_blobs;
	sb_info_t& sbi = p->second;
	bluestore_shared_blob_t shared_blob(sbid);
	bufferlist bl = it->value();
	auto blp = bl.cbegin();
	try {
	  decode(shared_blob, blp);
	} catch (buffer::error& e) {
	  ++errors;
	  // Force update and don't report as missing
	  sbi.updated = sbi.passed = true;

	  derr << "fsck error: failed to decode Shared Blob"
	       << pretty_binary_string(it->key()) << dendl;
	  if (repair) {
	    dout(20) << __func__ << " undecodable Shared Blob, key:'"
		     << pretty_binary_string(it->key())
		     << "', removing" << dendl;
	    repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
	  }
	  continue;
	}
	dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
	if (shared_blob.ref_map != sbi.ref_map) {
	  derr << "fsck error: shared blob 0x" << std::hex << sbid
	       << std::dec << " ref_map " << shared_blob.ref_map
	       << " != expected " << sbi.ref_map << dendl;
	  sbi.updated = true; // will update later in repair mode only!
	  ++errors;
	}
	PExtentVector extents;
	for (auto &r : shared_blob.ref_map.ref_map) {
	  extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
	}
	if (per_pool_stat_collection || repair) {
	  expected_statfs = &expected_pool_statfs[sbi.pool_id];
	}
	errors += _fsck_check_extents(sbi.cid,
				      p->second.oids.front(),
				      extents,
				      p->second.compressed,
				      used_blocks,
				      fm->get_alloc_size(),
				      repair ? &repairer : nullptr,
				      *expected_statfs,
				      depth);
	sbi.passed = true;
      }
    }
  } // if (it)

  if (repair && repairer.preprocess_misreference(db)) {

    dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
    auto& space_tracker = repairer.get_space_usage_tracker();
    auto& misref_extents = repairer.get_misreferences();
    interval_set<uint64_t> to_release;
    it = db->get_iterator(PREFIX_OBJ);
    if (it) {
      // fill global if not overriden below
      auto expected_statfs = &expected_store_statfs;

      CollectionRef c;
      spg_t pgid;
      KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
      bool bypass_rest = false;
      for (it->lower_bound(string()); it->valid() && !bypass_rest;
	   it->next()) {
	dout(30) << __func__ << " key "
		 << pretty_binary_string(it->key()) << dendl;
	if (is_extent_shard_key(it->key())) {
	  continue;
	}

	ghobject_t oid;
	int r = get_key_object(it->key(), &oid);
	if (r < 0 || !space_tracker.is_used(oid)) {
	  continue;
	}

	if (!c ||
	    oid.shard_id != pgid.shard ||
	    oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
	    !c->contains(oid)) {
	  c = nullptr;
	  for (auto& p : coll_map) {
	    if (p.second->contains(oid)) {
	      c = p.second;
	      break;
	    }
	  }
	  if (!c) {
	    continue;
	  }
	  if (per_pool_stat_collection || repair) {
	    auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
	    expected_statfs = &expected_pool_statfs[pool_id];
	  }
	}
	if (!space_tracker.is_used(c->cid)) {
	  continue;
	}

	dout(20) << __func__ << " check misreference for col:" << c->cid
		 << " obj:" << oid << dendl;

	OnodeRef o;
	o.reset(Onode::decode(c, oid, it->key(), it->value()));
	o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
	mempool::bluestore_fsck::set<BlobRef> blobs;

	for (auto& e : o->extent_map.extent_map) {
	  blobs.insert(e.blob);
	}
	bool need_onode_update = false;
	bool first_dump = true;
	for (auto b : blobs) {
	  bool broken_blob = false;
	  auto& pextents = b->dirty_blob().dirty_extents();
	  for (auto& e : pextents) {
	    if (!e.is_valid()) {
	      continue;
	    }
	    // for the sake of simplicity and proper shared blob handling
	    // always rewrite the whole blob even when it's partially
	    // misreferenced.
	    if (misref_extents.intersects(e.offset, e.length)) {
	      broken_blob = true;
	      break;
	    }
	  }
	  if (!broken_blob)
	    continue;
	  if (first_dump) {
	    first_dump = false;
	    _dump_onode<10>(cct, *o);
	  }
	  bool compressed = b->get_blob().is_compressed();
	  need_onode_update = true;
	  dout(10) << __func__
		   << " fix misreferences in oid:" << oid
		   << " " << *b << dendl;
	  uint64_t b_off = 0;
	  PExtentVector pext_to_release;
	  pext_to_release.reserve(pextents.size());
	  // rewriting all valid pextents
	  for (auto e = pextents.begin(); e != pextents.end();
	       b_off += e->length, e++) {
	    if (!e->is_valid()) {
	      continue;
	    }
	    PExtentVector exts;
	    int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
						0, 0, &exts);
	    if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
	      derr << __func__
		   << " failed to allocate 0x" << std::hex << e->length
		   << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
		   << " min_alloc_size 0x" << min_alloc_size
		   << " available 0x " << alloc->get_free()
		   << std::dec << dendl;
	      if (alloc_len > 0) {
		alloc->release(exts);
	      }
	      bypass_rest = true;
	      break;
	    }
	    expected_statfs->allocated += e->length;
	    if (compressed) {
	      expected_statfs->data_compressed_allocated += e->length;
	    }

	    bufferlist bl;
	    IOContext ioc(cct, NULL, true); // allow EIO
	    r = bdev->read(e->offset, e->length, &bl, &ioc, false);
	    if (r < 0) {
	      derr << __func__ << " failed to read from 0x" << std::hex << e->offset
		   << "~" << e->length << std::dec << dendl;
	      ceph_abort_msg("read failed, wtf");
	    }
	    pext_to_release.push_back(*e);
	    e = pextents.erase(e);
	    e = pextents.insert(e, exts.begin(), exts.end());
	    b->get_blob().map_bl(
	      b_off, bl,
	      [&](uint64_t offset, bufferlist& t) {
		int r = bdev->write(offset, t, false);
		ceph_assert(r == 0);
	      });
	    e += exts.size() - 1;
	    for (auto& p : exts) {
	      fm->allocate(p.offset, p.length, txn);
	    }
	  } // for (auto e = pextents.begin(); e != pextents.end(); e++) {

	  if (b->get_blob().is_shared()) {
	    b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);

	    auto sb_it = sb_info.find(b->shared_blob->get_sbid());
	    ceph_assert(sb_it != sb_info.end());
	    sb_info_t& sbi = sb_it->second;

	    for (auto& r : sbi.ref_map.ref_map) {
	      expected_statfs->allocated -= r.second.length;
	      if (sbi.compressed) {
		// NB: it's crucial to use compressed flag from sb_info_t
		// as we originally used that value while accumulating
		// expected_statfs
		expected_statfs->data_compressed_allocated -= r.second.length;
	      }
	    }
	    sbi.updated = sbi.passed = true;
	    sbi.ref_map.clear();

	    // relying on blob's pextents to decide what to release.
	    for (auto& p : pext_to_release) {
	      to_release.union_insert(p.offset, p.length);
	    }
	  } else {
	    for (auto& p : pext_to_release) {
	      expected_statfs->allocated -= p.length;
	      if (compressed) {
		expected_statfs->data_compressed_allocated -= p.length;
	      }
	      to_release.union_insert(p.offset, p.length);
	    }
	  }
	} // for(auto b : blobs)
	if (need_onode_update) {
	  o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
	  _record_onode(o, txn);
	}
      } // for (it->lower_bound(string()); it->valid(); it->next())

      for (auto it = to_release.begin(); it != to_release.end(); ++it) {
	dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
		 << "~" << it.get_len() << std::dec << dendl;
	fm->release(it.get_start(), it.get_len(), txn);
      }
      alloc->release(to_release);
      to_release.clear();
    } // if (it)
  } //if (repair && repairer.preprocess_misreference()) {

  if (depth != FSCK_SHALLOW) {
    for (auto &p : sb_info) {
      sb_info_t& sbi = p.second;
      if (!sbi.passed) {
	derr << "fsck error: missing " << *sbi.sb << dendl;
	++errors;
      }
      if (repair && (!sbi.passed || sbi.updated)) {
	auto sbid = p.first;
	if (sbi.ref_map.empty()) {
	  ceph_assert(sbi.passed);
	  dout(20) << __func__ << " " << *sbi.sb
		   << " is empty, removing" << dendl;
	  repairer.fix_shared_blob(db, sbid, nullptr);
	} else {
	  bufferlist bl;
	  bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
	  encode(persistent, bl);
	  dout(20) << __func__ << " " << *sbi.sb
		   << " is " << bl.length() << " bytes, updating" << dendl;

	  repairer.fix_shared_blob(db, sbid, &bl);
	}
      }
    }
  }
  sb_info.clear();

  // check global stats only if fscking (not repairing) w/o per-pool stats
  if (!per_pool_stat_collection &&
      !repair &&
      !(actual_statfs == expected_store_statfs)) {
    derr << "fsck error: actual " << actual_statfs
	 << " != expected " << expected_store_statfs << dendl;
    if (repair) {
      repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
			  expected_store_statfs);
    }
    ++errors;
  }

  dout(1) << __func__ << " checking pool_statfs" << dendl;
  _fsck_check_pool_statfs(expected_pool_statfs,
			  errors, warnings, repair ? &repairer : nullptr);

  if (depth != FSCK_SHALLOW) {
    dout(1) << __func__ << " checking for stray omap data " << dendl;
    it = db->get_iterator(PREFIX_OMAP);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
	uint64_t omap_head;
	_key_decode_u64(it->key().c_str(), &omap_head);
	if (used_omap_head.count(omap_head) == 0 &&
	    omap_head != last_omap_head) {
	  fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	    << "fsck error: found stray omap data on omap_head "
	    << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
	  ++errors;
	  last_omap_head = omap_head;
	}
      }
    }
    it = db->get_iterator(PREFIX_PGMETA_OMAP);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
	uint64_t omap_head;
	_key_decode_u64(it->key().c_str(), &omap_head);
	if (used_omap_head.count(omap_head) == 0 &&
	    omap_head != last_omap_head) {
	  fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	    << "fsck error: found stray (pgmeta) omap data on omap_head "
	    << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
	  last_omap_head = omap_head;
	  ++errors;
	}
      }
    }
    it = db->get_iterator(PREFIX_PERPOOL_OMAP);
    if (it) {
      uint64_t last_omap_head = 0;
      for (it->lower_bound(string()); it->valid(); it->next()) {
	uint64_t pool;
	uint64_t omap_head;
	string k = it->key();
	const char *c = k.c_str();
	c = _key_decode_u64(c, &pool);
	c = _key_decode_u64(c, &omap_head);
	if (used_omap_head.count(omap_head) == 0 &&
	    omap_head != last_omap_head) {
	  fsck_derr(errors, MAX_FSCK_ERROR_LINES)
	    << "fsck error: found stray (per-pool) omap data on omap_head "
	    << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
	  ++errors;
	  last_omap_head = omap_head;
	}
      }
    }
    dout(1) << __func__ << " checking deferred events" << dendl;
    it = db->get_iterator(PREFIX_DEFERRED);
    if (it) {
      for (it->lower_bound(string()); it->valid(); it->next()) {
	bufferlist bl = it->value();
	auto p = bl.cbegin();
	bluestore_deferred_transaction_t wt;
	try {
	  decode(wt, p);
	} catch (buffer::error& e) {
	  derr << "fsck error: failed to decode deferred txn "
	       << pretty_binary_string(it->key()) << dendl;
	  if (repair) {
	    dout(20) << __func__ << " undecodable deferred TXN record, key: '"
		     << pretty_binary_string(it->key())
		     << "', removing" << dendl;
	    repairer.remove_key(db, PREFIX_DEFERRED, it->key());
	    continue;
	  }
	  ++errors;
	  goto out_scan;
	}
	dout(20) << __func__ << " deferred " << wt.seq
		 << " ops " << wt.ops.size()
		 << " released 0x" << std::hex << wt.released << std::dec << dendl;
	for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
	  apply_for_bitset_range(
	    e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
	    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	      bs.set(pos);
	    });
	}
      }
    }

    dout(1) << __func__ << " checking freelist vs allocated" << dendl;
    {
      // remove bluefs_extents from used set since the freelist doesn't
      // know they are allocated.
      for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
	apply_for_bitset_range(
	  e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
	  [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	    bs.reset(pos);
	  });
      }
      fm->enumerate_reset();
      uint64_t offset, length;
      while (fm->enumerate_next(db, &offset, &length)) {
	bool intersects = false;
	apply_for_bitset_range(
	  offset, length, fm->get_alloc_size(), used_blocks,
	  [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	    if (bs.test(pos)) {
	      if (offset == SUPER_RESERVED &&
		  length == min_alloc_size - SUPER_RESERVED) {
		// this is due to the change just after luminous to min_alloc_size
		// granularity allocations, and our baked in assumption at the top
		// of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
		// (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
		// since we will never allocate this region below min_alloc_size.
		dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
			 << " and min_alloc_size, 0x" << std::hex << offset << "~"
			 << length << std::dec << dendl;
	      } else {
		intersects = true;
		if (repair) {
		  repairer.fix_false_free(db, fm,
					  pos * min_alloc_size,
					  min_alloc_size);
		}
	      }
	    } else {
	      bs.set(pos);
	    }
	  });
	if (intersects) {
	  derr << "fsck error: free extent 0x" << std::hex << offset
	       << "~" << length << std::dec
	       << " intersects allocated blocks" << dendl;
	  ++errors;
	}
      }
      fm->enumerate_reset();
      size_t count = used_blocks.count();
      if (used_blocks.size() != count) {
	ceph_assert(used_blocks.size() > count);
	used_blocks.flip();
	size_t start = used_blocks.find_first();
	while (start != decltype(used_blocks)::npos) {
	  size_t cur = start;
	  while (true) {
	    size_t next = used_blocks.find_next(cur);
	    if (next != cur + 1) {
	      ++errors;
	      derr << "fsck error: leaked extent 0x" << std::hex
		   << ((uint64_t)start * fm->get_alloc_size()) << "~"
		   << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
		   << dendl;
	      if (repair) {
		repairer.fix_leaked(db,
				    fm,
				    start * min_alloc_size,
				    (cur + 1 - start) * min_alloc_size);
	      }
	      start = next;
	      break;
	    }
	    cur = next;
	  }
	}
	used_blocks.flip();
      }
    }
  }
  if (repair) {
    if (!per_pool_omap) {
      dout(5) << __func__ << " marking per_pool_omap=1" << dendl;
      repairer.fix_per_pool_omap(db);
    }

    dout(5) << __func__ << " applying repair results" << dendl;
    repaired = repairer.apply(db);
    dout(5) << __func__ << " repair applied" << dendl;
  }

out_scan:
  dout(2) << __func__ << " " << num_objects << " objects, "
	  << num_sharded_objects << " of them sharded. "
	  << dendl;
  dout(2) << __func__ << " " << num_extents << " extents to "
	  << num_blobs << " blobs, "
	  << num_spanning_blobs << " spanning, "
	  << num_shared_blobs << " shared."
	  << dendl;

  utime_t duration = ceph_clock_now() - start;
  dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
	  << warnings << " warnings, "
	  << repaired << " repaired, "
	  << (errors + warnings - (int)repaired) << " remaining in "
	  << duration << " seconds" << dendl;

  // In non-repair mode we should return error count only as
  // it indicates if store status is OK.
  // In repair mode both errors and warnings are taken into account
  // since repaired counter relates to them both.
  return repair ? errors + warnings - (int)repaired : errors;
}
/// methods to inject various errors fsck can repair
void BlueStore::inject_broken_shared_blob_key(const string& key,
  const bufferlist& bl)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->set(PREFIX_SHARED_BLOB, key, bl);
  db->submit_transaction_sync(txn);
};

void BlueStore::inject_leaked(uint64_t len)
{
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  PExtentVector exts;
  int64_t alloc_len = alloc->allocate(len, min_alloc_size,
				      min_alloc_size * 256, 0, &exts);
  ceph_assert(alloc_len >= (int64_t)len);
  for (auto& p : exts) {
    fm->allocate(p.offset, p.length, txn);
  }
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
{
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{c->lock}; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
  }

  bool injected = false;
  txn = db->get_transaction();
  auto& em = o->extent_map.extent_map;
  std::vector<const PExtentVector*> v;
  if (em.size()) {
    v.push_back(&em.begin()->blob->get_blob().get_extents());
  }
  if (em.size() > 1) {
    auto it = em.end();
    --it;
    v.push_back(&(it->blob->get_blob().get_extents()));
  }
  for (auto pext : v) {
    if (pext->size()) {
      auto p = pext->begin();
      while (p != pext->end()) {
	if (p->is_valid()) {
	  dout(20) << __func__ << " release 0x" << std::hex << p->offset
		   << "~" << p->length << std::dec << dendl;
	  fm->release(p->offset, p->length, txn);
	  injected = true;
	  break;
	}
	++p;
      }
    }
  }

  ceph_assert(injected);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_legacy_omap()
{
  dout(1) << __func__ << dendl;
  per_pool_omap = false;
  KeyValueDB::Transaction txn;
  txn = db->get_transaction();
  txn->rmkey(PREFIX_SUPER, "per_pool_omap");
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
{
  dout(1) << __func__ << " "
          << cid << " " << oid
          << dendl;
  KeyValueDB::Transaction txn;
  OnodeRef o;
  CollectionRef c = _get_collection(cid);
  ceph_assert(c);
  {
    std::unique_lock l{ c->lock }; // just to avoid internal asserts
    o = c->get_onode(oid, false);
    ceph_assert(o);
  }
  o->onode.clear_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP |
                      bluestore_onode_t::FLAG_PGMETA_OMAP);
  txn = db->get_transaction();
  _record_onode(o, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
{
  BlueStoreRepairer repairer;
  repairer.fix_statfs(db, key, new_statfs);
  repairer.apply(db);
}
void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
{
  KeyValueDB::Transaction t = db->get_transaction();
  volatile_statfs v;
  v = new_statfs;
  bufferlist bl;
  v.encode(bl);
  t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
  db->submit_transaction_sync(t);
}
void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
                                    coll_t cid2, ghobject_t oid2,
                                    uint64_t offset)
{
  OnodeRef o1;
  CollectionRef c1 = _get_collection(cid1);
  ceph_assert(c1);
  {
    std::unique_lock l{c1->lock}; // just to avoid internal asserts
    o1 = c1->get_onode(oid1, false);
    ceph_assert(o1);
    o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
  }
  OnodeRef o2;
  CollectionRef c2 = _get_collection(cid2);
  ceph_assert(c2);
  {
    std::unique_lock l{c2->lock}; // just to avoid internal asserts
    o2 = c2->get_onode(oid2, false);
    ceph_assert(o2);
    o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
  }
  Extent& e1 = *(o1->extent_map.seek_lextent(offset));
  Extent& e2 = *(o2->extent_map.seek_lextent(offset));

  // require onode/extent layout to be the same (and simple)
  // to make things easier
  ceph_assert(o1->onode.extent_map_shards.empty());
  ceph_assert(o2->onode.extent_map_shards.empty());
  ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
  ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
  ceph_assert(e1.logical_offset == e2.logical_offset);
  ceph_assert(e1.length == e2.length);
  ceph_assert(e1.blob_offset == e2.blob_offset);

  KeyValueDB::Transaction txn;
  txn = db->get_transaction();

  // along with misreference error this will create space leak errors
  e2.blob->dirty_blob() = e1.blob->get_blob();
  o2->extent_map.dirty_range(offset, e2.length);
  o2->extent_map.update(txn, false);

  _record_onode(o2, txn);
  db->submit_transaction_sync(txn);
}
void BlueStore::collect_metadata(map<string,string> *pm)
{
  dout(10) << __func__ << dendl;
  bdev->collect_metadata("bluestore_bdev_", pm);
  if (bluefs) {
    (*pm)["bluefs"] = "1";
    // this value is for backward compatibility only
    (*pm)["bluefs_single_shared_device"] = \
      stringify((int)bluefs_layout.single_shared_device());
    (*pm)["bluefs_dedicated_db"] = \
      stringify((int)bluefs_layout.dedicated_db);
    (*pm)["bluefs_dedicated_wal"] = \
      stringify((int)bluefs_layout.dedicated_wal);
    bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
  } else {
    (*pm)["bluefs"] = "0";
  }

  // report numa mapping for underlying devices
  int node = -1;
  set<int> nodes;
  set<string> failed;
  int r = get_numa_node(&node, &nodes, &failed);
  if (r >= 0) {
    if (!failed.empty()) {
      (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
    }
    if (!nodes.empty()) {
      dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
      (*pm)["objectstore_numa_nodes"] = stringify(nodes);
    }
    if (node >= 0) {
      (*pm)["objectstore_numa_node"] = stringify(node);
    }
  }
}
int BlueStore::get_numa_node(
  int *final_node,
  set<int> *out_nodes,
  set<string> *out_failed)
{
  int node = -1;
  set<string> devices;
  get_devices(&devices);
  set<int> nodes;
  set<string> failed;
  for (auto& devname : devices) {
    int n;
    BlkDev bdev(devname);
    int r = bdev.get_numa_node(&n);
    if (r < 0) {
      dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
               << dendl;
      failed.insert(devname);
      continue;
    }
    dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
             << dendl;
    nodes.insert(n);
    if (node < 0) {
      node = n;
    }
  }
  if (node >= 0 && nodes.size() == 1 && failed.empty()) {
    *final_node = node;
  }
  if (out_nodes) {
    *out_nodes = nodes;
  }
  if (out_failed) {
    *out_failed = failed;
  }
  return 0;
}
int BlueStore::get_devices(set<string> *ls)
{
  if (bdev) {
    bdev->get_devices(ls);
    if (bluefs) {
      bluefs->get_devices(ls);
    }
    return 0;
  }

  // grumble, we haven't started up yet.
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  r = _minimal_open_bluefs(false);
  if (r < 0)
    goto out_bdev;
  bdev->get_devices(ls);
  if (bluefs) {
    bluefs->get_devices(ls);
  }
  _minimal_close_bluefs();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return 0;
}
void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
{
  buf->reset();

  buf->omap_allocated =
    db->estimate_prefix_size(PREFIX_OMAP, string()) +
    db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());

  uint64_t bfree = alloc->get_free();

  if (bluefs) {
    int64_t bluefs_total = bluefs->get_total(bluefs_layout.shared_bdev);
    int64_t bluefs_free = bluefs->get_free(bluefs_layout.shared_bdev);
    // part of our shared device is "free" according to BlueFS, but we
    // can't touch bluestore_bluefs_min of it.
    int64_t shared_available = std::min(
      bluefs_free,
      int64_t(bluefs_total - cct->_conf->bluestore_bluefs_min));
    buf->internally_reserved = bluefs_total - shared_available;
    if (shared_available > 0) {
      bfree += shared_available;
    }
    // include dedicated db, too, if that isn't the shared device.
    if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
      buf->total += bluefs->get_total(BlueFS::BDEV_DB);
    }
    // call any non-omap bluefs space "internal metadata"
    buf->internal_metadata =
      std::max(bluefs->get_used(), (uint64_t)cct->_conf->bluestore_bluefs_min)
      - buf->omap_allocated;
  }

  uint64_t thin_total, thin_avail;
  if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
    buf->total += thin_total;

    // we are limited by both the size of the virtual device and the
    // underlying physical device.
    bfree = std::min(bfree, thin_avail);

    buf->allocated = thin_total - thin_avail;
  } else {
    buf->total += bdev->get_size();
  }
  buf->available = bfree;
}
int BlueStore::statfs(struct store_statfs_t *buf,
                      osd_alert_list_t* alerts)
{
  if (alerts) {
    alerts->clear();
    _log_alerts(*alerts);
  }
  _get_statfs_overall(buf);
  {
    std::lock_guard l(vstatfs_lock);
    buf->allocated = vstatfs.allocated();
    buf->data_stored = vstatfs.stored();
    buf->data_compressed = vstatfs.compressed();
    buf->data_compressed_original = vstatfs.compressed_original();
    buf->data_compressed_allocated = vstatfs.compressed_allocated();
  }

  dout(20) << __func__ << " " << *buf << dendl;
  return 0;
}
int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
                           bool *out_per_pool_omap)
{
  dout(20) << __func__ << " pool " << pool_id << dendl;

  if (!per_pool_stat_collection) {
    dout(20) << __func__ << " not supported in legacy mode " << dendl;
    return -ENOTSUP;
  }
  buf->reset();

  {
    std::lock_guard l(vstatfs_lock);
    osd_pools[pool_id].publish(buf);
  }

  string key_prefix;
  _key_encode_u64(pool_id, &key_prefix);
  buf->omap_allocated = db->estimate_prefix_size(PREFIX_PERPOOL_OMAP,
                                                 key_prefix);
  *out_per_pool_omap = per_pool_omap;

  dout(10) << __func__ << *buf << dendl;
  return 0;
}
void BlueStore::_check_legacy_statfs_alert()
{
  string s;
  if (!per_pool_stat_collection &&
      cct->_conf->bluestore_warn_on_legacy_statfs) {
    s = "legacy statfs reporting detected, "
        "suggest to run store repair to get consistent statistic reports";
  }
  std::lock_guard l(qlock);
  legacy_statfs_alert = s;
}

void BlueStore::_check_no_per_pool_omap_alert()
{
  string s;
  if (!per_pool_omap &&
      cct->_conf->bluestore_warn_on_no_per_pool_omap) {
    s = "legacy (not per-pool) omap detected, "
        "suggest to run store repair to measure per-pool omap usage";
  }
  std::lock_guard l(qlock);
  no_per_pool_omap_alert = s;
}
BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
{
  std::shared_lock l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}

void BlueStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
  // _reap_collections and this run in the same thread,
  // so no lock is needed.
  removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
  list<CollectionRef> removed_colls;
  {
    // _queue_reap_collection and this run in the same thread,
    // so no lock is needed.
    if (!removed_collections.empty())
      removed_colls.swap(removed_collections);
    else
      return;
  }

  list<CollectionRef>::iterator p = removed_colls.begin();
  while (p != removed_colls.end()) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
    if (c->onode_map.map_any([&](OnodeRef o) {
          ceph_assert(!o->exists);
          if (o->flushing_count.load()) {
            dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
                     << " flush_txns " << o->flushing_count << dendl;
            return true;
          }
          return false;
        })) {
      ++p;
      continue;
    }
    c->onode_map.clear();
    p = removed_colls.erase(p);
    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
  }
  if (removed_colls.empty()) {
    dout(10) << __func__ << " all reaped" << dendl;
  } else {
    removed_collections.splice(removed_collections.begin(), removed_colls);
  }
}
void BlueStore::_update_cache_logger()
{
  uint64_t num_onodes = 0;
  uint64_t num_pinned_onodes = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_buffers = 0;
  uint64_t num_buffer_bytes = 0;
  for (auto c : onode_cache_shards) {
    c->add_stats(&num_onodes, &num_pinned_onodes);
  }
  for (auto c : buffer_cache_shards) {
    c->add_stats(&num_extents, &num_blobs,
                 &num_buffers, &num_buffer_bytes);
  }
  logger->set(l_bluestore_onodes, num_onodes);
  logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
  logger->set(l_bluestore_extents, num_extents);
  logger->set(l_bluestore_blobs, num_blobs);
  logger->set(l_bluestore_buffers, num_buffers);
  logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
}
ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
{
  return _get_collection(cid);
}

ObjectStore::CollectionHandle BlueStore::create_new_collection(
  const coll_t& cid)
{
  std::unique_lock l{coll_lock};
  auto c = ceph::make_ref<Collection>(
    this,
    onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
    buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
    cid);
  new_coll_map[cid] = c;
  _osr_attach(c.get());
  return c;
}

void BlueStore::set_collection_commit_queue(
  const coll_t& cid,
  ContextQueue *commit_queue)
{
  if (commit_queue) {
    std::shared_lock l(coll_lock);
    if (coll_map.count(cid)) {
      coll_map[cid]->commit_queue = commit_queue;
    } else if (new_coll_map.count(cid)) {
      new_coll_map[cid]->commit_queue = commit_queue;
    }
  }
}
bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return false;

  bool r = true;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      r = false;
  }

  return r;
}
int BlueStore::stat(
  CollectionHandle &c_,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;

  {
    std::shared_lock l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      return -ENOENT;
    st->st_size = o->onode.size;
    st->st_blksize = 4096;
    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
    st->st_nlink = 1;
  }

  int r = 0;
  if (_debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  return r;
}
int BlueStore::set_collection_opts(
  CollectionHandle& ch,
  const pool_opts_t& opts)
{
  Collection *c = static_cast<Collection *>(ch.get());
  dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
  if (!c->exists)
    return -ENOENT;
  std::unique_lock l{c->lock};
  c->pool_opts = opts;
  return 0;
}
int BlueStore::read(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (offset == length && offset == 0)
      length = o->onode.size;

    r = _do_read(c, o, offset, length, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
             cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
void BlueStore::_read_cache(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  int read_cache_policy,
  ready_regions_t& ready_regions,
  blobs2read_t& blobs2read)
{
  // build a blob-wise list of the stuff to read (that isn't cached)
  unsigned left = length;
  uint64_t pos = offset;
  auto lp = o->extent_map.seek_lextent(offset);
  while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
        hole = left;
      }
      dout(30) << __func__ << "  hole 0x" << std::hex << pos << "~" << hole
               << std::dec << dendl;
      pos += hole;
      left -= hole;
      continue;
    }
    BlobRef& bptr = lp->blob;
    unsigned l_off = pos - lp->logical_offset;
    unsigned b_off = l_off + lp->blob_offset;
    unsigned b_len = std::min(left, lp->length - l_off);

    ready_regions_t cache_res;
    interval_set<uint32_t> cache_interval;
    bptr->shared_blob->bc.read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
      read_cache_policy);
    dout(20) << __func__ << "  blob " << *bptr << std::hex
             << " need 0x" << b_off << "~" << b_len
             << " cache has 0x" << cache_interval
             << std::dec << dendl;

    auto pc = cache_res.begin();
    uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
          pc->first == b_off) {
        l = pc->second.length();
        ready_regions[pos].claim(pc->second);
        dout(30) << __func__ << "    use cache 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        ++pc;
      } else {
        l = b_len;
        if (pc != cache_res.end()) {
          ceph_assert(pc->first > b_off);
          l = pc->first - b_off;
        }
        dout(30) << __func__ << "    will read 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        // merge regions
        {
          uint64_t r_off = b_off;
          uint64_t r_len = l;
          uint64_t front = r_off % chunk_size;
          if (front) {
            r_off -= front;
            r_len += front;
          }
          unsigned tail = r_len % chunk_size;
          if (tail) {
            r_len += chunk_size - tail;
          }
          bool merged = false;
          regions2read_t& r2r = blobs2read[bptr];
          if (r2r.size()) {
            read_req_t& pre = r2r.back();
            if (r_off <= (pre.r_off + pre.r_len)) {
              front += (r_off - pre.r_off);
              pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
              pre.regs.emplace_back(region_t(pos, b_off, l, front));
              merged = true;
            }
          }
          if (!merged) {
            read_req_t req(r_off, r_len);
            req.regs.emplace_back(region_t(pos, b_off, l, front));
            r2r.emplace_back(std::move(req));
          }
        }
      }
      pos += l;
      b_off += l;
      left -= l;
      b_len -= l;
    }
    ++lp;
  }
}
int BlueStore::_prepare_read_ioc(
  blobs2read_t& blobs2read,
  vector<bufferlist>* compressed_blob_bls,
  IOContext* ioc)
{
  for (auto& p : blobs2read) {
    const BlobRef& bptr = p.first;
    regions2read_t& r2r = p.second;
    dout(20) << __func__ << "  blob " << *bptr << std::hex
             << " need " << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
      if (compressed_blob_bls->empty()) {
        // ensure we avoid any reallocation on subsequent blobs
        compressed_blob_bls->reserve(blobs2read.size());
      }
      compressed_blob_bls->push_back(bufferlist());
      bufferlist& bl = compressed_blob_bls->back();
      auto r = bptr->get_blob().map(
        0, bptr->get_blob().get_ondisk_length(),
        [&](uint64_t offset, uint64_t length) {
          int r = bdev->aio_read(offset, length, &bl, ioc);
          if (r < 0)
            return r;
          return 0;
        });
      if (r < 0) {
        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
        if (r == -EIO) {
          // propagate EIO to caller
          return r;
        }
        ceph_assert(r == 0);
      }
    } else {
      // read the pieces
      for (auto& req : r2r) {
        dout(20) << __func__ << "    region 0x" << std::hex
                 << req.regs.front().logical_offset
                 << ": 0x" << req.regs.front().blob_xoffset
                 << " reading 0x" << req.r_off
                 << "~" << req.r_len << std::dec << dendl;

        // read it
        auto r = bptr->get_blob().map(
          req.r_off, req.r_len,
          [&](uint64_t offset, uint64_t length) {
            int r = bdev->aio_read(offset, length, &req.bl, ioc);
            if (r < 0)
              return r;
            return 0;
          });
        if (r < 0) {
          derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
               << dendl;
          if (r == -EIO) {
            // propagate EIO to caller
            return r;
          }
          ceph_assert(r == 0);
        }
        ceph_assert(req.bl.length() == req.r_len);
      }
    }
  }
  return 0;
}
int BlueStore::_generate_read_result_bl(
  OnodeRef o,
  uint64_t offset,
  size_t length,
  ready_regions_t& ready_regions,
  vector<bufferlist>& compressed_blob_bls,
  blobs2read_t& blobs2read,
  bool buffered,
  bool* csum_error,
  bufferlist& bl)
{
  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    const BlobRef& bptr = b2r_it->first;
    regions2read_t& r2r = b2r_it->second;
    dout(20) << __func__ << "  blob " << *bptr << std::hex
             << " need 0x" << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      ceph_assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
                       r2r.front().regs.front().logical_offset) < 0) {
        *csum_error = true;
        return -EIO;
      }
      bufferlist raw_bl;
      auto r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
        return r;
      if (buffered) {
        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
                                       raw_bl);
      }
      for (auto& req : r2r) {
        for (auto& r : req.regs) {
          ready_regions[r.logical_offset].substr_of(
            raw_bl, r.blob_xoffset, r.length);
        }
      }
    } else {
      for (auto& req : r2r) {
        if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
                         req.regs.front().logical_offset) < 0) {
          *csum_error = true;
          return -EIO;
        }
        if (buffered) {
          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
                                         req.r_off, req.bl);
        }

        // prune and keep result
        for (const auto& r : req.regs) {
          ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
        }
      }
    }
    ++b2r_it;
  }

  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
  uint64_t pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": data from 0x" << pr->first << "~" << pr->second.length()
               << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
    } else {
      uint64_t l = length - pos;
      if (pr != pr_end) {
        ceph_assert(pr->first > pos + offset);
        l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": zeros for 0x" << (pos + offset) << "~" << l
               << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
  }
  ceph_assert(bl.length() == length);
  ceph_assert(pos == length);
  ceph_assert(pr == pr_end);
  return 0;
}
int BlueStore::_do_read(
  Collection *c,
  OnodeRef o,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;
  bl.clear();

  if (offset >= o->onode.size) {
    return r;
  }

  // generally, don't buffer anything, unless the client explicitly
  // requests it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  auto start = mono_clock::now();
  o->extent_map.fault_range(db, offset, length);
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  // for deep-scrub, we only read dirty cache and bypass clean cache in
  // order to read underlying block device in case there are silent disk errors.
  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
  }

  // build a blob-wise list of the stuff to read (that isn't cached)
  ready_regions_t ready_regions;
  blobs2read_t blobs2read;
  _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);

  // read raw blob data.
  start = mono_clock::now(); // for the sake of simplicity
                             // measure the whole block below.
                             // The error isn't that much...
  vector<bufferlist> compressed_blob_bls;
  IOContext ioc(cct, NULL, true); // allow EIO
  r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
  // we always issue aio for reading, so errors other than EIO are not allowed
  if (r < 0)
    return r;

  int64_t num_ios = length;
  if (ioc.has_pending_aios()) {
    num_ios = -ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  bool csum_error = false;
  r = _generate_read_result_bl(o, offset, length, ready_regions,
                               compressed_blob_bls, blobs2read,
                               buffered, &csum_error, bl);
  if (csum_error) {
    // Handles spurious read errors caused by a kernel bug.
    // We sometimes get all-zero pages as a result of the read under
    // high memory pressure. Retrying the failing read succeeds in most
    // cases.
    // See also: http://tracker.ceph.com/issues/22464
    if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
      return -EIO;
    }
    return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
  }
  r = bl.length();
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
            << " failed " << std::dec << retry_count << " times before succeeding"
            << dendl;
  }
  return r;
}
int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
                            uint64_t logical_offset) const
{
  int bad;
  uint64_t bad_csum;
  auto start = mono_clock::now();
  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
  if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
      (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
    derr << __func__ << " injecting bluestore checksum verification error" << dendl;
    bad = blob_xoffset;
    r = -1;
    bad_csum = 0xDEADBEEF;
  }
  if (r < 0) {
    if (r == -1) {
      PExtentVector pex;
      blob->map(
        bad,
        blob->get_csum_chunk_size(),
        [&](uint64_t offset, uint64_t length) {
          pex.emplace_back(bluestore_pextent_t(offset, length));
          return 0;
        });
      derr << __func__ << " bad "
           << Checksummer::get_csum_type_string(blob->csum_type)
           << "/0x" << std::hex << blob->get_csum_chunk_size()
           << " checksum at blob offset 0x" << bad
           << ", got 0x" << bad_csum << ", expected 0x"
           << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
           << ", device location " << pex
           << ", logical extent 0x" << std::hex
           << (logical_offset + bad - blob_xoffset) << "~"
           << blob->get_csum_chunk_size() << std::dec
           << ", object " << o->oid
           << dendl;
    } else {
      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
    }
  }
  log_latency(__func__,
    l_bluestore_csum_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  if (cct->_conf->bluestore_ignore_data_csum) {
    return 0;
  }
  return r;
}
int BlueStore::_decompress(bufferlist& source, bufferlist* result)
{
  int r = 0;
  auto start = mono_clock::now();
  auto i = source.cbegin();
  bluestore_compression_header_t chdr;
  decode(chdr, i);
  int alg = int(chdr.type);
  CompressorRef cp = compressor;
  if (!cp || (int)cp->get_type() != alg) {
    cp = Compressor::create(cct, alg);
  }

  if (!cp.get()) {
    // if the compressor isn't available we have to error out:
    // we cannot return the decompressed data otherwise.
    const char* alg_name = Compressor::get_comp_alg_name(alg);
    derr << __func__ << " can't load decompressor " << alg_name << dendl;
    _set_compression_alert(false, alg_name);
    r = -EIO;
  } else {
    r = cp->decompress(i, chdr.length, *result);
    if (r < 0) {
      derr << __func__ << " decompression failed with exit code " << r << dendl;
      r = -EIO;
    }
  }
  log_latency(__func__,
    l_bluestore_decompress_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
// this stores fiemap into interval_set, other variations
// use it internally
int BlueStore::_fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  interval_set<uint64_t>& destset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      return -ENOENT;
    }
    _dump_onode<30>(cct, *o);

    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
             << " size 0x" << o->onode.size << std::dec << dendl;

    boost::intrusive::set<Extent>::iterator ep, eend;
    if (offset >= o->onode.size)
      goto out;

    if (offset + length > o->onode.size) {
      length = o->onode.size - offset;
    }

    o->extent_map.fault_range(db, offset, length);
    eend = o->extent_map.extent_map.end();
    ep = o->extent_map.seek_lextent(offset);
    while (length > 0) {
      dout(20) << __func__ << " offset " << offset << dendl;
      if (ep != eend && ep->logical_offset + ep->length <= offset) {
        ++ep;
        continue;
      }

      uint64_t x_len = length;
      if (ep != eend && ep->logical_offset <= offset) {
        uint64_t x_off = offset - ep->logical_offset;
        x_len = std::min(x_len, ep->length - x_off);
        dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
                 << x_len << std::dec << " blob " << ep->blob << dendl;
        destset.insert(offset, x_len);
        length -= x_len;
        offset += x_len;
        if (x_off + x_len == ep->length)
          ++ep;
        continue;
      }
      if (ep != eend &&
          ep->logical_offset > offset &&
          ep->logical_offset - offset < x_len) {
        x_len = ep->logical_offset - offset;
      }
      offset += x_len;
      length -= x_len;
    }
  }

 out:
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size = 0x(" << destset << ")" << std::dec << dendl;
  return 0;
}
int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    encode(m, bl);
  }
  return r;
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  map<uint64_t, uint64_t>& destmap)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    destmap = std::move(m).detach();
  }
  return r;
}
int BlueStore::readv(
  CollectionHandle &c_,
  const ghobject_t& oid,
  interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags)
{
  auto start = mono_clock::now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " fiemap " << m
           << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    std::shared_lock l(c->lock);
    auto start1 = mono_clock::now();
    OnodeRef o = c->get_onode(oid, false);
    log_latency("get_onode@read",
      l_bluestore_read_onode_meta_lat,
      mono_clock::now() - start1,
      cct->_conf->bluestore_log_op_age);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    if (m.empty()) {
      r = 0;
      goto out;
    }
    r = _do_readv(c, o, m, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
             cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " fiemap " << m << std::dec
           << " = " << r << dendl;
  log_latency(__func__,
    l_bluestore_read_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  return r;
}
int BlueStore::_do_readv(
  Collection *c,
  OnodeRef o,
  const interval_set<uint64_t>& m,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " fiemap " << m << std::hex
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;

  // generally, don't buffer anything, unless the client explicitly
  // requests it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }
  // this method must be idempotent since we may call it several times
  // before we finally read the expected result.
  bl.clear();

  // call fiemap first!
  ceph_assert(m.range_start() <= o->onode.size);
  ceph_assert(m.range_end() <= o->onode.size);
  auto start = mono_clock::now();
  o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
  log_latency(__func__,
    l_bluestore_read_onode_meta_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age);
  _dump_onode<30>(cct, *o);

  IOContext ioc(cct, NULL, true); // allow EIO
  vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
  raw_results.reserve(m.num_intervals());
  int i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    raw_results.push_back({});
    _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
                std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
    r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
    // we always issue aio for reading, so errors other than EIO are not allowed
    if (r < 0)
      return r;
  }

  auto num_ios = m.size();
  if (ioc.has_pending_aios()) {
    num_ios = ioc.get_num_ios();
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      ceph_assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  log_latency_fn(__func__,
    l_bluestore_read_wait_aio_lat,
    mono_clock::now() - start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
  );

  ceph_assert(raw_results.size() == (size_t)m.num_intervals());
  i = 0;
  for (auto p = m.begin(); p != m.end(); p++, i++) {
    bool csum_error = false;
    bufferlist t;
    r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
                                 std::get<0>(raw_results[i]),
                                 std::get<1>(raw_results[i]),
                                 std::get<2>(raw_results[i]),
                                 buffered, &csum_error, t);
    if (csum_error) {
      // Handles spurious read errors caused by a kernel bug.
      // We sometimes get all-zero pages as a result of the read under
      // high memory pressure. Retrying the failing read succeeds in most
      // cases.
      // See also: http://tracker.ceph.com/issues/22464
      if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
        return -EIO;
      }
      return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
    }
    bl.claim_append(t);
  }
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read fiemap " << m
            << " failed " << retry_count << " times before succeeding"
            << dendl;
  }
  return bl.length();
}
int BlueStore::dump_onode(CollectionHandle &c_,
  const ghobject_t& oid,
  const string& section_name,
  Formatter *f)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    // FIXME minor: actually the next line isn't enough to
    // load shared blobs. Leaving as is for now..
    //
    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);

    _dump_onode<0>(cct, *o);
    f->open_object_section(section_name.c_str());
    o->dump(f);
    f->close_section();
    r = 0;
  }
 out:
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::getattr(
  CollectionHandle &c_,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);
    mempool::bluestore_cache_other::string k(name);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (!o->onode.attrs.count(k)) {
      r = -ENODATA;
      goto out;
    }
    value = o->onode.attrs[k];
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
           << " = " << r << dendl;
  return r;
}
int BlueStore::getattrs(
  CollectionHandle &c_,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    std::shared_lock l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    for (auto& i : o->onode.attrs) {
      aset.emplace(i.first.c_str(), i.second);
    }
    r = 0;
  }

 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::list_collections(vector<coll_t>& ls)
{
  std::shared_lock l(coll_lock);
  ls.reserve(coll_map.size());
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}

bool BlueStore::collection_exists(const coll_t& c)
{
  std::shared_lock l(coll_lock);
  return coll_map.count(c);
}

int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
                          &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
         << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
  return 0;
}
int BlueStore::collection_bits(CollectionHandle& ch)
{
  dout(15) << __func__ << " " << ch->cid << dendl;
  Collection *c = static_cast<Collection*>(ch.get());
  std::shared_lock l(c->lock);
  dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}
int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  c->flush();
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    std::shared_lock l(c->lock);
    r = _collection_list(c, start, end, max, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  if (!c->exists)
    return -ENOENT;

  auto start_time = mono_clock::now();
  int r = 0;
  ghobject_t static_next;
  KeyValueDB::Iterator it;
  string temp_start_key, temp_end_key;
  string start_key, end_key;
  bool set_next = false;
  string pend;
  bool temp;

  if (!pnext)
    pnext = &static_next;

  if (start.is_max() || start.hobj.is_max()) {
    goto out;
  }
  get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
                     &start_key, &end_key);
  dout(20) << __func__
           << " range " << pretty_binary_string(temp_start_key)
           << " to " << pretty_binary_string(temp_end_key)
           << " and " << pretty_binary_string(start_key)
           << " to " << pretty_binary_string(end_key)
           << " start " << start << dendl;
  it = db->get_iterator(PREFIX_OBJ);
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(temp_start_key);
    temp = true;
  } else {
    string k;
    get_object_key(cct, start, &k);
    if (start.hobj.is_temp()) {
      temp = true;
      ceph_assert(k >= temp_start_key && k < temp_end_key);
    } else {
      temp = false;
      ceph_assert(k >= start_key && k < end_key);
    }
    dout(20) << __func__ << " start from " << pretty_binary_string(k)
             << " temp=" << (int)temp << dendl;
    it->lower_bound(k);
  }
  if (end.hobj.is_max()) {
    pend = temp ? temp_end_key : end_key;
  } else {
    get_object_key(cct, end, &end_key);
    if (end.hobj.is_temp()) {
      if (temp)
        pend = end_key;
      else
        goto out;
    } else {
      pend = temp ? temp_end_key : end_key;
    }
  }
  dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
  while (true) {
    if (!it->valid() || it->key() >= pend) {
      if (!it->valid())
        dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
        dout(20) << __func__ << " key " << pretty_binary_string(it->key())
                 << " >= " << end << dendl;
      if (temp) {
        if (end.hobj.is_temp()) {
          break;
        }
        dout(30) << __func__ << " switch to non-temp namespace" << dendl;
        temp = false;
        it->upper_bound(start_key);
        pend = end_key;
        dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
        continue;
      }
      break;
    }
    dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
    if (is_extent_shard_key(it->key())) {
      it->next();
      continue;
    }
    ghobject_t oid;
    int r = get_key_object(it->key(), &oid);
    ceph_assert(r == 0);
    dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = oid;
      set_next = true;
      break;
    }
    ls->push_back(oid);
    it->next();
  }
 out:
  if (!set_next) {
    *pnext = ghobject_t::get_max();
  }
  log_latency_fn(
    __func__,
    l_bluestore_clist_lat,
    mono_clock::now() - start_time,
    cct->_conf->bluestore_log_collection_list_age,
    [&] (const ceph::timespan& lat) {
      ostringstream ostr;
      ostr << ", lat = " << timespan_str(lat)
           << " cid =" << c->cid
           << " start " << start << " end " << end
           << " max " << max;
      return ostr.str();
    });
  return r;
}
int BlueStore::omap_get(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  return _omap_get(c, oid, header, out);
}

int BlueStore::_omap_get(
  Collection *c,               ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  r = _onode_omap_get(o, header, out);
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::_onode_omap_get(
  const OnodeRef &o,           ///< [in] Object containing omap
  bufferlist *header,          ///< [out] omap header
  map<string, bufferlist> *out ///< [out] Key to value map
  )
{
  int r = 0;
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_header(&head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
        dout(30) << __func__ << " got header" << dendl;
        *header = it->value();
      } else if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        string user_key;
        o->decode_omap_key(it->key(), &user_key);
        dout(20) << __func__ << " got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        (*out)[user_key] = it->value();
      }
      it->next();
    }
  }
 out:
  return r;
}
int BlueStore::omap_get_header(
  CollectionHandle &c_,  ///< [in] Collection containing oid
  const ghobject_t &oid, ///< [in] Object containing omap
  bufferlist *header,    ///< [out] omap header
  bool allow_eio         ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    string head;
    o->get_omap_header(&head);
    if (db->get(o->get_omap_prefix(), head, header) >= 0) {
      dout(30) << __func__ << " got header" << dendl;
    } else {
      dout(30) << __func__ << " no header" << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_keys(
  CollectionHandle &c_,  ///< [in] Collection containing oid
  const ghobject_t &oid, ///< [in] Object containing omap
  set<string> *keys      ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    o->get_omap_key(string(), &head);
    o->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      }
      string user_key;
      o->decode_omap_key(it->key(), &user_key);
      dout(20) << __func__ << " got " << pretty_binary_string(it->key())
               << " -> " << user_key << dendl;
      keys->insert(user_key);
      it->next();
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const set<string> &keys,     ///< [in] Keys to get
  map<string, bufferlist> *out ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
        dout(30) << __func__ << " got " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
        out->insert(make_pair(*p, val));
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
#ifdef WITH_SEASTAR
int BlueStore::omap_get_values(
  CollectionHandle &c_,        ///< [in] Collection containing oid
  const ghobject_t &oid,       ///< [in] Object containing omap
  const std::optional<string> &start_after, ///< [in] Keys to get
  map<string, bufferlist> *output ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
    if (!iter) {
      r = -ENOENT;
      goto out;
    }
    iter->upper_bound(*start_after);
    for (; iter->valid(); iter->next()) {
      output->insert(make_pair(iter->key(), iter->value()));
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
#endif
int BlueStore::omap_check_keys(
  CollectionHandle &c_,    ///< [in] Collection containing oid
  const ghobject_t &oid,   ///< [in] Object containing omap
  const set<string> &keys, ///< [in] Keys to check
  set<string> *out         ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  std::shared_lock l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
      final_key.resize(base_key_len); // keep prefix
      final_key += *p;
      bufferlist val;
      if (db->get(prefix, final_key, &val) >= 0) {
        dout(30) << __func__ << " have " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
        out->insert(*p);
      } else {
        dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
                 << " -> " << *p << dendl;
      }
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,  ///< [in] collection
  const ghobject_t &oid  ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  std::shared_lock l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  o->flush();
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix());
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}
// -----------------
// write helpers

uint64_t BlueStore::_get_ondisk_reserved() const {
  return round_up_to(
    std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
}
void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  ceph_assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}
int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      nid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t v;
      decode(v, p);
      blobid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
      dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
    } else {
      ceph_abort_msg("Not Support extent freelist manager");
    }
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.cbegin();
      try {
        decode(ondisk_format, p);
      } catch (buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        ceph_assert(!r);
        auto p = bl.cbegin();
        try {
          decode(compat_ondisk_format, p);
        } catch (buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(10) << __func__ << " ondisk_format " << ondisk_format
             << " compat_ondisk_format " << compat_ondisk_format
             << dendl;
  }

  if (latest_ondisk_format < compat_ondisk_format) {
    derr << __func__ << " compat_ondisk_format is "
         << compat_ondisk_format << " but we only understand version "
         << latest_ondisk_format << dendl;
    return -EPERM;
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.cbegin();
    try {
      uint64_t val;
      decode(val, p);
      min_alloc_size = val;
      min_alloc_size_order = ctz(val);
      ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
             << std::dec << dendl;
  }

  _set_per_pool_omap();

  _open_statfs();
  _set_alloc_sizes();
  _set_throttle_params();

  _set_compression();
  _set_blob_size();

  _validate_bdev();
  return 0;
}

int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  if (ondisk_format < latest_ondisk_format) {
    ceph_assert(ondisk_format > 0);
    ceph_assert(ondisk_format < latest_ondisk_format);

    KeyValueDB::Transaction t = db->get_transaction();
    if (ondisk_format == 1) {
      // changes:
      // - super: added ondisk_format
      // - super: added min_readable_ondisk_format
      // - super: added min_compat_ondisk_format
      // - super: added min_alloc_size
      // - super: removed min_min_alloc_size
      {
        bufferlist bl;
        db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
        auto p = bl.cbegin();
        try {
          uint64_t val;
          decode(val, p);
          min_alloc_size = val;
        } catch (buffer::error& e) {
          derr << __func__ << " failed to read min_min_alloc_size" << dendl;
          return -EIO;
        }
        t->set(PREFIX_SUPER, "min_alloc_size", bl);
        t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
      }
      ondisk_format = 2;
    }
    if (ondisk_format == 2) {
      // changes:
      // - onode has FLAG_PERPOOL_OMAP.  Note that we do not know that *all*
      //   onodes are using the per-pool prefix until a repair is run; at that
      //   point the per_pool_omap=1 key will be set.
      // - super: added per_pool_omap key, which indicates that *all* objects
      //   are using the new prefix and key format
      ondisk_format = 3;
    }
    if (ondisk_format == 3) {
      // changes:
      // - FreelistManager keeps meta within bdev label
      int r = _write_out_fm_meta(0);
      ceph_assert(r == 0);
      ondisk_format = 4;
    }
    // This must be the last operation
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    ceph_assert(r == 0);
  }
  dout(1) << __func__ << " done" << dendl;
  return 0;
}
void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid) {
    ceph_assert(o->exists);
    return;
  }
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
  o->exists = true;
}

uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}

void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}
BlueStore::TransContext *BlueStore::_txc_create(
  Collection *c, OpSequencer *osr,
  list<Context*> *on_commits)
{
  TransContext *txc = new TransContext(cct, c, osr, on_commits);
  txc->t = db->get_transaction();
  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}
void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // one "io" for the kv commit
  auto ios = 1 + txc->ioc.get_num_ios();
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());

  bufferlist bl;
  txc->statfs_delta.encode(bl);
  if (per_pool_stat_collection) {
    string key;
    get_pool_stat_key(txc->osd_pool_id, &key);
    txc->t->merge(PREFIX_STAT, key, bl);

    std::lock_guard l(vstatfs_lock);
    auto& stats = osd_pools[txc->osd_pool_id];
    stats += txc->statfs_delta;

    vstatfs += txc->statfs_delta; //non-persistent in this mode
  } else {
    txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);

    std::lock_guard l(vstatfs_lock);
    vstatfs += txc->statfs_delta;
  }
  txc->statfs_delta.reset();
}
void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
	     << " " << txc->get_state_name() << dendl;
    switch (txc->state) {
    case TransContext::STATE_PREPARE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
	txc->state = TransContext::STATE_AIO_WAIT;
	txc->had_ios = true;
	_txc_aio_submit(txc);
	return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      {
	mono_clock::duration lat = throttle.log_state_latency(
	  *txc, logger, l_bluestore_state_aio_wait_lat);
	if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
	  dout(0) << __func__ << " slow aio_wait, txc = " << txc
		  << ", latency = " << lat
		  << dendl;
	}
      }

      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      ceph_assert(ceph_mutex_is_locked(txc->osr->qlock));  // see _txc_finish_io
      if (txc->had_ios) {
	++txc->osr->txc_with_unstable_io;
      }
      throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
      txc->state = TransContext::STATE_KV_QUEUED;
      if (cct->_conf->bluestore_sync_submit_transaction) {
	if (txc->last_nid >= nid_max ||
	    txc->last_blobid >= blobid_max) {
	  dout(20) << __func__
		   << " last_{nid,blobid} exceeds max, submit via kv thread"
		   << dendl;
	} else if (txc->osr->kv_committing_serially) {
	  dout(20) << __func__ << " prior txc submitted via kv thread, us too"
		   << dendl;
	  // note: this is starvation-prone.  once we have a txc in a busy
	  // sequencer that is committing serially it is possible to keep
	  // submitting new transactions fast enough that we get stuck doing
	  // so.  the alternative is to block here... fixme?
	} else if (txc->osr->txc_with_unstable_io) {
	  dout(20) << __func__ << " prior txc(s) with unstable ios "
		   << txc->osr->txc_with_unstable_io.load() << dendl;
	} else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
		   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
		   == 0) {
	  dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
		   << dendl;
	} else {
	  _txc_apply_kv(txc, true);
	}
      }
      {
	std::lock_guard l(kv_lock);
	kv_queue.push_back(txc);
	if (!kv_sync_in_progress) {
	  kv_sync_in_progress = true;
	  kv_cond.notify_one();
	}
	if (txc->state != TransContext::STATE_KV_SUBMITTED) {
	  kv_queue_unsubmitted.push_back(txc);
	  ++txc->osr->kv_committing_serially;
	}
	if (txc->had_ios)
	  kv_ios++;
	kv_throttle_costs += txc->cost;
      }
      return;
    case TransContext::STATE_KV_SUBMITTED:
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
	txc->state = TransContext::STATE_DEFERRED_QUEUED;
	_deferred_queue(txc);
	return;
      }
      txc->state = TransContext::STATE_FINISHING;
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
      txc->state = TransContext::STATE_FINISHING;
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
	   << " state " << txc->get_state_name() << dendl;
      ceph_abort_msg("unexpected txc state");
      return;
    }
  }
}
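
// Summary sketch of the txc state machine driven above (added for clarity;
// transitions are performed by _txc_state_proc and the helpers it calls):
//
//   PREPARE -> AIO_WAIT (if data aios pending) -> IO_DONE
//           -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE
//           -> DEFERRED_QUEUED -> DEFERRED_CLEANUP  (deferred writes only)
//           -> FINISHING -> DONE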
void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard l(osr->qlock);
  txc->state = TransContext::STATE_IO_DONE;
  txc->ioc.release_running_aios();
  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->state < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
	       << p->get_state_name() << dendl;
      return;
    }
    if (p->state > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
	   p->state == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters) {
    osr->qcond.notify_all();
  }
}
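
// Ordering example (added for clarity): if txcs A, B, C were queued on one
// sequencer and their aios complete in the order C, A, B, then C's
// completion finds A still below IO_DONE and returns ("blocked by"); once
// A completes, the loop above advances A, then B, then C in queue order,
// so kv submission order always matches the sequencer's queue order.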
void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
	   << " onodes " << txc->onodes
	   << " shared_blobs " << txc->shared_blobs
	   << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    _record_onode(o, t);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << __func__ << " shared_blob 0x"
	       << std::hex << sbid << std::dec
	       << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      encode(*(sb->persistent), bl);
      dout(20) << __func__ << " shared_blob 0x"
	       << std::hex << sbid << std::dec
	       << " is " << bl.length() << " " << *sb << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}
void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
  os_apply_latency_ns.consume_next(
    logger.get_tavg_ns(
      l_bluestore_commit_lat));
}
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
	   << " allocated 0x" << txc->allocated
	   << " released 0x" << txc->released
	   << std::dec << dendl;

  // We have to handle the case where we allocate *and* deallocate the
  // same region in this transaction.  The freelist doesn't like that.
  // (Actually, the only thing that cares is the BitmapFreelistManager
  // debug check.  But that's important.)
  interval_set<uint64_t> tmp_allocated, tmp_released;
  interval_set<uint64_t> *pallocated = &txc->allocated;
  interval_set<uint64_t> *preleased = &txc->released;
  if (!txc->allocated.empty() && !txc->released.empty()) {
    interval_set<uint64_t> overlap;
    overlap.intersection_of(txc->allocated, txc->released);
    if (!overlap.empty()) {
      tmp_allocated = txc->allocated;
      tmp_allocated.subtract(overlap);
      tmp_released = txc->released;
      tmp_released.subtract(overlap);
      dout(20) << __func__ << " overlap 0x" << std::hex << overlap
	       << ", new allocated 0x" << tmp_allocated
	       << " released 0x" << tmp_released << std::dec
	       << dendl;
      pallocated = &tmp_allocated;
      preleased = &tmp_released;
    }
  }

  // update freelist with non-overlap sets
  for (interval_set<uint64_t>::iterator p = pallocated->begin();
       p != pallocated->end();
       ++p) {
    fm->allocate(p.get_start(), p.get_len(), t);
  }
  for (interval_set<uint64_t>::iterator p = preleased->begin();
       p != preleased->end();
       ++p) {
    dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
	     << "~" << p.get_len() << std::dec << dendl;
    fm->release(p.get_start(), p.get_len(), t);
  }

  _txc_update_store_statfs(txc);
}
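
// Worked example for the overlap handling above (illustrative values): if a
// txc allocated [0x10000, 0x20000) and released [0x18000, 0x28000), the
// overlap [0x18000, 0x20000) is subtracted from both sides first:
//
//   tmp_allocated = [0x10000, 0x18000)
//   tmp_released  = [0x20000, 0x28000)
//
// so fm->allocate()/fm->release() never see the same region twice within
// one transaction.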
void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
{
  ceph_assert(txc->state == TransContext::STATE_KV_QUEUED);
  {
#if defined(WITH_LTTNG)
    auto start = mono_clock::now();
#endif

    int r = cct->_conf->bluestore_debug_omit_kv_commit
      ? 0
      : db->submit_transaction(txc->t);
    ceph_assert(r == 0);
    txc->state = TransContext::STATE_KV_SUBMITTED;
    if (txc->osr->kv_submitted_waiters) {
      std::lock_guard l(txc->osr->qlock);
      txc->osr->qcond.notify_all();
    }

#if defined(WITH_LTTNG)
    if (txc->tracing) {
      tracepoint(
	bluestore,
	transaction_kv_submit_latency,
	txc->osr->get_sequencer_id(),
	txc->seq,
	sync_submit_transaction,
	ceph::to_seconds<double>(mono_clock::now() - start));
    }
#endif
  }

  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
	       << dendl;
      if (--o->flushing_count == 0 && o->waiting_count.load()) {
	std::lock_guard l(o->flush_lock);
	o->flush_cond.notify_all();
      }
    }
  }
}
void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;
  throttle.complete_kv(*txc);
  {
    std::lock_guard l(txc->osr->qlock);
    txc->state = TransContext::STATE_KV_DONE;
    if (txc->ch->commit_queue) {
      txc->ch->commit_queue->queue(txc->oncommits);
    } else {
      finisher.queue(txc->oncommits);
    }
  }
  throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
  log_latency_fn(
    __func__,
    l_bluestore_commit_lat,
    mono_clock::now() - txc->start,
    cct->_conf->bluestore_log_op_age,
    [&](auto lat) {
      return ", txc = " + stringify(txc);
    }
  );
}
void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  ceph_assert(txc->state == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->finish_write(txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  bool empty = false;
  bool submit_deferred = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard l(osr->qlock);
    txc->state = TransContext::STATE_DONE;
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
	       << dendl;
      if (txc->state != TransContext::STATE_DONE) {
	if (txc->state == TransContext::STATE_PREPARE &&
	    deferred_aggressive) {
	  // for _osr_drain_preceding()
	  notify = true;
	}
	if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
	    osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
	  submit_deferred = true;
	}
	break;
      }

      osr->q.pop_front();
      releasing_txc.push_back(*txc);
    }

    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }

    // only drain()/drain_preceding() need wakeup,
    // other cases use kv_submitted_waiters
    if (notify || empty) {
      osr->qcond.notify_all();
    }
  }

  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
    throttle.complete(*txc);
    delete txc;
  }

  if (submit_deferred) {
    // we're pinning memory; flush!  we could be more fine-grained here but
    // i'm not sure it's worth the bother.
    deferred_try_submit();
  }

  if (empty && osr->zombie) {
    std::lock_guard l(zombie_osr_lock);
    if (zombie_osr_set.erase(osr->cid)) {
      dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    } else {
      dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
	       << dendl;
    }
  }
}
void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // it's expected we're called with lazy_release_lock already taken!
  if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
    int r = 0;
    if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
      r = bdev->queue_discard(txc->released);
      if (r == 0) {
	dout(10) << __func__ << "(queued) " << txc << " " << std::hex
		 << txc->released << std::dec << dendl;
	goto out;
      }
    } else if (cct->_conf->bdev_enable_discard) {
      for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
	bdev->discard(p.get_start(), p.get_len());
      }
    }
    dout(10) << __func__ << "(sync) " << txc << " " << std::hex
	     << txc->released << std::dec << dendl;
    alloc->release(txc->released);
  }

out:
  txc->allocated.clear();
  txc->released.clear();
}
void BlueStore::_osr_attach(Collection *c)
{
  // note: caller has RWLock on coll_map
  auto q = coll_map.find(c->cid);
  if (q != coll_map.end()) {
    c->osr = q->second->osr;
    ldout(cct, 10) << __func__ << " " << c->cid
		   << " reusing osr " << c->osr << " from existing coll "
		   << q->second << dendl;
  } else {
    std::lock_guard l(zombie_osr_lock);
    auto p = zombie_osr_set.find(c->cid);
    if (p == zombie_osr_set.end()) {
      c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
      ldout(cct, 10) << __func__ << " " << c->cid
		     << " fresh osr " << c->osr << dendl;
    } else {
      c->osr = p->second;
      zombie_osr_set.erase(p);
      ldout(cct, 10) << __func__ << " " << c->cid
		     << " resurrecting zombie osr " << c->osr << dendl;
      c->osr->zombie = false;
    }
  }
}
void BlueStore::_osr_register_zombie(OpSequencer *osr)
{
  std::lock_guard l(zombie_osr_lock);
  dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
  osr->zombie = true;
  auto i = zombie_osr_set.emplace(osr->cid, osr);
  // this is either a new insertion or the same osr is already there
  ceph_assert(i.second || i.first->second == osr);
}
void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}
void BlueStore::_osr_drain(OpSequencer *osr)
{
  dout(10) << __func__ << " " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending && !osr->deferred_running) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    if (!kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
  osr->drain();
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}
void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  vector<OpSequencerRef> zombies;
  {
    std::shared_lock l(coll_lock);
    for (auto& i : coll_map) {
      s.insert(i.second->osr);
    }
  }
  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& i : zombie_osr_set) {
      s.insert(i.second);
      zombies.push_back(i.second);
    }
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  {
    std::lock_guard l(zombie_osr_lock);
    for (auto& osr : zombies) {
      if (zombie_osr_set.erase(osr->cid)) {
	dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
	ceph_assert(osr->q.empty());
      } else if (osr->zombie) {
	dout(10) << __func__ << " empty zombie osr " << osr
		 << " already reaped" << dendl;
	ceph_assert(osr->q.empty());
      } else {
	dout(10) << __func__ << " empty zombie osr " << osr
		 << " resurrected" << dendl;
      }
    }
  }
  dout(10) << __func__ << " done" << dendl;
}
void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  finisher.start();
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}
void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock l{kv_lock};
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock l{kv_finalize_lock};
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  ceph_assert(removed_collections.empty());
  {
    std::lock_guard l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  finisher.wait_for_empty();
  finisher.stop();
  dout(10) << __func__ << " stopped" << dendl;
}
void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
  std::unique_lock l{kv_lock};
  ceph_assert(!kv_sync_started);
  kv_sync_started = true;
  kv_cond.notify_all();
  while (true) {
    ceph_assert(kv_committing.empty());
    if (kv_queue.empty() &&
	((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
	 !deferred_aggressive)) {
      if (kv_stop)
	break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_sync_in_progress = false;
      kv_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      uint64_t aios = 0, costs = 0;

      dout(20) << __func__ << " committing " << kv_queue.size()
	       << " submitting " << kv_queue_unsubmitted.size()
	       << " deferred done " << deferred_done_queue.size()
	       << " stable " << deferred_stable_queue.size()
	       << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      aios = kv_ios;
      costs = kv_throttle_costs;
      kv_ios = 0;
      kv_throttle_costs = 0;
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable.  that means that if we do have done deferred
      // txcs AND we are not on a single device, we need to force a flush.
      if (bluefs && bluefs_layout.single_shared_device()) {
	if (aios) {
	  force_flush = true;
	} else if (kv_committing.empty() && deferred_stable.empty()) {
	  force_flush = true;  // there's nothing else to commit!
	} else if (deferred_aggressive) {
	  force_flush = true;
	}
      } else {
	if (aios || !deferred_done.empty()) {
	  force_flush = true;
	} else {
	  dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
	}
      }

      if (force_flush) {
	dout(20) << __func__ << " num_aios=" << aios
		 << " force_flush=" << (int)force_flush
		 << ", flushing, deferred done->stable" << dendl;
	// flush/barrier on block device
	bdev->flush();

	// if we flush then deferred done are now deferred stable
	deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
			       deferred_done.end());
	deferred_done.clear();
      }
      auto after_flush = mono_clock::now();

      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max?  note that this covers both the
      // case where we are approaching the max and the case we passed
      // it.  in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
	KeyValueDB::Transaction t =
	  kv_submitting.empty() ? synct : kv_submitting.front()->t;
	new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
	bufferlist bl;
	encode(new_nid_max, bl);
	t->set(PREFIX_SUPER, "nid_max", bl);
	dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
	KeyValueDB::Transaction t =
	  kv_submitting.empty() ? synct : kv_submitting.front()->t;
	new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
	bufferlist bl;
	encode(new_blobid_max, bl);
	t->set(PREFIX_SUPER, "blobid_max", bl);
	dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }

      for (auto txc : kv_committing) {
	throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
	if (txc->state == TransContext::STATE_KV_QUEUED) {
	  _txc_apply_kv(txc, false);
	  --txc->osr->kv_committing_serially;
	} else {
	  ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
	}
	if (txc->had_ios) {
	  --txc->osr->txc_with_unstable_io;
	}
      }

      // release throttle *before* we commit.  this allows new ops
      // to be prepared and enter pipeline while we are waiting on
      // the kv commit sync/flush.  then hopefully on the next
      // iteration there will already be ops awake.  otherwise, we
      // end up going to sleep, and then wake up when the very first
      // transaction is ready for commit.
      throttle.release_kv_throttle(costs);

      if (bluefs &&
	  after_flush - bluefs_last_balance >
	  ceph::make_timespan(cct->_conf->bluestore_bluefs_balance_interval)) {
	bluefs_last_balance = after_flush;
	int r = _balance_bluefs_freespace();
	ceph_assert(r >= 0);
      }

      // cleanup sync deferred keys
      for (auto b : deferred_stable) {
	for (auto& txc : b->txcs) {
	  bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
	  ceph_assert(wt.released.empty()); // only kraken did this
	  string key;
	  get_deferred_key(wt.seq, &key);
	  synct->rm_single_key(PREFIX_DEFERRED, key);
	}
      }

#if defined(WITH_LTTNG)
      auto sync_start = mono_clock::now();
#endif
      // submit synct synchronously (block and wait for it to commit)
      int r = cct->_conf->bluestore_debug_omit_kv_commit
	? 0
	: db->submit_transaction_sync(synct);
      ceph_assert(r == 0);

      int committing_size = kv_committing.size();
      int deferred_size = deferred_stable.size();

#if defined(WITH_LTTNG)
      double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
      for (auto txc : kv_committing) {
	if (txc->tracing) {
	  tracepoint(
	    bluestore,
	    transaction_kv_sync_latency,
	    txc->osr->get_sequencer_id(),
	    txc->seq,
	    kv_committing.size(),
	    deferred_done.size(),
	    deferred_stable.size(),
	    sync_latency);
	}
      }
#endif

      {
	std::unique_lock m{kv_finalize_lock};
	if (kv_committing_to_finalize.empty()) {
	  kv_committing_to_finalize.swap(kv_committing);
	} else {
	  kv_committing_to_finalize.insert(
	    kv_committing_to_finalize.end(),
	    kv_committing.begin(),
	    kv_committing.end());
	  kv_committing.clear();
	}
	if (deferred_stable_to_finalize.empty()) {
	  deferred_stable_to_finalize.swap(deferred_stable);
	} else {
	  deferred_stable_to_finalize.insert(
	    deferred_stable_to_finalize.end(),
	    deferred_stable.begin(),
	    deferred_stable.end());
	  deferred_stable.clear();
	}
	if (!kv_finalize_in_progress) {
	  kv_finalize_in_progress = true;
	  kv_finalize_cond.notify_one();
	}
      }

      if (new_nid_max) {
	nid_max = new_nid_max;
	dout(10) << __func__ << " nid_max now " << nid_max << dendl;
      }
      if (new_blobid_max) {
	blobid_max = new_blobid_max;
	dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
      }

      {
	auto finish = mono_clock::now();
	ceph::timespan dur_flush = after_flush - start;
	ceph::timespan dur_kv = finish - after_flush;
	ceph::timespan dur = finish - start;
	dout(20) << __func__ << " committed " << committing_size
		 << " cleaned " << deferred_size
		 << " in " << dur
		 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
		 << dendl;
	log_latency("kv_flush",
		    l_bluestore_kv_flush_lat,
		    dur_flush,
		    cct->_conf->bluestore_log_op_age);
	log_latency("kv_commit",
		    l_bluestore_kv_commit_lat,
		    dur_kv,
		    cct->_conf->bluestore_log_op_age);
	log_latency("kv_sync",
		    l_bluestore_kv_sync_lat,
		    dur,
		    cct->_conf->bluestore_log_op_age);
      }

      if (bluefs) {
	if (!bluefs_extents_reclaiming.empty()) {
	  dout(0) << __func__ << " releasing old bluefs 0x" << std::hex
		  << bluefs_extents_reclaiming << std::dec << dendl;
	  int r = 0;
	  if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
	    r = bdev->queue_discard(bluefs_extents_reclaiming);
	    if (r == 0) {
	      goto clear;
	    }
	  } else if (cct->_conf->bdev_enable_discard) {
	    for (auto p = bluefs_extents_reclaiming.begin(); p != bluefs_extents_reclaiming.end(); ++p) {
	      bdev->discard(p.get_start(), p.get_len());
	    }
	  }

	  alloc->release(bluefs_extents_reclaiming);
	clear:
	  bluefs_extents_reclaiming.clear();
	}
      }

      l.lock();
      // previously deferred "done" are now "stable" by virtue of this
      // commit cycle.
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_sync_started = false;
}
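
// Illustrative arithmetic for the {nid,blobid}_max bump above: with
// bluestore_nid_prealloc = P, once nid_last exceeds nid_max - P/2 the
// earliest transaction in this batch persists nid_max' = nid_last + P under
// PREFIX_SUPER/"nid_max"; the in-memory nid_max advances only after the
// synchronous commit, so a crash can never have handed out a nid beyond a
// durable maximum.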
void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock l(kv_finalize_lock);
  ceph_assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    ceph_assert(kv_committed.empty());
    ceph_assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
	deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
	break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_in_progress = false;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      auto start = mono_clock::now();

      while (!kv_committed.empty()) {
	TransContext *txc = kv_committed.front();
	ceph_assert(txc->state == TransContext::STATE_KV_SUBMITTED);
	_txc_state_proc(txc);
	kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
	auto p = b->txcs.begin();
	while (p != b->txcs.end()) {
	  TransContext *txc = &*p;
	  p = b->txcs.erase(p); // unlink here because
	  _txc_state_proc(txc); // this may destroy txc
	}
	delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
	if (deferred_queue_size >= deferred_batch_ops.load() ||
	    throttle.should_submit_deferred()) {
	  deferred_try_submit();
	}
      }

      // this is as good a place as any ...
      _reap_collections();

      logger->set(l_bluestore_fragmentation,
		  (uint64_t)(alloc->get_fragmentation() * 1000));

      log_latency("kv_final",
		  l_bluestore_kv_final_lat,
		  mono_clock::now() - start,
		  cct->_conf->bluestore_log_op_age);

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}
bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
  deferred_lock.lock();
  if (!txc->osr->deferred_pending &&
      !txc->osr->deferred_running) {
    deferred_queue.push_back(*txc->osr);
  }
  if (!txc->osr->deferred_pending) {
    txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
  }
  ++deferred_queue_size;
  txc->osr->deferred_pending->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      txc->osr->deferred_pending->prepare_write(
	cct, wt.seq, e.offset, e.length, p);
    }
  }
  if (deferred_aggressive &&
      !txc->osr->deferred_running) {
    _deferred_submit_unlock(txc->osr.get());
  } else {
    deferred_lock.unlock();
  }
}
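
// Batching sketch (added for clarity): each OpSequencer accumulates its
// deferred writes into a single DeferredBatch until a submit is triggered,
// so N small overwrites against the same sequencer become one aio pass in
// _deferred_submit_unlock() rather than N separate device round trips.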
void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
	   << deferred_queue_size << " txcs" << dendl;
  std::lock_guard l(deferred_lock);
  vector<OpSequencerRef> osrs;
  osrs.reserve(deferred_queue.size());
  for (auto& osr : deferred_queue) {
    osrs.push_back(&osr);
  }
  for (auto& osr : osrs) {
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
	_deferred_submit_unlock(osr.get());
	deferred_lock.lock();
      } else {
	dout(20) << __func__ << " osr " << osr << " already has running"
		 << dendl;
      }
    } else {
      dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
    }
  }

  deferred_last_submitted = ceph_clock_now();
}
void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
	   << " " << osr->deferred_pending->iomap.size() << " ios pending "
	   << dendl;
  ceph_assert(osr->deferred_pending);
  ceph_assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  ceph_assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  deferred_lock.unlock();

  for (auto& txc : b->txcs) {
    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
  }
  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
	dout(20) << __func__ << " write 0x" << std::hex
		 << start << "~" << bl.length()
		 << " crc " << bl.crc32c(-1) << std::dec << dendl;
	if (!g_conf()->bluestore_debug_omit_block_device_write) {
	  logger->inc(l_bluestore_deferred_write_ops);
	  logger->inc(l_bluestore_deferred_write_bytes, bl.length());
	  int r = bdev->aio_write(start, bl, &b->ioc, false);
	  ceph_assert(r == 0);
	}
      }
      if (i == b->iomap.end()) {
	break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << " seq " << i->second.seq << " 0x"
	     << std::hex << pos << "~" << i->second.bl.length() << std::dec
	     << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  bdev->aio_submit(&b->ioc);
}
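
// Worked example for the coalescing loop above (illustrative offsets): with
// iomap entries at 0x1000 (len 0x1000), 0x2000 (len 0x1000), and 0x8000
// (len 0x1000), the first two are contiguous (i->first == pos) and merge
// into a single aio_write(0x1000, 8K); the gap before 0x8000 flushes that
// run and starts a new one, yielding two device writes instead of three.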
struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  ceph_assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    deferred_lock.lock();
    ceph_assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      auto q = deferred_queue.iterator_to(*osr);
      deferred_queue.erase(q);
      deferred_lock.unlock();
    } else {
      deferred_lock.unlock();
      if (deferred_aggressive) {
	dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
	finisher.queue(new C_DeferredTrySubmit(this));
      } else {
	dout(20) << __func__ << " leaving queued, more pending" << dendl;
      }
    }
  }

  {
    uint64_t costs = 0;
    for (auto& i : b->txcs) {
      TransContext *txc = &i;
      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
      txc->state = TransContext::STATE_DEFERRED_CLEANUP;
      costs += txc->cost;
    }
    throttle.release_deferred_throttle(costs);
  }

  {
    std::lock_guard l(kv_lock);
    deferred_done_queue.emplace_back(b);

    // in the normal case, do not bother waking up the kv thread; it will
    // catch us on the next commit anyway.
    if (deferred_aggressive && !kv_sync_in_progress) {
      kv_sync_in_progress = true;
      kv_cond.notify_one();
    }
  }
}
int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  int count = 0;
  int r = 0;
  CollectionRef ch = _get_collection(coll_t::meta());
  bool fake_ch = false;
  if (!ch) {
    // hmm, replaying initial mkfs?
    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
    fake_ch = true;
  }
  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
	     << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    auto p = bl.cbegin();
    try {
      decode(*deferred_txn, p);
    } catch (buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
	   << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(ch.get(), osr, nullptr);
    txc->deferred_txn = deferred_txn;
    txc->state = TransContext::STATE_KV_DONE;
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_register_zombie(osr);
  _osr_drain_all();
  if (fake_ch) {
    new_coll_map.clear();
  }
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}
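
// Replay flow summary (added for clarity): each surviving PREFIX_DEFERRED
// key is decoded into a fresh TransContext that starts life in KV_DONE, so
// _txc_state_proc() re-runs only the deferred-write tail of the state
// machine (DEFERRED_QUEUED -> ... -> DONE); the kv side of those
// transactions was already durable before the crash.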
// ---------------------------
// transactions

int BlueStore::queue_transactions(
  CollectionHandle& ch,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE(cct);
  list<Context *> on_applied, on_commit, on_applied_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &on_applied, &on_commit, &on_applied_sync);

  auto start = mono_clock::now();

  Collection *c = static_cast<Collection*>(ch.get());
  OpSequencer *osr = c->osr.get();
  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;

  // prepare
  TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
				  &on_commit);

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);
  if (handle)
    handle->suspend_tp_timeout();

  auto tstart = mono_clock::now();

  if (!throttle.try_start_transaction(
	*db,
	*txc,
	tstart)) {
    // ensure we do not block here because of deferred writes
    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
	     << dendl;
    ++deferred_aggressive;
    deferred_try_submit();
    {
      // wake up any previously finished deferred events
      std::lock_guard l(kv_lock);
      if (!kv_sync_in_progress) {
	kv_sync_in_progress = true;
	kv_cond.notify_one();
      }
    }
    throttle.finish_start_transaction(*db, *txc, tstart);
    --deferred_aggressive;
  }
  auto tend = mono_clock::now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  // we're immediately readable (unlike FileStore)
  for (auto c : on_applied_sync) {
    c->complete(0);
  }
  if (!on_applied.empty()) {
    if (c->commit_queue) {
      c->commit_queue->queue(on_applied);
    } else {
      finisher.queue(on_applied);
    }
  }

  log_latency("submit_transact",
	      l_bluestore_submit_lat,
	      mono_clock::now() - start,
	      cct->_conf->bluestore_log_op_age);
  log_latency("throttle_transact",
	      l_bluestore_throttle_lat,
	      tend - tstart,
	      cct->_conf->bluestore_log_op_age);
  return 0;
}
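
// Pipeline summary for the function above (added for clarity): prepare work
// (decoding ops, building txc->t) runs on the caller's thread;
// _txc_state_proc() then hands the txc to the aio/kv machinery, and the
// on_applied_sync callbacks fire immediately because BlueStore reads always
// see the in-memory committed state, unlike FileStore's apply/journal split.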
void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}
void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction<30>(cct, t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }

  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;


    // collection operations
    CollectionRef &c = cvec[op->cid];

    // initialize osd_pool_id and do a smoke test that all collections belong
    // to the same pool
    spg_t pgid;
    if (!!c ? c->cid.is_pg(&pgid) : false) {
      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
		  txc->osd_pool_id == pgid.pool());
      txc->osd_pool_id = pgid.pool();
    }

    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
	const coll_t &cid = i.get_cid(op->cid);
	r = _remove_collection(txc, cid, &c);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
	ceph_assert(!c);
	const coll_t &cid = i.get_cid(op->cid);
	r = _create_collection(txc, cid, op->split_bits, &c);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
	uint32_t bits = op->split_bits;
	uint32_t rem = op->split_rem;
	r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_MERGE_COLLECTION:
      {
	uint32_t bits = op->split_bits;
	r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
	uint32_t type = op->hint_type;
	bufferlist hint;
	i.decode_bl(hint);
	auto hiter = hint.cbegin();
	if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
	  uint32_t pg_num;
	  uint64_t num_objs;
	  decode(pg_num, hiter);
	  decode(num_objs, hiter);
	  dout(10) << __func__ << " collection hint objects is a no-op, "
		   << " pg_num " << pg_num << " num_objects " << num_objs
		   << dendl;
	} else {
	  // Ignore the hint
	  dout(10) << __func__ << " unknown collection hint " << type << dendl;
	}
	continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      ceph_abort_msg("not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
	   << " not handled on operation " << op->op
	   << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction<0>(cct, t);
      ceph_abort_msg("unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
	op->op == Transaction::OP_CREATE ||
	op->op == Transaction::OP_WRITE ||
	op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    std::unique_lock l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
	       << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_CREATE:
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
	uint64_t off = op->off;
	uint64_t len = op->len;
	uint32_t fadvise_flags = i.get_fadvise_flags();
	bufferlist bl;
	i.decode_bl(bl);
	r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
	uint64_t off = op->off;
	uint64_t len = op->len;
	r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
	// deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
	uint64_t off = op->off;
	r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
	r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
	string name = i.decode_string();
	bufferptr bp;
	i.decode_bp(bp);
	r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
	map<string, bufferptr> aset;
	i.decode_attrset(aset);
	r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
	string name = i.decode_string();
	r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
	r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  const ghobject_t& noid = i.get_oid(op->dest_oid);
	  no = c->get_onode(noid, true);
	}
	r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  const ghobject_t& noid = i.get_oid(op->dest_oid);
	  no = c->get_onode(noid, true);
	}
	uint64_t srcoff = op->off;
	uint64_t len = op->len;
	uint64_t dstoff = op->dest_off;
	r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      ceph_abort_msg("not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      ceph_abort_msg("deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
	ceph_assert(op->cid == op->dest_cid);
	const ghobject_t& noid = i.get_oid(op->dest_oid);
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  no = c->get_onode(noid, false);
	}
	r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
	r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
	bufferlist aset_bl;
	i.decode_attrset_bl(&aset_bl);
	r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
	bufferlist keys_bl;
	i.decode_keyset_bl(&keys_bl);
	r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
	string first, last;
	first = i.decode_string();
	last = i.decode_string();
	r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
	bufferlist bl;
	i.decode_bl(bl);
	r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
	r = _set_alloc_hint(txc, c, o,
			    op->expected_object_size,
			    op->expected_write_size,
			    op->alloc_hint_flags);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
			    op->op == Transaction::OP_CLONE ||
			    op->op == Transaction::OP_CLONERANGE2 ||
			    op->op == Transaction::OP_COLL_ADD ||
			    op->op == Transaction::OP_SETATTR ||
			    op->op == Transaction::OP_SETATTRS ||
			    op->op == Transaction::OP_RMATTR ||
			    op->op == Transaction::OP_OMAP_SETKEYS ||
			    op->op == Transaction::OP_OMAP_RMKEYS ||
			    op->op == Transaction::OP_OMAP_RMKEYRANGE ||
			    op->op == Transaction::OP_OMAP_SETHEADER))
	// -ENOENT is usually okay
	ok = true;
      if (r == -ENODATA)
	ok = true;

      if (!ok) {
	const char *msg = "unexpected error code";

	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
			     op->op == Transaction::OP_CLONE ||
			     op->op == Transaction::OP_CLONERANGE2))
	  msg = "ENOENT on clone suggests osd bug";

	if (r == -ENOSPC)
	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
	  // by partially applying transactions.
	  msg = "ENOSPC from bluestore, misconfigured cluster";

	if (r == -ENOTEMPTY) {
	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
	}

	derr << __func__ << " error " << cpp_strerror(r)
	     << " not handled on operation " << op->op
	     << " (op " << pos << ", counting from 0)"
	     << dendl;
	derr << msg << dendl;
	_dump_transaction<0>(cct, t);
	ceph_abort_msg("unexpected error");
      }
    }
  }
}
// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
	   << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
    bufferptr z = buffer::create_small_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->begin().copy(front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    ceph_assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    ceph_assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
	   << back_pad << " on front/back, now 0x" << *offset << "~"
	   << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  ceph_assert(bl->length() == length);
}
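
// Worked example (added for clarity, illustrative values): for a write at
// *offset = 0x1234 of length 0x100 against chunk_size = 0x1000, front_pad =
// 0x234, so the data is copied into a zeroed chunk and *offset drops to
// 0x1000 with length 0x334; the back edge at 0x1334 then gets
// 0x1000 - 0x334 = 0xccc zero bytes appended, leaving exactly one
// chunk-aligned unit of 0x1000 bytes for checksumming and device io.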
12959 void BlueStore::_do_write_small(
12963 uint64_t offset
, uint64_t length
,
12964 bufferlist::iterator
& blp
,
12965 WriteContext
*wctx
)
12967 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
12968 << std::dec
<< dendl
;
12969 ceph_assert(length
< min_alloc_size
);
12970 uint64_t end_offs
= offset
+ length
;
12972 logger
->inc(l_bluestore_write_small
);
12973 logger
->inc(l_bluestore_write_small_bytes
, length
);
12976 blp
.copy(length
, bl
);
12978 auto max_bsize
= std::max(wctx
->target_blob_size
, min_alloc_size
);
12979 auto min_off
= offset
>= max_bsize
? offset
- max_bsize
: 0;
12980 uint32_t alloc_len
= min_alloc_size
;
12981 auto offset0
= p2align
<uint64_t>(offset
, alloc_len
);
12985 // search suitable extent in both forward and reverse direction in
12986 // [offset - target_max_blob_size, offset + target_max_blob_size] range
12987 // then check if blob can be reused via can_reuse_blob func or apply
12988 // direct/deferred write (the latter for extents including or higher
12989 // than 'offset' only).
12990 o
->extent_map
.fault_range(db
, min_off
, offset
+ max_bsize
- min_off
);
12992 // Look for an existing mutable blob we can use.
12993 auto begin
= o
->extent_map
.extent_map
.begin();
12994 auto end
= o
->extent_map
.extent_map
.end();
12995 auto ep
= o
->extent_map
.seek_lextent(offset
);
12998 if (ep
->blob_end() <= offset
) {
13003 if (prev_ep
!= begin
) {
13006 prev_ep
= end
; // to avoid this extent check as it's a duplicate
13009 boost::container::flat_set
<const bluestore_blob_t
*> inspected_blobs
;
13010 // We don't want to have more blobs than min alloc units fit
13011 // into 2 max blobs
13012 size_t blob_threshold
= max_blob_size
/ min_alloc_size
* 2 + 1;
13013 bool above_blob_threshold
= false;
13015 inspected_blobs
.reserve(blob_threshold
);
13017 uint64_t max_off
= 0;
13018 auto start_ep
= ep
;
13019 auto end_ep
= ep
; // exclusively
13021 any_change
= false;
13023 if (ep
!= end
&& ep
->logical_offset
< offset
+ max_bsize
) {
13024 BlobRef b
= ep
->blob
;
13025 if (!above_blob_threshold
) {
13026 inspected_blobs
.insert(&b
->get_blob());
13027 above_blob_threshold
= inspected_blobs
.size() >= blob_threshold
;
13029 max_off
= ep
->logical_end();
13030 auto bstart
= ep
->blob_start();
13032 dout(20) << __func__
<< " considering " << *b
13033 << " bstart 0x" << std::hex
<< bstart
<< std::dec
<< dendl
;
13034 if (bstart
>= end_offs
) {
13035 dout(20) << __func__
<< " ignoring distant " << *b
<< dendl
;
13036 } else if (!b
->get_blob().is_mutable()) {
13037 dout(20) << __func__
<< " ignoring immutable " << *b
<< dendl
;
13038 } else if (ep
->logical_offset
% min_alloc_size
!=
13039 ep
->blob_offset
% min_alloc_size
) {
13040 dout(20) << __func__
<< " ignoring offset-skewed " << *b
<< dendl
;
13042 uint64_t chunk_size
= b
->get_blob().get_chunk_size(block_size
);
13043 // can we pad our head/tail out with zeros?
13044 uint64_t head_pad
, tail_pad
;
13045 head_pad
= p2phase(offset
, chunk_size
);
13046 tail_pad
= p2nphase(end_offs
, chunk_size
);
13047 if (head_pad
|| tail_pad
) {
13048 o
->extent_map
.fault_range(db
, offset
- head_pad
,
13049 end_offs
- offset
+ head_pad
+ tail_pad
);
13052 o
->extent_map
.has_any_lextents(offset
- head_pad
, chunk_size
)) {
13055 if (tail_pad
&& o
->extent_map
.has_any_lextents(end_offs
, tail_pad
)) {
13059 uint64_t b_off
= offset
- head_pad
- bstart
;
13060 uint64_t b_len
= length
+ head_pad
+ tail_pad
;
13062 // direct write into unused blocks of an existing mutable blob?
13063 if ((b_off
% chunk_size
== 0 && b_len
% chunk_size
== 0) &&
13064 b
->get_blob().get_ondisk_length() >= b_off
+ b_len
&&
13065 b
->get_blob().is_unused(b_off
, b_len
) &&
13066 b
->get_blob().is_allocated(b_off
, b_len
)) {
13067 _apply_padding(head_pad
, tail_pad
, bl
);
13069 dout(20) << __func__
<< " write to unused 0x" << std::hex
13070 << b_off
<< "~" << b_len
13071 << " pad 0x" << head_pad
<< " + 0x" << tail_pad
13072 << std::dec
<< " of mutable " << *b
<< dendl
;
13073 _buffer_cache_write(txc
, b
, b_off
, bl
,
13074 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
13076 if (!g_conf()->bluestore_debug_omit_block_device_write
) {
13077 if (b_len
<= prefer_deferred_size
) {
13078 dout(20) << __func__
<< " deferring small 0x" << std::hex
13079 << b_len
<< std::dec
<< " unused write via deferred" << dendl
;
13080 bluestore_deferred_op_t
*op
= _get_deferred_op(txc
);
13081 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
13084 [&](uint64_t offset
, uint64_t length
) {
13085 op
->extents
.emplace_back(bluestore_pextent_t(offset
, length
));
13090 b
->get_blob().map_bl(
13092 [&](uint64_t offset
, bufferlist
& t
) {
13093 bdev
->aio_write(offset
, t
,
13094 &txc
->ioc
, wctx
->buffered
);
13098 b
->dirty_blob().calc_csum(b_off
, bl
);
13099 dout(20) << __func__
<< " lex old " << *ep
<< dendl
;
13100 Extent
*le
= o
->extent_map
.set_lextent(c
, offset
, b_off
+ head_pad
, length
,
13102 &wctx
->old_extents
);
13103 b
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
13104 txc
->statfs_delta
.stored() += le
->length
;
13105 dout(20) << __func__
<< " lex " << *le
<< dendl
;
13106 logger
->inc(l_bluestore_write_small_unused
);
13109 // read some data to fill out the chunk?
13110 uint64_t head_read
= p2phase(b_off
, chunk_size
);
13111 uint64_t tail_read
= p2nphase(b_off
+ b_len
, chunk_size
);
13112 if ((head_read
|| tail_read
) &&
13113 (b
->get_blob().get_ondisk_length() >= b_off
+ b_len
+ tail_read
) &&
13114 head_read
+ tail_read
< min_alloc_size
) {
13115 b_off
-= head_read
;
13116 b_len
+= head_read
+ tail_read
;
13119 head_read
= tail_read
= 0;
13122 // chunk-aligned deferred overwrite?
13123 if (b
->get_blob().get_ondisk_length() >= b_off
+ b_len
&&
13124 b_off
% chunk_size
== 0 &&
13125 b_len
% chunk_size
== 0 &&
13126 b
->get_blob().is_allocated(b_off
, b_len
)) {
13128 _apply_padding(head_pad
, tail_pad
, bl
);
13130 dout(20) << __func__
<< " reading head 0x" << std::hex
<< head_read
13131 << " and tail 0x" << tail_read
<< std::dec
<< dendl
;
13133 bufferlist head_bl
;
13134 int r
= _do_read(c
.get(), o
, offset
- head_pad
- head_read
, head_read
,
13136 ceph_assert(r
>= 0 && r
<= (int)head_read
);
13137 size_t zlen
= head_read
- r
;
13139 head_bl
.append_zero(zlen
);
13140 logger
->inc(l_bluestore_write_pad_bytes
, zlen
);
13142 head_bl
.claim_append(bl
);
13144 logger
->inc(l_bluestore_write_penalty_read_ops
);
13147 bufferlist tail_bl
;
13148 int r
= _do_read(c
.get(), o
, offset
+ length
+ tail_pad
, tail_read
,
13150 ceph_assert(r
>= 0 && r
<= (int)tail_read
);
13151 size_t zlen
= tail_read
- r
;
13153 tail_bl
.append_zero(zlen
);
13154 logger
->inc(l_bluestore_write_pad_bytes
, zlen
);
13156 bl
.claim_append(tail_bl
);
13157 logger
->inc(l_bluestore_write_penalty_read_ops
);
13159 logger
->inc(l_bluestore_write_small_pre_read
);
13161 _buffer_cache_write(txc
, b
, b_off
, bl
,
13162 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
13164 if (b
->get_blob().csum_type
) {
13165 b
->dirty_blob().calc_csum(b_off
, bl
);
13168 if (!g_conf()->bluestore_debug_omit_block_device_write
) {
13169 bluestore_deferred_op_t
*op
= _get_deferred_op(txc
);
13170 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
13171 int r
= b
->get_blob().map(
13173 [&](uint64_t offset
, uint64_t length
) {
13174 op
->extents
.emplace_back(bluestore_pextent_t(offset
, length
));
13177 ceph_assert(r
== 0);
13178 op
->data
.claim(bl
);
13179 dout(20) << __func__
<< " deferred write 0x" << std::hex
<< b_off
<< "~"
13180 << b_len
<< std::dec
<< " of mutable " << *b
13181 << " at " << op
->extents
<< dendl
;
13184 Extent
*le
= o
->extent_map
.set_lextent(c
, offset
, offset
- bstart
, length
,
13185 b
, &wctx
->old_extents
);
13186 b
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
13187 txc
->statfs_delta
.stored() += le
->length
;
13188 dout(20) << __func__
<< " lex " << *le
<< dendl
;
13189 logger
->inc(l_bluestore_write_small_deferred
);
13192 // try to reuse blob if we can
13193 if (b
->can_reuse_blob(min_alloc_size
,
13197 ceph_assert(alloc_len
== min_alloc_size
); // expecting data always
13198 // fit into reused blob
13199 // Need to check for pending writes desiring to
13200 // reuse the same pextent. The rationale is that during GC two chunks
13201 // from garbage blobs(compressed?) can share logical space within the same
13202 // AU. That's in turn might be caused by unaligned len in clone_range2.
13203 // Hence the second write will fail in an attempt to reuse blob at
13204 // do_alloc_write().
13205 if (!wctx
->has_conflict(b
,
13207 offset0
+ alloc_len
,
13210 // we can't reuse pad_head/pad_tail since they might be truncated
13211 // due to existent extents
13212 uint64_t b_off
= offset
- bstart
;
13213 uint64_t b_off0
= b_off
;
13214 _pad_zeros(&bl
, &b_off0
, chunk_size
);
13216 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
13217 << " (0x" << b_off0
<< "~" << bl
.length() << ")"
13218 << " (0x" << b_off
<< "~" << length
<< ")"
13219 << std::dec
<< dendl
;
13221 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
13222 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
,
13224 logger
->inc(l_bluestore_write_small_unused
);
13232 } // if (ep != end && ep->logical_offset < offset + max_bsize)
    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      if (!above_blob_threshold) {
	inspected_blobs.insert(&b->get_blob());
	above_blob_threshold = inspected_blobs.size() >= blob_threshold;
      }
      start_ep = prev_ep;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
			    max_bsize,
			    offset0 - bstart,
			    &alloc_len)) {
	ceph_assert(alloc_len == min_alloc_size); // expecting data always
						  // fit into reused blob
	// Need to check for pending writes desiring to
	// reuse the same pextent.  The rationale is that during GC two chunks
	// from garbage blobs (compressed?) can share logical space within the
	// same AU.  That in turn might be caused by an unaligned len in
	// clone_range2.  Hence the second write would fail in an attempt to
	// reuse the blob at do_alloc_write().
	if (!wctx->has_conflict(b,
				offset0,
				offset0 + alloc_len,
				min_alloc_size)) {
	  uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
	  uint64_t b_off = offset - bstart;
	  uint64_t b_off0 = b_off;
	  _pad_zeros(&bl, &b_off0, chunk_size);

	  dout(20) << __func__ << " reuse blob " << *b << std::hex
		   << " (0x" << b_off0 << "~" << bl.length() << ")"
		   << " (0x" << b_off << "~" << length << ")"
		   << std::dec << dendl;

	  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
	  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
		      false, false);
	  logger->inc(l_bluestore_write_small_unused);
	  return;
	}
      }
      if (prev_ep != begin) {
	--prev_ep;
	any_change = true;
      } else {
	prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);

  if (above_blob_threshold) {
    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
	     << " " << std::hex << min_off << "~" << max_off << std::dec
	     << dendl;
    ceph_assert(start_ep != end_ep);
    for (auto ep = start_ep; ep != end_ep; ++ep) {
      dout(20) << __func__ << " inserting for GC "
	       << std::hex << ep->logical_offset << "~" << ep->length
	       << std::dec << dendl;
      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
    }
    // insert newly written extent to GC
    wctx->extents_to_gc.union_insert(offset, length);
    dout(20) << __func__ << " inserting (last) for GC "
	     << std::hex << offset << "~" << length
	     << std::dec << dendl;
  }

  // new blob.
  BlobRef b = c->new_blob();
  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
	      min_alloc_size != block_size, // use 'unused' bitmap when alloc
					    // granularity doesn't match the
					    // disk one only
	      true);
}
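
// _do_write_big handles the min_alloc_size-aligned middle of a write: it
// punches a hole over the whole range up front, then carves the data into
// chunks of at most max(target_blob_size, min_alloc_size) bytes.  For
// uncompressed writes it first scans the extent map around 'offset' (in both
// directions, within one max blob size) for a mutable blob that
// can_reuse_blob() accepts; only if that fails does it allocate a new blob.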
void BlueStore::_do_write_big(
  TransContext *txc,
  CollectionRef &c,
  OnodeRef o,
  uint64_t offset, uint64_t length,
  bufferlist::iterator& blp,
  WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << " target_blob_size 0x" << wctx->target_blob_size << std::dec
	   << " compress " << (int)wctx->compress
	   << dendl;
  logger->inc(l_bluestore_write_big);
  logger->inc(l_bluestore_write_big_bytes, length);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  while (length > 0) {
    bool new_blob = false;
    uint32_t l = std::min(max_bsize, length);
    BlobRef b;
    uint32_t b_off = 0;

    //attempting to reuse existing blob
    if (!wctx->compress) {
      // look for an existing mutable blob we can reuse
      auto begin = o->extent_map.extent_map.begin();
      auto end = o->extent_map.extent_map.end();
      auto ep = o->extent_map.seek_lextent(offset);
      auto prev_ep = ep;
      if (prev_ep != begin) {
	--prev_ep;
      } else {
	prev_ep = end; // to avoid this extent check as it's a duplicate
      }
      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
      // search suitable extent in both forward and reverse direction in
      // [offset - target_max_blob_size, offset + target_max_blob_size] range
      // then check if blob can be reused via can_reuse_blob func.
      bool any_change;
      do {
	any_change = false;
	if (ep != end && ep->logical_offset < offset + max_bsize) {
	  if (offset >= ep->blob_start() &&
	      ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
				       offset - ep->blob_start(),
				       &l)) {
	    b = ep->blob;
	    b_off = offset - ep->blob_start();
	    prev_ep = end; // to avoid check below
	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
	  } else {
	    ++ep;
	    any_change = true;
	  }
	}

	if (prev_ep != end && prev_ep->logical_offset >= min_off) {
	  if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
					    offset - prev_ep->blob_start(),
					    &l)) {
	    b = prev_ep->blob;
	    b_off = offset - prev_ep->blob_start();
	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
	  } else if (prev_ep != begin) {
	    --prev_ep;
	    any_change = true;
	  } else {
	    prev_ep = end; // to avoid useless first extent re-check
	  }
	}
      } while (b == nullptr && any_change);
    }
    if (b == nullptr) {
      b = c->new_blob();
      b_off = 0;
      new_blob = true;
    }
    bufferlist t;
    blp.copy(l, t);
    wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
    offset += l;
    length -= l;
    logger->inc(l_bluestore_write_big_blobs);
  }
}
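
// _do_alloc_write runs in three phases: (1) optionally compress each queued
// write and estimate the space it needs, (2) reserve all of that space in a
// single allocator call, and (3) finalize each blob (csum init, statfs
// deltas, buffer cache, and either a deferred or a direct aio write).
// Compression is kept only when the rounded result stays within
// blob_length * compression_required_ratio; e.g., assuming the default
// required ratio of 0.875 and a 0x10000 blob, the compressed payload must
// round (to min_alloc_size) to at most 0xe000 bytes or the raw data is
// written instead.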
int BlueStore::_do_alloc_write(
  TransContext *txc,
  CollectionRef coll,
  OnodeRef o,
  WriteContext *wctx)
{
  dout(20) << __func__ << " txc " << txc
	   << " " << wctx->writes.size() << " blobs"
	   << dendl;
  if (wctx->writes.empty()) {
    return 0;
  }

  CompressorRef c;
  double crr = 0;
  if (wctx->compress) {
    c = select_option(
      "compression_algorithm",
      compressor,
      [&]() {
	string val;
	if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
	  CompressorRef cp = compressor;
	  if (!cp || cp->get_type_name() != val) {
	    cp = Compressor::create(cct, val);
	    if (!cp) {
	      if (_set_compression_alert(false, val.c_str())) {
		derr << __func__ << " unable to initialize " << val.c_str()
		     << " compressor" << dendl;
	      }
	    }
	  }
	  return boost::optional<CompressorRef>(cp);
	}
	return boost::optional<CompressorRef>();
      });

    crr = select_option(
      "compression_required_ratio",
      cct->_conf->bluestore_compression_required_ratio,
      [&]() {
	double val;
	if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
	  return boost::optional<double>(val);
	}
	return boost::optional<double>();
      });
  }

  // checksum
  int64_t csum = csum_type.load();
  csum = select_option(
    "csum_type",
    csum,
    [&]() {
      int64_t val;
      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
	return boost::optional<int64_t>(val);
      }
      return boost::optional<int64_t>();
    });

  // compress (as needed) and calc needed space
  uint64_t need = 0;
  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
  for (auto& wi : wctx->writes) {
    if (c && wi.blob_length > min_alloc_size) {
      auto start = mono_clock::now();

      // compress
      ceph_assert(wi.b_off == 0);
      ceph_assert(wi.blob_length == wi.bl.length());

      // FIXME: memory alignment here is bad
      bufferlist t;
      int r = c->compress(wi.bl, t);
      uint64_t want_len_raw = wi.blob_length * crr;
      uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
      bool rejected = false;
      uint64_t compressed_len = t.length();
      // do an approximate (fast) estimation for resulting blob size
      // that doesn't take header overhead into account
      uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
      if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
	bluestore_compression_header_t chdr;
	chdr.type = c->get_type();
	chdr.length = t.length();
	encode(chdr, wi.compressed_bl);
	wi.compressed_bl.claim_append(t);

	compressed_len = wi.compressed_bl.length();
	result_len = p2roundup(compressed_len, min_alloc_size);
	if (result_len <= want_len && result_len < wi.blob_length) {
	  // Cool. We compressed at least as much as we were hoping to.
	  // pad out to min_alloc_size
	  wi.compressed_bl.append_zero(result_len - compressed_len);
	  wi.compressed_len = compressed_len;
	  wi.compressed = true;
	  logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
	  dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
		   << " -> 0x" << compressed_len << " => 0x" << result_len
		   << " with " << c->get_type()
		   << std::dec << dendl;
	  txc->statfs_delta.compressed() += compressed_len;
	  txc->statfs_delta.compressed_original() += wi.blob_length;
	  txc->statfs_delta.compressed_allocated() += result_len;
	  logger->inc(l_bluestore_compress_success_count);
	  need += result_len;
	} else {
	  rejected = true;
	}
      } else if (r != 0) {
	dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
		<< " bytes compressed using " << c->get_type_name()
		<< std::dec
		<< " failed with errcode = " << r
		<< ", leaving uncompressed"
		<< dendl;
	logger->inc(l_bluestore_compress_rejected_count);
	need += wi.blob_length;
      } else {
	rejected = true;
      }

      if (rejected) {
	dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
		 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
		 << " with " << c->get_type()
		 << ", which is more than required 0x" << want_len_raw
		 << " -> 0x" << want_len
		 << ", leaving uncompressed"
		 << std::dec << dendl;
	logger->inc(l_bluestore_compress_rejected_count);
	need += wi.blob_length;
      }
      log_latency("compress@_do_alloc_write",
		  l_bluestore_compress_lat,
		  mono_clock::now() - start,
		  cct->_conf->bluestore_log_op_age);
    } else {
      need += wi.blob_length;
    }
  }
  PExtentVector prealloc;
  prealloc.reserve(2 * wctx->writes.size());
  int64_t prealloc_left = 0;
  prealloc_left = alloc->allocate(
    need, min_alloc_size, need,
    0, &prealloc);
  if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
    derr << __func__ << " failed to allocate 0x" << std::hex << need
	 << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
	 << " min_alloc_size 0x" << min_alloc_size
	 << " available 0x " << alloc->get_free()
	 << std::dec << dendl;
    if (prealloc.size()) {
      alloc->release(prealloc);
    }
    return -ENOSPC;
  }
  _collect_allocation_stats(need, min_alloc_size, prealloc.size());

  dout(20) << __func__ << " prealloc " << prealloc << dendl;
  auto prealloc_pos = prealloc.begin();

  for (auto& wi : wctx->writes) {
    BlobRef b = wi.b;
    bluestore_blob_t& dblob = b->dirty_blob();
    uint64_t b_off = wi.b_off;
    bufferlist *l = &wi.bl;
    uint64_t final_length = wi.blob_length;
    uint64_t csum_length = wi.blob_length;
    if (wi.compressed) {
      final_length = wi.compressed_bl.length();
      csum_length = final_length;
      l = &wi.compressed_bl;
      dblob.set_compressed(wi.blob_length, wi.compressed_len);
    } else if (wi.new_blob) {
      // initialize newly created blob only
      ceph_assert(dblob.is_mutable());
      unsigned csum_order;
      if (l->length() != wi.blob_length) {
	// hrm, maybe we could do better here, but let's not bother.
	dout(20) << __func__ << " forcing csum_order to block_size_order "
		 << block_size_order << dendl;
	csum_order = block_size_order;
      } else {
	csum_order = std::min(wctx->csum_order, ctz(l->length()));
      }
      // try to align blob with max_blob_size to improve
      // its reuse ratio, e.g. in case of reverse write
      uint32_t suggested_boff =
	(wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
      if ((suggested_boff % (1 << csum_order)) == 0 &&
	  suggested_boff + final_length <= max_bsize &&
	  suggested_boff > b_off) {
	dout(20) << __func__ << " forcing blob_offset to 0x"
		 << std::hex << suggested_boff << std::dec << dendl;
	ceph_assert(suggested_boff >= b_off);
	csum_length += suggested_boff - b_off;
	b_off = suggested_boff;
      }
      if (csum != Checksummer::CSUM_NONE) {
	dout(20) << __func__ << " initialize csum setting for new blob " << *b
		 << " csum_type " << Checksummer::get_csum_type_string(csum)
		 << " csum_order " << csum_order
		 << " csum_length 0x" << std::hex << csum_length << std::dec
		 << dendl;
	dblob.init_csum(csum, csum_order, csum_length);
      }
    }

    PExtentVector extents;
    int64_t left = final_length;
    while (left > 0) {
      ceph_assert(prealloc_left > 0);
      if (prealloc_pos->length <= left) {
	prealloc_left -= prealloc_pos->length;
	left -= prealloc_pos->length;
	txc->statfs_delta.allocated() += prealloc_pos->length;
	extents.push_back(*prealloc_pos);
	++prealloc_pos;
      } else {
	extents.emplace_back(prealloc_pos->offset, left);
	prealloc_pos->offset += left;
	prealloc_pos->length -= left;
	prealloc_left -= left;
	txc->statfs_delta.allocated() += left;
	left = 0;
	break;
      }
    }
    for (auto& p : extents) {
      txc->allocated.insert(p.offset, p.length);
    }
    dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);

    dout(20) << __func__ << " blob " << *b << dendl;
    if (dblob.has_csum()) {
      dblob.calc_csum(b_off, *l);
    }

    if (wi.mark_unused) {
      ceph_assert(!dblob.is_compressed());
      auto b_end = b_off + wi.bl.length();
      if (b_off) {
	dblob.add_unused(0, b_off);
      }
      uint64_t llen = dblob.get_logical_length();
      if (b_end < llen) {
	dblob.add_unused(b_end, llen - b_end);
      }
    }

    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
					   b_off + (wi.b_off0 - wi.b_off),
					   wi.length0,
					   wi.b,
					   nullptr);
    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
    txc->statfs_delta.stored() += le->length;
    dout(20) << __func__ << " lex " << *le << dendl;
    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
			wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

    // queue io
    if (!g_conf()->bluestore_debug_omit_block_device_write) {
      if (l->length() <= prefer_deferred_size.load()) {
	dout(20) << __func__ << " deferring small 0x" << std::hex
		 << l->length() << std::dec << " write via deferred" << dendl;
	bluestore_deferred_op_t *op = _get_deferred_op(txc);
	op->op = bluestore_deferred_op_t::OP_WRITE;
	int r = b->get_blob().map(
	  b_off, l->length(),
	  [&](uint64_t offset, uint64_t length) {
	    op->extents.emplace_back(bluestore_pextent_t(offset, length));
	    return 0;
	  });
	ceph_assert(r == 0);
	op->data = *l;
	logger->inc(l_bluestore_write_small_deferred);
      } else {
	b->get_blob().map_bl(
	  b_off, *l,
	  [&](uint64_t offset, bufferlist& t) {
	    bdev->aio_write(offset, t, &txc->ioc, false);
	  });
	logger->inc(l_bluestore_write_small_new);
      }
    }
  }
  ceph_assert(prealloc_pos == prealloc.end());
  ceph_assert(prealloc_left == 0);
  return 0;
}
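
// _wctx_finish retires the extents displaced by a write: it walks
// wctx->old_extents, reverses their statfs contributions, drops shared-blob
// references (recording blobs that may have become unshared when the caller
// asked for them), queues the freed space for release after commit, and
// prunes spanning blobs that ended up empty.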
void BlueStore::_wctx_finish(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  WriteContext *wctx,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  auto oep = wctx->old_extents.begin();
  while (oep != wctx->old_extents.end()) {
    auto &lo = *oep;
    oep = wctx->old_extents.erase(oep);
    dout(20) << __func__ << " lex_old " << lo.e << dendl;
    BlobRef b = lo.e.blob;
    const bluestore_blob_t& blob = b->get_blob();
    if (blob.is_compressed()) {
      if (lo.blob_empty) {
	txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
      }
      txc->statfs_delta.compressed_original() -= lo.e.length;
    }
    auto& r = lo.r;
    txc->statfs_delta.stored() -= lo.e.length;
    if (!r.empty()) {
      dout(20) << __func__ << " blob release " << r << dendl;
      if (blob.is_shared()) {
	PExtentVector final;
	c->load_shared_blob(b->shared_blob);
	bool unshare = false;
	bool* unshare_ptr =
	  !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
	for (auto e : r) {
	  b->shared_blob->put_ref(
	    e.offset, e.length, &final,
	    unshare_ptr);
	}
	if (unshare) {
	  ceph_assert(maybe_unshared_blobs);
	  maybe_unshared_blobs->insert(b->shared_blob.get());
	}
	dout(20) << __func__ << " shared_blob release " << final
		 << " from " << *b->shared_blob << dendl;
	txc->write_shared_blob(b->shared_blob);
	r.clear();
	r.swap(final);
      }
    }
    // we can't invalidate our logical extents as we drop them because
    // other lextents (either in our onode or others) may still
    // reference them.  but we can throw out anything that is no
    // longer allocated.  Note that this will leave behind edge bits
    // that are no longer referenced but not deallocated (until they
    // age out of the cache naturally).
    b->discard_unallocated(c.get());
    for (auto e : r) {
      dout(20) << __func__ << " release " << e << dendl;
      txc->released.insert(e.offset, e.length);
      txc->statfs_delta.allocated() -= e.length;
      if (blob.is_compressed()) {
	txc->statfs_delta.compressed_allocated() -= e.length;
      }
    }

    if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
      dout(20) << __func__ << " spanning_blob_map removing empty " << *b
	       << dendl;
      o->extent_map.spanning_blob_map.erase(b->id);
    }
  }
}
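
// _do_write_data splits a write on min_alloc_size boundaries: an unaligned
// head and tail go through the small-write path, the aligned middle through
// the big-write path.  For example, assuming min_alloc_size = 0x1000, a
// write of 0x1800~0x3000 ends at 0x4800 and becomes
//   head   0x1800~0x0800  -> _do_write_small
//   middle 0x2000~0x2000  -> _do_write_big
//   tail   0x4000~0x0800  -> _do_write_small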
void BlueStore::_do_write_data(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  WriteContext *wctx)
{
  uint64_t end = offset + length;
  bufferlist::iterator p = bl.begin();

  if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
      (length != min_alloc_size)) {
    // we fall within the same block
    _do_write_small(txc, c, o, offset, length, p, wctx);
  } else {
    uint64_t head_offset, head_length;
    uint64_t middle_offset, middle_length;
    uint64_t tail_offset, tail_length;

    head_offset = offset;
    head_length = p2nphase(offset, min_alloc_size);

    tail_offset = p2align(end, min_alloc_size);
    tail_length = p2phase(end, min_alloc_size);

    middle_offset = head_offset + head_length;
    middle_length = length - head_length - tail_length;

    if (head_length) {
      _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
    }

    if (middle_length) {
      _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
    }

    if (tail_length) {
      _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
    }
  }
}
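
// _choose_write_options folds the fadvise flags, the pool options, and the
// onode's allocation hints into a WriteContext: whether to buffer, whether
// and how aggressively to compress, the checksum block order, and the target
// blob size (min vs max compression blob size, capped at max_blob_size and
// floored at 2x min_alloc_size when compressing).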
void BlueStore::_choose_write_options(
  CollectionRef& c,
  OnodeRef o,
  uint32_t fadvise_flags,
  WriteContext *wctx)
{
  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered write" << dendl;
    wctx->buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_write &&
	     (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
			       CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered write" << dendl;
    wctx->buffered = true;
  }

  // apply basic csum block size
  wctx->csum_order = block_size_order;

  // compression parameters
  unsigned alloc_hints = o->onode.alloc_hint_flags;
  auto cm = select_option(
    "compression_mode",
    comp_mode.load(),
    [&]() {
      string val;
      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
	return boost::optional<Compressor::CompressionMode>(
	  Compressor::get_comp_mode_type(val));
      }
      return boost::optional<Compressor::CompressionMode>();
    });

  wctx->compress = (cm != Compressor::COMP_NONE) &&
    ((cm == Compressor::COMP_FORCE) ||
     (cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));

  if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
      (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
		      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {

    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;

    if (o->onode.expected_write_size) {
      wctx->csum_order = std::max(min_alloc_size_order,
				  (uint8_t)ctz(o->onode.expected_write_size));
    } else {
      wctx->csum_order = min_alloc_size_order;
    }

    if (wctx->compress) {
      wctx->target_blob_size = select_option(
	"compression_max_blob_size",
	comp_max_blob_size.load(),
	[&]() {
	  int64_t val;
	  if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
	    return boost::optional<uint64_t>((uint64_t)val);
	  }
	  return boost::optional<uint64_t>();
	});
    }
  } else {
    if (wctx->compress) {
      wctx->target_blob_size = select_option(
	"compression_min_blob_size",
	comp_min_blob_size.load(),
	[&]() {
	  int64_t val;
	  if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
	    return boost::optional<uint64_t>((uint64_t)val);
	  }
	  return boost::optional<uint64_t>();
	});
    }
  }

  uint64_t max_bsize = max_blob_size.load();
  if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
    wctx->target_blob_size = max_bsize;
  }

  // set the min blob size floor at 2x the min_alloc_size, or else we
  // won't be able to allocate a smaller extent for the compressed
  // data.
  if (wctx->compress &&
      wctx->target_blob_size < min_alloc_size * 2) {
    wctx->target_blob_size = min_alloc_size * 2;
  }

  dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
	   << " target_blob_size 0x" << std::hex << wctx->target_blob_size
	   << " compress=" << (int)wctx->compress
	   << " buffered=" << (int)wctx->buffered
	   << std::dec << dendl;
}
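
// _do_gc rewrites the ranges selected for garbage collection: each extent in
// wctx.extents_to_gc is read back and resubmitted through a WriteContext
// forked from the original one, and the caller's dirty range is widened to
// cover whatever was rewritten.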
int BlueStore::_do_gc(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  const WriteContext& wctx,
  uint64_t *dirty_start,
  uint64_t *dirty_end)
{
  bool dirty_range_updated = false;
  WriteContext wctx_gc;
  wctx_gc.fork(wctx); // make a clone for garbage collection

  auto & extents_to_collect = wctx.extents_to_gc;
  for (auto it = extents_to_collect.begin();
       it != extents_to_collect.end();
       ++it) {
    bufferlist bl;
    auto offset = (*it).first;
    auto length = (*it).second;
    dout(20) << __func__ << " processing " << std::hex
	     << offset << "~" << length << std::dec
	     << dendl;
    int r = _do_read(c.get(), o, offset, length, bl, 0);
    ceph_assert(r == (int)length);

    _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
    logger->inc(l_bluestore_gc_merged, length);

    if (*dirty_start > offset) {
      *dirty_start = offset;
      dirty_range_updated = true;
    }

    if (*dirty_end < offset + length) {
      *dirty_end = offset + length;
      dirty_range_updated = true;
    }
  }
  if (dirty_range_updated) {
    o->extent_map.fault_range(db, *dirty_start, *dirty_end);
  }

  dout(30) << __func__ << " alloc write" << dendl;
  int r = _do_alloc_write(txc, c, o, &wctx_gc);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
	 << dendl;
    return r;
  }

  _wctx_finish(txc, c, o, &wctx_gc);
  return 0;
}
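
// _do_write is the top of the write pipeline: choose write options, fault in
// the affected extent-map range, split and queue the data (_do_write_data),
// allocate and issue the I/O (_do_alloc_write), estimate whether rewriting
// adjacent compressed extents would pay off, retire the displaced extents
// (_wctx_finish), and finally run GC over any ranges worth collecting before
// compressing and dirtying the extent map.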
int BlueStore::_do_write(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  uint32_t fadvise_flags)
{
  int r = 0;

  dout(20) << __func__
	   << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length
	   << " - have 0x" << o->onode.size
	   << " (" << std::dec << o->onode.size << ")"
	   << " bytes"
	   << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
	   << dendl;
  _dump_onode<30>(cct, *o);

  if (length == 0) {
    return 0;
  }

  uint64_t end = offset + length;

  GarbageCollector gc(c->store->cct);
  int64_t benefit = 0;
  auto dirty_start = offset;
  auto dirty_end = end;

  WriteContext wctx;
  _choose_write_options(c, o, fadvise_flags, &wctx);
  o->extent_map.fault_range(db, offset, length);
  _do_write_data(txc, c, o, offset, length, bl, &wctx);
  r = _do_alloc_write(txc, c, o, &wctx);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
	 << dendl;
    goto out;
  }

  if (wctx.extents_to_gc.empty() ||
      wctx.extents_to_gc.range_start() > offset ||
      wctx.extents_to_gc.range_end() < offset + length) {
    benefit = gc.estimate(offset,
			  length,
			  o->extent_map,
			  wctx.old_extents,
			  min_alloc_size);
  }

  // NB: _wctx_finish() will empty old_extents
  // so we must do gc estimation before that
  _wctx_finish(txc, c, o, &wctx);
  if (end > o->onode.size) {
    dout(20) << __func__ << " extending size to 0x" << std::hex << end
	     << std::dec << dendl;
    o->onode.size = end;
  }

  if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
    wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
    dout(20) << __func__
	     << " perform garbage collection for compressed extents, "
	     << "expected benefit = " << benefit << " AUs" << dendl;
  }
  if (!wctx.extents_to_gc.empty()) {
    dout(20) << __func__ << " perform garbage collection" << dendl;

    r = _do_gc(txc, c, o,
	       wctx,
	       &dirty_start, &dirty_end);
    if (r < 0) {
      derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
	   << dendl;
      goto out;
    }
    dout(20) << __func__ << " gc range is " << std::hex << dirty_start
	     << "~" << dirty_end - dirty_start << std::dec << dendl;
  }
  o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
  o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);

  r = 0;

 out:
  return r;
}
int BlueStore::_write(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef& o,
		      uint64_t offset, size_t length,
		      bufferlist& bl,
		      uint32_t fadvise_flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_zero(TransContext *txc,
		     CollectionRef& c,
		     OnodeRef& o,
		     uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_zero(txc, c, o, offset, length);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_do_zero(TransContext *txc,
			CollectionRef& c,
			OnodeRef& o,
			uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;

  _dump_onode<30>(cct, *o);

  WriteContext wctx;
  o->extent_map.fault_range(db, offset, length);
  o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
  o->extent_map.dirty_range(offset, length);
  _wctx_finish(txc, c, o, &wctx);

  if (length > 0 && offset + length > o->onode.size) {
    o->onode.size = offset + length;
    dout(20) << __func__ << " extending size to " << offset + length
	     << dendl;
  }
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
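
// _do_truncate punches a hole from the new EOF to the old one and shrinks
// the onode.  If extent-map shards still start at or past the new size, a
// reshard is requested so the shard layout follows the object back down.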
void BlueStore::_do_truncate(
  TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << std::dec << dendl;

  _dump_onode<30>(cct, *o);

  if (offset == o->onode.size)
    return;

  if (offset < o->onode.size) {
    WriteContext wctx;
    uint64_t length = o->onode.size - offset;
    o->extent_map.fault_range(db, offset, length);
    o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
    o->extent_map.dirty_range(offset, length);
    _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);

    // if we have shards past EOF, ask for a reshard
    if (!o->onode.extent_map_shards.empty() &&
	o->onode.extent_map_shards.back().offset >= offset) {
      dout(10) << __func__ << " request reshard past EOF" << dendl;
      if (offset) {
	o->extent_map.request_reshard(offset - 1, offset + length);
      } else {
	o->extent_map.request_reshard(0, length);
      }
    }
  }

  o->onode.size = offset;

  txc->write_onode(o);
}
int BlueStore::_truncate(TransContext *txc,
			 CollectionRef& c,
			 OnodeRef& o,
			 uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << std::dec
	   << dendl;
  int r = 0;
  if (offset >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _do_truncate(txc, c, o, offset);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << std::dec
	   << " = " << r << dendl;
  return r;
}
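
// _do_remove truncates to zero (releasing all space), clears omap and extent
// shards, and deletes the onode key.  For generation (rollback/clone)
// objects it additionally checks whether blobs shared with the head object
// became fully owned by the head: when the head's expected reference map
// matches the persistent one, the blob is unshared and its SHARED_BLOB key
// removed.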
int BlueStore::_do_remove(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o)
{
  set<SharedBlob*> maybe_unshared_blobs;
  bool is_gen = !o->oid.is_no_gen();
  _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
  }
  o->exists = false;
  string key;
  for (auto &s : o->extent_map.shards) {
    dout(20) << __func__ << " removing shard 0x" << std::hex
	     << s.shard_info->offset << std::dec << dendl;
    generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
      [&](const string& final_key) {
	txc->t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }
  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
  txc->note_removed_object(o);
  o->extent_map.clear();
  o->onode = bluestore_onode_t();
  _debug_obj_on_delete(o->oid);

  if (!is_gen || maybe_unshared_blobs.empty()) {
    return 0;
  }

  // see if we can unshare blobs still referenced by the head
  dout(10) << __func__ << " gen and maybe_unshared_blobs "
	   << maybe_unshared_blobs << dendl;
  ghobject_t nogen = o->oid;
  nogen.generation = ghobject_t::NO_GEN;
  OnodeRef h = c->onode_map.lookup(nogen);

  if (!h || !h->exists) {
    return 0;
  }

  dout(20) << __func__ << " checking for unshareable blobs on " << h
	   << " " << h->oid << dendl;
  map<SharedBlob*,bluestore_extent_ref_map_t> expect;
  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
	maybe_unshared_blobs.count(sb)) {
      if (b.is_compressed()) {
	expect[sb].get(0, b.get_ondisk_length());
      } else {
	b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
	    expect[sb].get(off, len);
	    return 0;
	  });
      }
    }
  }

  vector<SharedBlob*> unshared_blobs;
  unshared_blobs.reserve(maybe_unshared_blobs.size());
  for (auto& p : expect) {
    dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
    if (p.first->persistent->ref_map == p.second) {
      SharedBlob *sb = p.first;
      dout(20) << __func__ << " unsharing " << *sb << dendl;
      unshared_blobs.push_back(sb);
      txc->unshare_blob(sb);
      uint64_t sbid = c->make_blob_unshared(sb);
      string key;
      get_shared_blob_key(sbid, &key);
      txc->t->rmkey(PREFIX_SHARED_BLOB, key);
    }
  }

  if (unshared_blobs.empty()) {
    return 0;
  }

  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
	std::find(unshared_blobs.begin(), unshared_blobs.end(),
		  sb) != unshared_blobs.end()) {
      dout(20) << __func__ << " unsharing " << e << dendl;
      bluestore_blob_t& blob = e.blob->dirty_blob();
      blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
      h->extent_map.dirty_range(e.logical_offset, 1);
    }
  }
  txc->write_onode(h);

  return 0;
}
int BlueStore::_remove(TransContext *txc,
		       CollectionRef& c,
		       OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " onode " << o.get()
	   << " txc "<< txc << dendl;
  int r = _do_remove(txc, c, o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_setattr(TransContext *txc,
			CollectionRef& c,
			OnodeRef& o,
			const string& name,
			bufferptr& val)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " " << name << " (" << val.length() << " bytes)"
	   << dendl;
  int r = 0;

  if (val.is_partial()) {
    auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
						       val.length());
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  } else {
    auto& b = o->onode.attrs[name.c_str()] = val;
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " " << name << " (" << val.length() << " bytes)"
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_setattrs(TransContext *txc,
			 CollectionRef& c,
			 OnodeRef& o,
			 const map<string,bufferptr>& aset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " " << aset.size() << " keys"
	   << dendl;
  int r = 0;

  for (map<string,bufferptr>::const_iterator p = aset.begin();
       p != aset.end(); ++p) {
    if (p->second.is_partial()) {
      auto& b = o->onode.attrs[p->first.c_str()] =
	bufferptr(p->second.c_str(), p->second.length());
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    } else {
      auto& b = o->onode.attrs[p->first.c_str()] = p->second;
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " " << aset.size() << " keys"
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattr(TransContext *txc,
		       CollectionRef& c,
		       OnodeRef& o,
		       const string& name)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " " << name << dendl;
  int r = 0;
  auto it = o->onode.attrs.find(name.c_str());
  if (it == o->onode.attrs.end())
    goto out;

  o->onode.attrs.erase(it);
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " " << name << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattrs(TransContext *txc,
			CollectionRef& c,
			OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;

  if (o->onode.attrs.empty())
    goto out;

  o->onode.attrs.clear();
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
{
  const string& omap_prefix = o->get_omap_prefix();
  string prefix, tail;
  o->get_omap_header(&prefix);
  o->get_omap_tail(&tail);
  txc->t->rm_range_keys(omap_prefix, prefix, tail);
  txc->t->rmkey(omap_prefix, tail);
  dout(20) << __func__ << " remove range start: "
	   << pretty_binary_string(prefix) << " end: "
	   << pretty_binary_string(tail) << dendl;
}
int BlueStore::_omap_clear(TransContext *txc,
			   CollectionRef& c,
			   OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o);
    o->onode.clear_omap_flag();
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_setkeys(TransContext *txc,
			     CollectionRef& c,
			     OnodeRef& o,
			     bufferlist &bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  auto p = bl.cbegin();
  __u32 num;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags();
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  string final_key;
  o->get_omap_key(string(), &final_key);
  size_t base_key_len = final_key.size();
  decode(num, p);
  while (num--) {
    string key;
    bufferlist value;
    decode(key, p);
    decode(value, p);
    final_key.resize(base_key_len); // keep prefix
    final_key += key;
    dout(20) << __func__ << " " << pretty_binary_string(final_key)
	     << " <- " << key << dendl;
    txc->t->set(prefix, final_key, value);
  }
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_setheader(TransContext *txc,
			       CollectionRef& c,
			       OnodeRef& o,
			       bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  string key;
  if (!o->onode.has_omap()) {
    if (o->oid.is_pgmeta()) {
      o->onode.set_omap_flags_pgmeta();
    } else {
      o->onode.set_omap_flags();
    }
    txc->write_onode(o);

    const string& prefix = o->get_omap_prefix();
    string key_tail;
    bufferlist tail;
    o->get_omap_tail(&key_tail);
    txc->t->set(prefix, key_tail, tail);
  } else {
    txc->note_modified_object(o);
  }
  const string& prefix = o->get_omap_prefix();
  o->get_omap_header(&key);
  txc->t->set(prefix, key, bl);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkeys(TransContext *txc,
			    CollectionRef& c,
			    OnodeRef& o,
			    bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  auto p = bl.cbegin();
  __u32 num;
  string final_key;

  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->get_omap_key(string(), &final_key);
    size_t base_key_len = final_key.size();
    decode(num, p);
    while (num--) {
      string key;
      decode(key, p);
      final_key.resize(base_key_len); // keep prefix
      final_key += key;
      dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
	       << " <- " << key << dendl;
      txc->t->rmkey(prefix, final_key);
    }
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkey_range(TransContext *txc,
				 CollectionRef& c,
				 OnodeRef& o,
				 const string& first, const string& last)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  string key_first, key_last;
  int r = 0;
  if (!o->onode.has_omap()) {
    goto out;
  }
  {
    const string& prefix = o->get_omap_prefix();
    o->flush();
    o->get_omap_key(first, &key_first);
    o->get_omap_key(last, &key_last);
    txc->t->rm_range_keys(prefix, key_first, key_last);
    dout(20) << __func__ << " remove range start: "
	     << pretty_binary_string(key_first) << " end: "
	     << pretty_binary_string(key_last) << dendl;
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_set_alloc_hint(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& o,
  uint64_t expected_object_size,
  uint64_t expected_write_size,
  uint32_t flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " object_size " << expected_object_size
	   << " write_size " << expected_write_size
	   << " flags " << ceph_osd_alloc_hint_flag_string(flags)
	   << dendl;
  int r = 0;
  o->onode.expected_object_size = expected_object_size;
  o->onode.expected_write_size = expected_write_size;
  o->onode.alloc_hint_flags = flags;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " object_size " << expected_object_size
	   << " write_size " << expected_write_size
	   << " flags " << ceph_osd_alloc_hint_flag_string(flags)
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_clone(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef& oldo,
		      OnodeRef& newo)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
	   << newo->oid << dendl;
  int r = 0;
  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
    derr << __func__ << " mismatched hash on " << oldo->oid
	 << " and " << newo->oid << dendl;
    return -EINVAL;
  }

  _assign_nid(txc, newo);

  // clone data
  oldo->flush();
  _do_truncate(txc, c, newo, 0);
  if (cct->_conf->bluestore_clone_cow) {
    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
  }

  // clone attrs
  newo->onode.attrs = oldo->onode.attrs;

  // clone omap
  if (newo->onode.has_omap()) {
    dout(20) << __func__ << " clearing old omap data" << dendl;
    newo->flush();
    _do_omap_clear(txc, newo);
    newo->onode.clear_omap_flag();
  }
  if (oldo->onode.has_omap()) {
    dout(20) << __func__ << " copying omap data" << dendl;
    if (newo->oid.is_pgmeta()) {
      newo->onode.set_omap_flags_pgmeta();
    } else {
      newo->onode.set_omap_flags();
    }
    const string& prefix = newo->get_omap_prefix();
    KeyValueDB::Iterator it = db->get_iterator(prefix);
    string head, tail;
    oldo->get_omap_header(&head);
    oldo->get_omap_tail(&tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
	dout(30) << __func__ << " reached tail" << dendl;
	break;
      } else {
	dout(30) << __func__ << " got header/data "
		 << pretty_binary_string(it->key()) << dendl;
	string key;
	newo->rewrite_omap_key(it->key(), &key);
	txc->t->set(prefix, key, it->value());
      }
      it->next();
    }
    string new_tail;
    bufferlist new_tail_value;
    newo->get_omap_tail(&new_tail);
    txc->t->set(prefix, new_tail, new_tail_value);
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
	   << newo->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_do_clone_range(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& oldo,
  OnodeRef& newo,
  uint64_t srcoff,
  uint64_t length,
  uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
	   << newo->oid
	   << " 0x" << std::hex << srcoff << "~" << length << " -> "
	   << " 0x" << dstoff << "~" << length << std::dec << dendl;
  oldo->extent_map.fault_range(db, srcoff, length);
  newo->extent_map.fault_range(db, dstoff, length);
  _dump_onode<30>(cct, *oldo);
  _dump_onode<30>(cct, *newo);

  oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
  _dump_onode<30>(cct, *oldo);
  _dump_onode<30>(cct, *newo);
  return 0;
}
int BlueStore::_clone_range(TransContext *txc,
			    CollectionRef& c,
			    OnodeRef& oldo,
			    OnodeRef& newo,
			    uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
	   << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
	   << " to offset 0x" << dstoff << std::dec << dendl;
  int r = 0;

  if (srcoff + length >= OBJECT_MAX_SIZE ||
      dstoff + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
    goto out;
  }
  if (srcoff + length > oldo->onode.size) {
    r = -EINVAL;
    goto out;
  }

  _assign_nid(txc, newo);

  if (length > 0) {
    if (cct->_conf->bluestore_clone_cow) {
      _do_zero(txc, c, newo, dstoff, length);
      _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
    } else {
      bufferlist bl;
      r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
      if (r < 0)
	goto out;
      r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
      if (r < 0)
	goto out;
    }
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
	   << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
	   << " to offset 0x" << dstoff << std::dec
	   << " = " << r << dendl;
  return r;
}
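
// _rename rewrites the object's keys rather than its data: the old onode and
// extent-shard keys are removed, the onode is re-keyed under the new name,
// and the in-memory cache entry is renamed in place.  A reference to the
// renamed onode is kept in the old slot until the transaction commits so a
// concurrent lookup via the old name cannot resurrect stale metadata.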
int BlueStore::_rename(TransContext *txc,
		       CollectionRef& c,
		       OnodeRef& oldo,
		       OnodeRef& newo,
		       const ghobject_t& new_oid)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
	   << new_oid << dendl;
  int r;
  ghobject_t old_oid = oldo->oid;
  mempool::bluestore_cache_other::string new_okey;

  if (newo) {
    if (newo->exists) {
      r = -EEXIST;
      goto out;
    }
    ceph_assert(txc->onodes.count(newo) == 0);
  }

  txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());

  // rewrite shards
  {
    oldo->extent_map.fault_range(db, 0, oldo->onode.size);
    get_object_key(cct, new_oid, &new_okey);
    string key;
    for (auto &s : oldo->extent_map.shards) {
      generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
	[&](const string& final_key) {
	  txc->t->rmkey(PREFIX_OBJ, final_key);
	}
      );
      s.dirty = true;
    }
  }

  newo = oldo;
  txc->write_onode(newo);

  // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
  // Onode in the old slot
  c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
  r = 0;

  // hold a ref to new Onode in old name position, to ensure we don't drop
  // it from the cache before this txc commits (or else someone may come along
  // and read newo's metadata via the old name).
  txc->note_modified_object(oldo);

 out:
  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
	   << new_oid << " = " << r << dendl;
  return r;
}
int BlueStore::_create_collection(
  TransContext *txc,
  const coll_t &cid,
  unsigned bits,
  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
  int r;
  bufferlist bl;

  {
    std::unique_lock l(coll_lock);
    if (*c) {
      r = -EEXIST;
      goto out;
    }
    auto p = new_coll_map.find(cid);
    ceph_assert(p != new_coll_map.end());
    *c = p->second;
    (*c)->cnode.bits = bits;
    coll_map[cid] = *c;
    new_coll_map.erase(p);
  }
  encode((*c)->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(cid), bl);
  r = 0;

 out:
  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
  return r;
}
int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
				  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << dendl;
  int r;

  (*c)->flush_all_but_last();
  {
    std::unique_lock l(coll_lock);
    if (!*c) {
      r = -ENOENT;
      goto out;
    }
    size_t nonexistent_count = 0;
    ceph_assert((*c)->exists);
    if ((*c)->onode_map.map_any([&](OnodeRef o) {
	  if (o->exists) {
	    dout(1) << __func__ << " " << o->oid << " " << o
		    << " exists in onode_map" << dendl;
	    return true;
	  }
	  ++nonexistent_count;
	  return false;
	})) {
      r = -ENOTEMPTY;
      goto out;
    }

    vector<ghobject_t> ls;
    ghobject_t next;
    // Enumerate onodes in db, up to nonexistent_count + 1
    // then check if all of them are marked as non-existent.
    // Bypass the check if (next != ghobject_t::get_max())
    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
			 nonexistent_count + 1, &ls, &next);
    if (r >= 0) {
      // If true, the collection has more objects than nonexistent_count,
      // so bypass the check.
      bool exists = (!next.is_max());
      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
	dout(10) << __func__ << " oid " << *it << dendl;
	auto onode = (*c)->onode_map.lookup(*it);
	exists = !onode || onode->exists;
	if (exists) {
	  dout(1) << __func__ << " " << *it
		  << " exists in db, "
		  << (!onode ? "not present in ram" : "present in ram")
		  << dendl;
	}
      }
      if (!exists) {
	_do_remove_collection(txc, c);
	r = 0;
      } else {
	dout(10) << __func__ << " " << cid
		 << " is non-empty" << dendl;
	r = -ENOTEMPTY;
      }
    }
  }

 out:
  dout(10) << __func__ << " " << cid << " = " << r << dendl;
  return r;
}
void BlueStore::_do_remove_collection(TransContext *txc,
				      CollectionRef *c)
{
  coll_map.erase((*c)->cid);
  txc->removed_collections.push_back(*c);
  (*c)->exists = false;
  _osr_register_zombie((*c)->osr.get());
  txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
  c->reset();
}
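
// Collection split and merge both hinge on split_cache(): deferred writes on
// the source sequencer are drained first so ordering cannot leak across
// collections, cached onodes and shared blobs that hash to the child/target
// move over, and only the collection's cnode.bits needs to be re-persisted.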
int BlueStore::_split_collection(TransContext *txc,
				 CollectionRef& c,
				 CollectionRef& d,
				 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
	   << " bits " << bits << dendl;
  std::unique_lock l(c->lock);
  std::unique_lock l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer.  this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split.  leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // the destination should initially be empty.
  ceph_assert(d->onode_map.empty());
  ceph_assert(d->shared_blob_set.empty());
  ceph_assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits.  note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  ceph_assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
	   << " bits " << bits << " = " << r << dendl;
  return r;
}
int BlueStore::_merge_collection(
  TransContext *txc,
  CollectionRef *c,
  CollectionRef& d,
  unsigned bits)
{
  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
	   << " bits " << bits << dendl;
  std::unique_lock l((*c)->lock);
  std::unique_lock l2(d->lock);
  int r;

  coll_t cid = (*c)->cid;

  // flush all previous deferred writes on the source collection to ensure
  // that all deferred writes complete before we merge as the target collection's
  // sequencer may need to order new ops after those writes.
  _osr_drain((*c)->osr.get());

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split.  leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = cid.is_pg(&pgid);
  ceph_assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  ceph_assert(is_pg);

  // adjust bits.  note that this will be redundant for all but the first
  // merge call for the parent/target.
  d->cnode.bits = bits;

  // behavior depends on target (d) bits, so do this after they are updated.
  (*c)->split_cache(d.get());

  // remove source collection
  {
    std::unique_lock l3(coll_lock);
    _do_remove_collection(txc, c);
  }

  r = 0;

  bufferlist bl;
  encode(d->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(d->cid), bl);

  dout(10) << __func__ << " " << cid << " to " << d->cid << " "
	   << " bits " << bits << " = " << r << dendl;
  return r;
}
void BlueStore::log_latency(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  const char* info) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
	    << ", latency = " << l
	    << info
	    << dendl;
  }
}
void BlueStore::log_latency_fn(
  const char* name,
  int idx,
  const ceph::timespan& l,
  double lat_threshold,
  std::function<string (const ceph::timespan& lat)> fn) const
{
  logger->tinc(idx, l);
  if (lat_threshold > 0.0 &&
      l >= make_timespan(lat_threshold)) {
    dout(0) << __func__ << " slow operation observed for " << name
	    << ", latency = " << l
	    << " " << fn(l)
	    << dendl;
  }
}
#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  pending_kv_ios += txc.ios;
  if (txc.deferred_txn) {
    pending_deferred_ios += txc.ios;
  }

  uint64_t started = 0;
  uint64_t completed = 0;
  if (should_trace(&started, &completed)) {
    txc.tracing = true;
    uint64_t rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate;
    db.get_property(
      "rocksdb.base-level",
      &rocksdb_base_level);
    db.get_property(
      "rocksdb.estimate-pending-compaction-bytes",
      &rocksdb_estimate_pending_compaction_bytes);
    db.get_property(
      "rocksdb.cur-size-all-mem-tables",
      &rocksdb_cur_size_all_mem_tables);
    db.get_property(
      "rocksdb.compaction-pending",
      &rocksdb_compaction_pending);
    db.get_property(
      "rocksdb.mem-table-flush-pending",
      &rocksdb_mem_table_flush_pending);
    db.get_property(
      "rocksdb.num-running-compactions",
      &rocksdb_num_running_compactions);
    db.get_property(
      "rocksdb.num-running-flushes",
      &rocksdb_num_running_flushes);
    db.get_property(
      "rocksdb.actual-delayed-write-rate",
      &rocksdb_actual_delayed_write_rate);

    tracepoint(
      bluestore,
      transaction_initial_state,
      txc.osr->get_sequencer_id(),
      txc.seq,
      throttle_bytes.get_current(),
      throttle_deferred_bytes.get_current(),
      pending_kv_ios,
      pending_deferred_ios,
      started,
      completed,
      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));

    tracepoint(
      bluestore,
      transaction_initial_state_rocksdb,
      txc.osr->get_sequencer_id(),
      txc.seq,
      rocksdb_base_level,
      rocksdb_estimate_pending_compaction_bytes,
      rocksdb_cur_size_all_mem_tables,
      rocksdb_compaction_pending,
      rocksdb_mem_table_flush_pending,
      rocksdb_num_running_compactions,
      rocksdb_num_running_flushes,
      rocksdb_actual_delayed_write_rate);
  }
}
#endif
mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
  TransContext &txc, PerfCounters *logger, int state)
{
  mono_clock::time_point now = mono_clock::now();
  mono_clock::duration lat = now - txc.last_stamp;
  logger->tinc(state, lat);
#if defined(WITH_LTTNG)
  if (txc.tracing &&
      state >= l_bluestore_state_prepare_lat &&
      state <= l_bluestore_state_done_lat) {
    OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
    tracepoint(
      bluestore,
      transaction_state_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      state,
      ceph::to_seconds<double>(lat));
  }
#endif
  txc.last_stamp = now;
  return lat;
}
bool BlueStore::BlueStoreThrottle::try_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  throttle_bytes.get(txc.cost);

  if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
    emit_initial_tracepoint(db, txc, start_throttle_acquire);
    return true;
  } else {
    return false;
  }
}
void BlueStore::BlueStoreThrottle::finish_start_transaction(
  KeyValueDB &db,
  TransContext &txc,
  mono_clock::time_point start_throttle_acquire)
{
  ceph_assert(txc.deferred_txn);
  throttle_deferred_bytes.get(txc.cost);
  emit_initial_tracepoint(db, txc, start_throttle_acquire);
}
#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
{
  pending_kv_ios -= 1;
  ios_completed_since_last_traced++;
  if (txc.tracing) {
    tracepoint(
      bluestore,
      transaction_commit_latency,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(mono_clock::now() - txc.start));
  }
}
#endif
#if defined(WITH_LTTNG)
void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
{
  if (txc.deferred_txn) {
    pending_deferred_ios -= 1;
  }
  if (txc.tracing) {
    mono_clock::time_point now = mono_clock::now();
    mono_clock::duration lat = now - txc.start;
    tracepoint(
      bluestore,
      transaction_total_duration,
      txc.osr->get_sequencer_id(),
      txc.seq,
      ceph::to_seconds<double>(lat));
  }
}
#endif
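
// The DB histogram below buckets key and value sizes into fixed-width slabs
// (KEY_SLAB = 32 bytes, VALUE_SLAB = 64 bytes).  For instance,
// get_key_slab(70) == 2, so a 70-byte key is reported under the range
// "[64,96)".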
// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";

int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}
void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
					       const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    std::max<size_t>(value_size,
		     key_hist[prefix][key_slab].val_map[value_slab].max_len);
}
void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
	f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
	f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}
// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  // globals
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_pgmeta_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  auto start = coarse_mono_clock::now();

  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = std::max(max_key_size, key_size);
    max_value_size = std::max(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
	hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
	num_onodes++;
      } else {
	hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
	num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_PGMETA_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
      num_pgmeta_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  ceph::timespan duration = coarse_mono_clock::now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}
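// Invocation sketch (hypothetical caller; in deployed clusters this is
// typically reached via the OSD admin socket, e.g. a
// "calc_objectstore_db_histogram" command, assuming that hook is wired up
// for this build):
//
//   Formatter *f = Formatter::create("json-pretty");
//   store->generate_db_histogram(f);  // full keyspace scan; can be slow
//   f->flush(std::cout);
//   delete f;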
void BlueStore::_shutdown_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : buffer_cache_shards) {
    i->flush();
    ceph_assert(i->empty());
  }
  for (auto& p : coll_map) {
    p.second->onode_map.clear();
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump<0>(cct);
    }
    ceph_assert(p.second->onode_map.empty());
    ceph_assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
  for (auto i : onode_cache_shards) {
    ceph_assert(i->empty());
  }
}
// For external caller.
// We use a best-effort policy instead, e.g.,
// we don't care if there are still some pinned onodes/data in the cache
// after this command is completed.
int BlueStore::flush_cache(ostream *os)
{
  dout(10) << __func__ << dendl;
  for (auto i : onode_cache_shards) {
    i->flush();
  }
  for (auto i : buffer_cache_shards) {
    i->flush();
  }

  return 0;
}
void BlueStore::_apply_padding(uint64_t head_pad,
			       uint64_t tail_pad,
			       bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
	     << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}
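// Worked example (illustrative numbers only): with a 4096-byte block size,
// a 100-byte write landing at offset 10 within its block gets
// head_pad = 10 and tail_pad = 4096 - (10 + 100) = 3986, so the padded
// bufferlist covers the whole block and can be written out aligned.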
void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
{
  // finalize extent_map shards
  o->extent_map.update(txn, false);
  if (o->extent_map.needs_reshard()) {
    o->extent_map.reshard(db, txn);
    o->extent_map.update(txn, true);
    if (o->extent_map.needs_reshard()) {
      dout(20) << __func__ << " warning: still wants reshard, check options?"
	       << dendl;
      o->extent_map.clear_needs_reshard();
    }
    logger->inc(l_bluestore_onode_reshard);
  }

  // bound encode
  size_t bound = 0;
  denc(o->onode, bound);
  o->extent_map.bound_encode_spanning_blobs(bound);
  if (o->onode.extent_map_shards.empty()) {
    denc(o->extent_map.inline_bl, bound);
  }

  // encode
  bufferlist bl;
  unsigned onode_part, blob_part, extent_part;
  {
    auto p = bl.get_contiguous_appender(bound, true);
    denc(o->onode, p);
    onode_part = p.get_logical_offset();
    o->extent_map.encode_spanning_blobs(p);
    blob_part = p.get_logical_offset() - onode_part;
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, p);
    }
    extent_part = p.get_logical_offset() - onode_part - blob_part;
  }

  dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
	   << " (" << onode_part << " bytes onode + "
	   << blob_part << " bytes spanning blobs + "
	   << extent_part << " bytes inline extents)"
	   << dendl;

  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
}
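// The value stored under PREFIX_OBJ is thus a single contiguous encoding:
//
//   +------------+-----------------+---------------------------+
//   | onode_part | blob_part       | extent_part               |
//   | onode_t    | spanning blobs  | inline extent map (only   |
//   |            |                 | when there are no shards) |
//   +------------+-----------------+---------------------------+
//
// matching the three byte counts reported in the dout(20) message above.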
void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
  std::lock_guard l(qlock);

  if (!disk_size_mismatch_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_DISK_SIZE_MISMATCH",
      disk_size_mismatch_alert);
  }
  if (!legacy_statfs_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_LEGACY_STATFS",
      legacy_statfs_alert);
  }
  if (!spillover_alert.empty() &&
      cct->_conf->bluestore_warn_on_bluefs_spillover) {
    alerts.emplace(
      "BLUEFS_SPILLOVER",
      spillover_alert);
  }
  if (!no_per_pool_omap_alert.empty()) {
    alerts.emplace(
      "BLUESTORE_NO_PER_POOL_OMAP",
      no_per_pool_omap_alert);
  }
  string s0(failed_cmode);

  if (!failed_compressors.empty()) {
    if (!s0.empty()) {
      s0 += ", ";
    }
    s0 += "unable to load:";
    bool first = true;
    for (auto& s : failed_compressors) {
      if (first) {
	first = false;
      } else {
	s0 += ", ";
      }
      s0 += s;
    }
    alerts.emplace(
      "BLUESTORE_NO_COMPRESSION",
      s0);
  }
}
void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
					  size_t extents)
{
  alloc_stats_count++;
  alloc_stats_fragments += extents;
  alloc_stats_size += need;
}
void BlueStore::_record_allocation_stats()
{
  // don't care about data consistency,
  // fields can be partially modified while making the tuple
  auto t0 = std::make_tuple(
    alloc_stats_count.exchange(0),
    alloc_stats_fragments.exchange(0),
    alloc_stats_size.exchange(0));

  dout(0) << " allocation stats probe "
    << probe_count << ":"
    << " cnt: " << std::get<0>(t0)
    << " frags: " << std::get<1>(t0)
    << " size: " << std::get<2>(t0)
    << dendl;

  //
  // Keep the history for probes from the power-of-two sequence:
  // -1, -2, -4, -8, -16
  //
  size_t base = 1;
  for (auto& t : alloc_stats_history) {
    dout(0) << " probe -"
      << base + (probe_count % base) << ": "
      << std::get<0>(t)
      << ", " << std::get<1>(t)
      << ", " << std::get<2>(t)
      << dendl;
    base <<= 1;
  }
  dout(0) << "------------" << dendl;

  auto prev = probe_count++;
  auto mask = (1 << alloc_stats_history.size()) - 1;
  probe_count &= mask;

  for (size_t i = cbits(prev ^ probe_count) - 1; i > 0; --i) {
    alloc_stats_history[i] = alloc_stats_history[i - 1];
  }
  alloc_stats_history[0].swap(t0);
}
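// Worked example (illustrative): with 5 history slots, mask == 0x1f.  Going
// from prev == 3 (0b011) to probe_count == 4 (0b100) gives
// cbits(3 ^ 4) == cbits(7) == 3, so the loop shifts slots [2] <- [1] and
// [1] <- [0] before the fresh probe is swapped into slot [0].  Higher slots
// move only when correspondingly higher bits flip, which is what keeps the
// retained probes at roughly power-of-two ages (-1, -2, -4, ...).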
// ===========================================
// BlueStoreRepairer

size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
  const interval_set<uint64_t>& extents)
{
  ceph_assert(granularity); // initialized
  // can't call for the second time
  ceph_assert(!was_filtered_out);
  ceph_assert(collections_bfs.size() == objects_bfs.size());

  uint64_t prev_pos = 0;
  uint64_t npos = collections_bfs.size();

  bloom_vector collections_reduced;
  bloom_vector objects_reduced;

  for (auto e : extents) {
    if (e.second == 0) {
      continue;
    }
    uint64_t pos = max(e.first / granularity, prev_pos);
    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
    while (pos != npos && pos < end_pos) {
      ceph_assert(collections_bfs[pos].element_count() ==
		  objects_bfs[pos].element_count());
      if (collections_bfs[pos].element_count()) {
	collections_reduced.push_back(std::move(collections_bfs[pos]));
	objects_reduced.push_back(std::move(objects_bfs[pos]));
      }
      ++pos;
    }
    prev_pos = end_pos;
  }
  collections_reduced.swap(collections_bfs);
  objects_reduced.swap(objects_bfs);
  was_filtered_out = true;
  return collections_bfs.size();
}
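// Worked example (illustrative): with granularity == 1 MiB, an extent
// [2.5 MiB, 2.5 MiB + 0.7 MiB) maps to pos == 2 and end_pos == 4, i.e.
// buckets 2 and 3.  Only the bloom filters for buckets that actually hold
// elements survive the filtering; empty buckets are dropped, shrinking
// collections_bfs/objects_bfs to just the regions of interest.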
bool BlueStoreRepairer::remove_key(KeyValueDB *db,
				   const string& prefix,
				   const string& key)
{
  if (!remove_key_txn) {
    remove_key_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  remove_key_txn->rmkey(prefix, key);

  return true;
}
void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
{
  fix_per_pool_omap_txn = db->get_transaction();
  ++to_repair_cnt;
  bufferlist bl;
  bl.append("1");
  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
}
bool BlueStoreRepairer::fix_shared_blob(
  KeyValueDB *db,
  uint64_t sbid,
  const bufferlist* bl)
{
  KeyValueDB::Transaction txn;
  if (fix_misreferences_txn) { // reuse this txn
    txn = fix_misreferences_txn;
  } else {
    if (!fix_shared_blob_txn) {
      fix_shared_blob_txn = db->get_transaction();
    }
    txn = fix_shared_blob_txn;
  }
  string key;
  get_shared_blob_key(sbid, &key);

  ++to_repair_cnt;
  if (bl) {
    txn->set(PREFIX_SHARED_BLOB, key, *bl);
  } else {
    txn->rmkey(PREFIX_SHARED_BLOB, key);
  }
  return true;
}
bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
				   const string& key,
				   const store_statfs_t& new_statfs)
{
  if (!fix_statfs_txn) {
    fix_statfs_txn = db->get_transaction();
  }
  BlueStore::volatile_statfs vstatfs;
  vstatfs = new_statfs;
  bufferlist bl;
  vstatfs.encode(bl);
  ++to_repair_cnt;
  fix_statfs_txn->set(PREFIX_STAT, key, bl);
  return true;
}
bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
				   FreelistManager* fm,
				   uint64_t offset, uint64_t len)
{
  if (!fix_fm_leaked_txn) {
    fix_fm_leaked_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->release(offset, len, fix_fm_leaked_txn);
  return true;
}
bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
				       FreelistManager* fm,
				       uint64_t offset, uint64_t len)
{
  if (!fix_fm_false_free_txn) {
    fix_fm_false_free_txn = db->get_transaction();
  }
  ++to_repair_cnt;
  fm->allocate(offset, len, fix_fm_false_free_txn);
  return true;
}
bool BlueStoreRepairer::fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag)
{
  // this is just a stub to count num of repairs properly,
  // actual repair happens in BlueStore::_close_db_and_around()
  // while doing _sync_bluefs_and_fm
  ++out_of_sync_flag;
  ++to_repair_cnt;
  return true;
}
bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
{
  if (misreferenced_extents.size()) {
    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
    ceph_assert(n > 0);
    if (!fix_misreferences_txn) {
      fix_misreferences_txn = db->get_transaction();
    }
  }
  return true;
}
unsigned BlueStoreRepairer::apply(KeyValueDB* db)
{
  if (fix_per_pool_omap_txn) {
    db->submit_transaction_sync(fix_per_pool_omap_txn);
    fix_per_pool_omap_txn = nullptr;
  }
  if (fix_fm_leaked_txn) {
    db->submit_transaction_sync(fix_fm_leaked_txn);
    fix_fm_leaked_txn = nullptr;
  }
  if (fix_fm_false_free_txn) {
    db->submit_transaction_sync(fix_fm_false_free_txn);
    fix_fm_false_free_txn = nullptr;
  }
  if (remove_key_txn) {
    db->submit_transaction_sync(remove_key_txn);
    remove_key_txn = nullptr;
  }
  if (fix_misreferences_txn) {
    db->submit_transaction_sync(fix_misreferences_txn);
    fix_misreferences_txn = nullptr;
  }
  if (fix_shared_blob_txn) {
    db->submit_transaction_sync(fix_shared_blob_txn);
    fix_shared_blob_txn = nullptr;
  }

  if (fix_statfs_txn) {
    db->submit_transaction_sync(fix_statfs_txn);
    fix_statfs_txn = nullptr;
  }
  unsigned repaired = to_repair_cnt;
  to_repair_cnt = 0;
  return repaired;
}
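// Typical driver sequence (a sketch; fsck's repair path is the real caller):
//
//   BlueStoreRepairer repairer;
//   // ... queue fixes via fix_*()/remove_key() while scanning the store ...
//   repairer.preprocess_misreference(db);  // if misreferences were found
//   unsigned n = repairer.apply(db);       // one sync submit per category,
//                                          // in the fixed order above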
// =======================================================
// RocksDBBlueFSVolumeSelector

uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
  ceph_assert(h != nullptr);
  uint64_t hint = reinterpret_cast<uint64_t>(h);
  uint8_t res;
  switch (hint) {
  case LEVEL_SLOW:
    res = BlueFS::BDEV_SLOW;
    if (db_avail4slow > 0) {
      // considering statically available db space vs.
      // - observed maximums on DB dev for DB/WAL/UNSORTED data
      // - observed maximum spillovers
      uint64_t max_db_use = 0; // max db usage we potentially observed
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB,
					     LEVEL_LOG - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB,
					     LEVEL_WAL - LEVEL_FIRST);
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB,
					     LEVEL_DB - LEVEL_FIRST);
      // this could go to db hence using it in the estimation
      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW,
					     LEVEL_DB - LEVEL_FIRST);

      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
      uint64_t avail = min(
	db_avail4slow,
	max_db_use < db_total ? db_total - max_db_use : 0);

      // considering current DB dev usage for SLOW data
      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB,
					     LEVEL_SLOW - LEVEL_FIRST)) {
	res = BlueFS::BDEV_DB;
      }
    }
    break;
  case LEVEL_LOG:
  case LEVEL_WAL:
    res = BlueFS::BDEV_WAL;
    break;
  case LEVEL_DB:
  default:
    res = BlueFS::BDEV_DB;
    break;
  }
  return res;
}
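// Numeric sketch (illustrative values): with db_total == 30 GiB and the
// observed maxima summing to max_db_use == 12 GiB, the headroom is 18 GiB,
// so avail == min(db_avail4slow, 18 GiB).  A new SLOW-level file is placed
// on the fast DB device only while the SLOW bytes already resident there
// stay below that figure; otherwise it falls back to BDEV_SLOW.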
void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base,
					    paths& res) const
{
  res.emplace_back(base, l_totals[LEVEL_DB - LEVEL_FIRST]);
  res.emplace_back(base + ".slow", l_totals[LEVEL_SLOW - LEVEL_FIRST]);
}
void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(const string& dirname) const {
  uint8_t res = LEVEL_DB;
  if (dirname.length() > 5) {
    // the "db.slow" and "db.wal" directory names are hard-coded to
    // match up with bluestore.  the slow device is always the second
    // one (when a dedicated block.db device is present and used at
    // bdev 0).  the wal device is always last.
    if (boost::algorithm::ends_with(dirname, ".slow")) {
      res = LEVEL_SLOW;
    }
    else if (boost::algorithm::ends_with(dirname, ".wal")) {
      res = LEVEL_WAL;
    }
  }
  return reinterpret_cast<void*>(res);
}
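// Mapping summary: "db" (and anything not matching below) -> LEVEL_DB,
// "db.slow" -> LEVEL_SLOW, "db.wal" -> LEVEL_WAL.  The length() > 5 guard
// simply skips the bare "db" directory, which cannot carry either suffix.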
void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
  auto max_x = per_level_per_dev_usage.get_max_x();
  auto max_y = per_level_per_dev_usage.get_max_y();

  sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
    << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
    << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
    << ", db_avail:" << db_avail4slow << std::endl
    << "Usage matrix:" << std::endl;
  constexpr std::array<const char*, 8> names{ {
    "DEV/LEV",
    "WAL",
    "DB",
    "SLOW",
    "*",
    "*",
    "REAL",
    "FILES",
  } };
  const size_t width = 12;
  for (size_t i = 0; i < names.size(); ++i) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << names[i];
  }
  sout << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(per_level_files[l]) << std::endl;
  }
  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
  sout << "MAXIMUMS:" << std::endl;
  for (size_t l = 0; l < max_y; l++) {
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    switch (l + LEVEL_FIRST) {
    case LEVEL_LOG:
      sout << "LOG"; break;
    case LEVEL_WAL:
      sout << "WAL"; break;
    case LEVEL_DB:
      sout << "DB"; break;
    case LEVEL_SLOW:
      sout << "SLOW"; break;
    case LEVEL_MAX:
      sout << "TOTALS"; break;
    }
    for (size_t d = 0; d < max_x - 1; d++) {
      sout.setf(std::ios::left, std::ios::adjustfield);
      sout.width(width);
      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
    }
    sout.setf(std::ios::left, std::ios::adjustfield);
    sout.width(width);
    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
    if (l < max_y - 1) {
      sout << std::endl;
    }
  }
}
// =======================================================