// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include <sys/types.h>

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/PriorityCache.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#include "perfglue/heap_profiler.h"
#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
using bid_t = decltype(BlueStore::Blob::id);
// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_cache_other);

// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);
const string PREFIX_SUPER = "S";       // field -> value
const string PREFIX_STAT = "T";        // field -> value(int64 array)
const string PREFIX_COLL = "C";        // collection name -> cnode_t
const string PREFIX_OBJ = "O";         // object name -> onode_t
const string PREFIX_OMAP = "M";        // u64 + keyname -> value
const string PREFIX_DEFERRED = "L";    // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";       // u64 offset -> u64 length (freelist)
const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4
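// Illustrative example (values invented): a spanning blob with id 5 whose
// extent starts at the end of the previous one and has a zero blob_offset
// would encode its blobid as
//   (5 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING |
//   BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET == 0x5b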
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 * 'o'
 */
#define ONODE_KEY_SUFFIX 'o'
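// A sketch of the resulting key, assuming the layout above (fields left to
// right; '!' is the escaped-string terminator appended by append_escaped
// below; [name]'!' is present only when the marker is not '='):
//   [shard+0x80][pool+2^63][rev hash][ns]'!'[key-or-name]'!'
//   ['<'|'='|'>']([name]'!')[snap][gen]'o'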
#define EXTENT_SHARD_KEY_SUFFIX 'x'
/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 */
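// For example (illustrative only): "a#b" escapes to "a#23b!" ('#' is 0x23,
// so it becomes "#23", and the trailing '!' terminates the string), while
// "x~y" escapes to "x~7ey!" ('~' is 0x7e).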
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') {
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}
inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - 0x30;
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}
static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex = 0;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr-buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr-buff);
  }
  return p - orig_p;
}
// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i=0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}
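// e.g. (illustrative): the bytes {'S', 0x01, 0x02, 0x03, 0x04} would render
// as 'S'0x01020304 -- printable runs are quoted, and runs of non-printable
// bytes are shown in hex, a whole u32 at a time when possible.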
template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}
static void get_coll_key_range(const coll_t& cid, int bits,
                               string *temp_start, string *temp_end,
                               string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    _key_encode_shard(pgid.shard, start);
    *temp_start = *start;

    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    _key_encode_u32(reverse_hash, start);
    _key_encode_u32(reverse_hash, temp_start);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    _key_encode_u32(end_hash, end);
    _key_encode_u32(end_hash, temp_end);
  } else {
    _key_encode_shard(shard_id_t::NO_SHARD, start);
    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
    *end = *start;
    _key_encode_u32(0, start);
    _key_encode_u32(0xffffffff, end);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }
}
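// Worked example (values invented): for a pg with bits = 4 and
// reverse_hash = 0x30000000, end_hash = 0x30000000 + (1 << 28) = 0x40000000,
// so the collection's object keys span hashes [0x30000000, 0x40000000).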
static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}
template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < 1 + 8 + 4)
    return -1;
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something is wrong.
    return -8;
  }

  return 0;
}
template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = 1 + 8 + 4 +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      assert(r == 0 && t == oid);
    }
  }
}
// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}
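// e.g. (illustrative): for an onode key K and shard offset 0x30000, the
// shard key is K + <u32 0x30000> + 'x'; is_extent_shard_key() below can
// therefore classify a key by looking only at its final byte.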
static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  assert(key->size() > sizeof(uint32_t) + 1);
  assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}
template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}
int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  assert(key.size() > sizeof(uint32_t) + 1);
  assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}
static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}
static void get_omap_header(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('-');
}
// hmm, I don't think there's any need to escape the user key since we
// have a clean prefix.
static void get_omap_key(uint64_t id, const string& key, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('.');
  out->append(key);
}
static void rewrite_omap_key(uint64_t id, string old, string *out)
{
  _key_encode_u64(id, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}
static void decode_omap_key(const string& key, string *user_key)
{
  *user_key = key.substr(sizeof(uint64_t) + 1);
}
static void get_omap_tail(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('~');
}
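// Illustrative layout, assuming id = 0x12 and the separator bytes used
// above: the omap header key is <encoded 0x12>'-', a user key "foo" becomes
// <encoded 0x12>'.'"foo", and the tail is <encoded 0x12>'~'; since
// '-' < '.' < '~' in ASCII, the header sorts before all user keys and the
// tail after them.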
static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}
struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    assert(llen == rlen);
    assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const __le64* lv = (const __le64*)ldata;
    const __le64* rv = (const __le64*)rdata;
    __le64* nv = &(__le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  const char *name() const override {
    return "int64_array";
  }
};
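// Illustrative example (values invented): merging an existing value that
// encodes the le64 array [10, 20] with an operand encoding [1, -1] yields
// [11, 19]; merge_nonexistent simply adopts the operand, so the first merge
// against a missing key behaves like a plain set.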
ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}
void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {
      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.emplace_back(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}
int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t >();
  blob_info_counted = nullptr;

  gc_start_offset = start_offset;
  gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {
      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               start_offset,
                               end_offset,
                               gc_start_offset,
                               gc_end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
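// Worked example (numbers invented): a compressed blob occupying four
// min_alloc_size units on disk whose referenced_bytes drop to zero gives
// blob_expected_for_release = 4; if collecting it is expected to cost one
// new allocation, benefit = 3, and its extents are queued in
// extents_to_collect whenever bluestore_gc_enable_blob_threshold <= 3.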
BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
                                           PerfCounters *logger)
{
  Cache *c = nullptr;

  if (type == "lru")
    c = new LRUCache(cct);
  else if (type == "2q")
    c = new TwoQCache(cct);
  else
    assert(0 == "unrecognized cache type");

  c->logger = logger;
  return c;
}
void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max)
{
  std::lock_guard<std::recursive_mutex> l(lock);
  _trim(onode_max, buffer_max);
}
void BlueStore::Cache::trim_all()
{
  std::lock_guard<std::recursive_mutex> l(lock);
  _trim(0, 0);
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}
void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_size << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  while (buffer_size > buffer_max) {
    auto i = buffer_lru.rbegin();
    if (i == buffer_lru.rend()) {
      // stop if buffer_lru is now empty
      break;
    }

    Buffer *b = &*i;
    assert(b->is_clean());
    dout(20) << __func__ << " rm " << *b << dendl;
    b->space->_rm_buffer(this, b);
  }

  // onodes
  if (onode_max >= onode_lru.size()) {
    return; // don't even try
  }
  uint64_t num = onode_lru.size() - onode_max;

  auto p = onode_lru.end();
  assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << "  " << o->oid << " has " << refs
               << " refs, skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << "  rm " << o->oid << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      assert(num == 1);
    }
    o->get();  // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}
#ifdef DEBUG_CACHE
void BlueStore::LRUCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
    s += i->length;
  }
  if (s != buffer_size) {
    derr << __func__ << " buffer_size " << buffer_size << " actual " << s
         << dendl;
    for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
      derr << __func__ << " " << *i << dendl;
    }
    assert(s == buffer_size);
  }
  dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
           << " ok" << dendl;
}
#endif
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}
void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
{
  dout(20) << __func__ << " level " << level << " near " << near
           << " on " << *b
           << " which has cache_private " << b->cache_private << dendl;
  if (near) {
    b->cache_private = near->cache_private;
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
      break;
    case BUFFER_WARM_OUT:
      assert(b->is_empty());
      buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
      break;
    case BUFFER_HOT:
      buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
      break;
    default:
      assert(0 == "bad cache_private");
    }
  } else if (b->cache_private == BUFFER_NEW) {
    b->cache_private = BUFFER_WARM_IN;
    if (level > 0) {
      buffer_warm_in.push_front(*b);
    } else {
      // take caller hint to start at the back of the warm queue
      buffer_warm_in.push_back(*b);
    }
  } else {
    // we got a hint from discard
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // stay in warm_in.  move to front, even though 2Q doesn't actually
      // do this.
      dout(20) << __func__ << " move to front of warm " << *b << dendl;
      buffer_warm_in.push_front(*b);
      break;
    case BUFFER_WARM_OUT:
      b->cache_private = BUFFER_HOT;
      // move to hot.  fall-thru
    case BUFFER_HOT:
      dout(20) << __func__ << " move to front of hot " << *b << dendl;
      buffer_hot.push_front(*b);
      break;
    default:
      assert(0 == "bad cache_private");
    }
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}
*b
)
962 dout(20) << __func__
<< " " << *b
<< dendl
;
963 if (!b
->is_empty()) {
964 assert(buffer_bytes
>= b
->length
);
965 buffer_bytes
-= b
->length
;
966 assert(buffer_list_bytes
[b
->cache_private
] >= b
->length
);
967 buffer_list_bytes
[b
->cache_private
] -= b
->length
;
969 switch (b
->cache_private
) {
971 buffer_warm_in
.erase(buffer_warm_in
.iterator_to(*b
));
973 case BUFFER_WARM_OUT
:
974 buffer_warm_out
.erase(buffer_warm_out
.iterator_to(*b
));
977 buffer_hot
.erase(buffer_hot
.iterator_to(*b
));
980 assert(0 == "bad cache_private");
984 void BlueStore::TwoQCache::_move_buffer(Cache
*srcc
, Buffer
*b
)
986 TwoQCache
*src
= static_cast<TwoQCache
*>(srcc
);
989 // preserve which list we're on (even if we can't preserve the order!)
990 switch (b
->cache_private
) {
992 assert(!b
->is_empty());
993 buffer_warm_in
.push_back(*b
);
995 case BUFFER_WARM_OUT
:
996 assert(b
->is_empty());
997 buffer_warm_out
.push_back(*b
);
1000 assert(!b
->is_empty());
1001 buffer_hot
.push_back(*b
);
1004 assert(0 == "bad cache_private");
1006 if (!b
->is_empty()) {
1007 buffer_bytes
+= b
->length
;
1008 buffer_list_bytes
[b
->cache_private
] += b
->length
;
void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
{
  dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
  if (!b->is_empty()) {
    assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
    buffer_list_bytes[b->cache_private] += delta;
  }
}
void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_bytes << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  if (buffer_bytes > buffer_max) {
    uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
    uint64_t khot = buffer_max - kin;

    // pre-calculate kout based on average buffer size too,
    // which is typical (the warm_in and hot lists may change later)
    uint64_t kout = 0;
    uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
    if (buffer_num) {
      uint64_t buffer_avg_size = buffer_bytes / buffer_num;
      assert(buffer_avg_size);
      uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
      kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
    }

    if (buffer_list_bytes[BUFFER_HOT] < khot) {
      // hot is small, give slack to warm_in
      kin += khot - buffer_list_bytes[BUFFER_HOT];
    } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
      // warm_in is small, give slack to hot
      khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
    }

    // adjust warm_in list
    int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
    uint64_t evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_warm_in.rbegin();
      if (p == buffer_warm_in.rend()) {
        // stop if warm_in list is now empty
        break;
      }

      Buffer *b = &*p;
      assert(b->is_clean());
      dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
      assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
      buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->state = Buffer::STATE_EMPTY;
      b->data.clear();
      buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
      buffer_warm_out.push_front(*b);
      b->cache_private = BUFFER_WARM_OUT;
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << byte_u_t(evicted)
               << " from warm_in list, done evicting warm_in buffers"
               << dendl;
    }

    // adjust hot list
    to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
    evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_hot.rbegin();
      if (p == buffer_hot.rend()) {
        // stop if hot list is now empty
        break;
      }

      Buffer *b = &*p;
      dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
      assert(b->is_clean());
      // adjust evict size before buffer goes invalid
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->space->_rm_buffer(this, b);
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << byte_u_t(evicted)
               << " from hot list, done evicting hot buffers"
               << dendl;
    }

    // adjust warm out list too, if necessary
    int64_t num = buffer_warm_out.size() - kout;
    while (num-- > 0) {
      Buffer *b = &*buffer_warm_out.rbegin();
      assert(b->is_empty());
      dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
  }

  // onodes
  if (onode_max >= onode_lru.size()) {
    return; // don't even try
  }
  uint64_t num = onode_lru.size() - onode_max;

  auto p = onode_lru.end();
  assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    dout(20) << __func__ << " considering " << o << dendl;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << "  " << o->oid << " has " << refs
               << " refs; skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " " << o->oid << " num=" << num
             << " lru size=" << onode_lru.size() << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      assert(num == 1);
    }
    o->get();  // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}
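// Worked example (numbers invented; the config option names are real): with
// buffer_max = 100 MB and bluestore_2q_cache_kin_ratio = 0.5, kin and khot
// are 50 MB each; if the average buffer is 64 KB, calculated_buffer_num =
// 1600 and, with bluestore_2q_cache_kout_ratio = 0.5, roughly 800 empty
// warm_out entries are retained.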
#ifdef DEBUG_CACHE
void BlueStore::TwoQCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
    s += i->length;
  }

  uint64_t hot_bytes = s;
  if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
    derr << __func__ << " hot_list_bytes "
         << buffer_list_bytes[BUFFER_HOT]
         << " != actual " << hot_bytes
         << dendl;
    assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
  }

  for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
    s += i->length;
  }

  uint64_t warm_in_bytes = s - hot_bytes;
  if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
    derr << __func__ << " warm_in_list_bytes "
         << buffer_list_bytes[BUFFER_WARM_IN]
         << " != actual " << warm_in_bytes
         << dendl;
    assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
  }

  if (s != buffer_bytes) {
    derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
         << dendl;
    assert(s == buffer_bytes);
  }

  dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
           << " ok" << dendl;
}
#endif
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
void BlueStore::BufferSpace::_clear(Cache* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}
int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
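// Illustrative example (values invented): discarding 0x1000~0x1000 from a
// buffer at 0x800~0x2000 is the "drop middle" case above: the buffer is
// truncated to its front 0x800 bytes and a new 0x800-byte buffer is added
// at offset 0x2000 for the surviving tail.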
void BlueStore::BufferSpace::read(
  Cache* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals,
  int flags)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard<std::recursive_mutex> l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      assert(b->end() > offset);

      bool val = false;
      if (flags & BYPASS_CLEAN_CACHE)
        val = b->is_writing();
      else
        val = b->is_writing() || b->is_clean();
      if (val) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = MIN(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch_buffer(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch_buffer(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}
void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
{
  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      b->maybe_rebuild();
      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
      cache->_add_buffer(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }

  cache->_audit("finish_write end");
}
void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard<std::recursive_mutex> lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      cache->_adjust_buffer_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  assert(writing.empty());
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add_onode(o, 1);
  return o;
}
BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard<std::recursive_mutex> l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << dendl;
      cache->_touch_onode(p->second);
      hit = true;
      o = p->second;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}
void BlueStore::OnodeSpace::clear()
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 10) << __func__ << dendl;
  for (auto &p : onode_map) {
    cache->_rm_onode(p.second);
  }
  onode_map.clear();
}
bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  return onode_map.empty();
}
void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_other::string& new_okey)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  assert(po != pn);

  assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << "  removing target " << pn->second
                          << dendl;
    cache->_rm_onode(pn->second);
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add_onode(po->second, 1);

  // add at new position and fix oid, key
  onode_map.insert(make_pair(new_oid, o));
  cache->_touch_onode(o);
  o->oid = new_oid;
  o->key = new_okey;
}
bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second)) {
      return true;
    }
  }
  return false;
}
void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
{
  for (auto& i : onode_map) {
    ldout(cct, lvl) << i.first << " : " << i.second << dendl;
  }
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}
BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}
BlueStore::SharedBlob::~SharedBlob()
{
  if (loaded && persistent) {
    delete persistent;
  }
}
1610 ldout(coll
->store
->cct
, 20) << __func__
<< " " << this
1611 << " removing self from set " << get_parent()
1614 auto coll_snap
= coll
;
1616 std::lock_guard
<std::recursive_mutex
> l(coll_snap
->cache
->lock
);
1617 if (coll_snap
!= coll
) {
1620 if (!coll_snap
->shared_blob_set
.remove(this, true)) {
1624 bc
._clear(coll_snap
->cache
);
1625 coll_snap
->cache
->rm_blob();
1631 void BlueStore::SharedBlob::get_ref(uint64_t offset
, uint32_t length
)
1634 persistent
->ref_map
.get(offset
, length
);
1637 void BlueStore::SharedBlob::put_ref(uint64_t offset
, uint32_t length
,
1639 set
<SharedBlob
*> *maybe_unshared
)
1643 persistent
->ref_map
.put(offset
, length
, r
, maybe_unshared
? &maybe
: nullptr);
1644 if (maybe_unshared
&& maybe
) {
1645 maybe_unshared
->insert(this);
void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
  while (true) {
    Cache *cache = coll->cache;
    std::lock_guard<std::recursive_mutex> l(cache->lock);
    if (coll->cache != cache) {
      ldout(coll->store->cct, 20) << __func__
                                  << " raced with sb cache update, was " << cache
                                  << ", now " << coll->cache << ", retrying"
                                  << dendl;
      continue;
    }
    bc._finish_write(cache, seq);
    break;
  }
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
{
  std::lock_guard<std::mutex> l(lock);
  for (auto& i : sb_map) {
    ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
  }
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
  if (b.shared_blob) {
    out << " " << *b.shared_blob;
  } else {
    out << " (shared_blob=NULL)";
  }
  return out << ")";
}
*coll
)
1702 if (get_blob().is_shared()) {
1705 if (get_blob().is_compressed()) {
1706 bool discard
= false;
1707 bool all_invalid
= true;
1708 for (auto e
: get_blob().get_extents()) {
1709 if (!e
.is_valid()) {
1712 all_invalid
= false;
1715 assert(discard
== all_invalid
); // in case of compressed blob all
1716 // or none pextents are invalid.
1718 shared_blob
->bc
.discard(shared_blob
->get_cache(), 0,
1719 get_blob().get_logical_length());
1723 for (auto e
: get_blob().get_extents()) {
1724 if (!e
.is_valid()) {
1725 ldout(coll
->store
->cct
, 20) << __func__
<< " 0x" << std::hex
<< pos
1727 << std::dec
<< dendl
;
1728 shared_blob
->bc
.discard(shared_blob
->get_cache(), pos
, e
.length
);
1732 if (get_blob().can_prune_tail()) {
1733 dirty_blob().prune_tail();
1734 used_in_blob
.prune_tail(get_blob().get_ondisk_length());
1735 auto cct
= coll
->store
->cct
; //used by dout
1736 dout(20) << __func__
<< " pruned tail, now " << get_blob() << dendl
;
void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // Caller has to initialize the Blob's logical length prior to incrementing
  // references.  Otherwise one can neither determine the required number of
  // counters for per-au tracking nor obtain min_release_size for single
  // counter mode.
  assert(get_blob().get_logical_length() != 0);
  auto cct = coll->store->cct;
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}
bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  auto cct = coll->store->cct;
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);
  r->clear();
  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}
bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  assert(min_alloc_size);
  assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently for the sake of simplicity we omit blob reuse if data is
  // unaligned with csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than current blob len
  target_blob_size = MAX(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data totally stands out of the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = MAX(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }

    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}
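// Illustrative example (values invented): for a mutable blob of logical
// length 0x10000, a write at b_offset 0x10000 of length 0x8000 with
// target_blob_size 0x20000 stands entirely past the blob, so new_blen
// becomes 0x18000; no overflow applies, and the blob is extended via
// add_tail() and reused.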
void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  auto cct = coll->store->cct; //used by dout
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  assert(blob.can_split());
  assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}
#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif
ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
             << ": 0x" << e.blob_offset << "~" << e.length << std::dec
             << " " << *e.blob;
}
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = b->get_referenced_bytes() == 0;
  return oe;
}
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}
void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
      assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << "  inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << "  encode_some needs reshard" << dendl;
            assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << "  shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf->bluestore_extent_map_shard_min_size) {
            assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}
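// Illustrative example (numbers invented; the config option names are
// real): if an encoded shard comes out at 1500 bytes while
// bluestore_extent_map_shard_max_size is 1200, update() calls
// request_reshard() for that range, and the caller is expected to run
// reshard() before retrying the update.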
bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
{
  if (spanning_blob_map.empty())
    return 0;
  bid_t bid = spanning_blob_map.rbegin()->first + 1;
  // bid is valid and available.
  if (bid >= 0)
    return bid;
  // Find next unused bid;
  bid = rand() % (numeric_limits<bid_t>::max() + 1);
  const auto begin_bid = bid;
  do {
    if (!spanning_blob_map.count(bid))
      return bid;
    else {
      bid++;
      if (bid < 0) bid = 0;
    }
  } while (bid != begin_bid);
  assert(0 == "no available blob id");
}
2085 void BlueStore::ExtentMap::reshard(
2087 KeyValueDB::Transaction t
)
2089 auto cct
= onode
->c
->store
->cct
; // used by dout
2091 dout(10) << __func__
<< " 0x[" << std::hex
<< needs_reshard_begin
<< ","
2092 << needs_reshard_end
<< ")" << std::dec
2093 << " of " << onode
->onode
.extent_map_shards
.size()
2094 << " shards on " << onode
->oid
<< dendl
;
2095 for (auto& p
: spanning_blob_map
) {
2096 dout(20) << __func__
<< " spanning blob " << p
.first
<< " " << *p
.second
2099 // determine shard index range
2100 unsigned si_begin
= 0, si_end
= 0;
2101 if (!shards
.empty()) {
2102 while (si_begin
+ 1 < shards
.size() &&
2103 shards
[si_begin
+ 1].shard_info
->offset
<= needs_reshard_begin
) {
2106 needs_reshard_begin
= shards
[si_begin
].shard_info
->offset
;
2107 for (si_end
= si_begin
; si_end
< shards
.size(); ++si_end
) {
2108 if (shards
[si_end
].shard_info
->offset
>= needs_reshard_end
) {
2109 needs_reshard_end
= shards
[si_end
].shard_info
->offset
;
2113 if (si_end
== shards
.size()) {
2114 needs_reshard_end
= OBJECT_MAX_SIZE
;
2116 dout(20) << __func__
<< " shards [" << si_begin
<< "," << si_end
<< ")"
2117 << " over 0x[" << std::hex
<< needs_reshard_begin
<< ","
2118 << needs_reshard_end
<< ")" << std::dec
<< dendl
;
2121 fault_range(db
, needs_reshard_begin
, (needs_reshard_end
- needs_reshard_begin
));
2123 // we may need to fault in a larger interval later must have all
2124 // referring extents for spanning blobs loaded in order to have
2125 // accurate use_tracker values.
2126 uint32_t spanning_scan_begin
= needs_reshard_begin
;
2127 uint32_t spanning_scan_end
= needs_reshard_end
;
2131 for (unsigned i
= si_begin
; i
< si_end
; ++i
) {
2132 generate_extent_shard_key_and_apply(
2133 onode
->key
, shards
[i
].shard_info
->offset
, &key
,
2134 [&](const string
& final_key
) {
2135 t
->rmkey(PREFIX_OBJ
, final_key
);
2140 // calculate average extent size
2142 unsigned extents
= 0;
2143 if (onode
->onode
.extent_map_shards
.empty()) {
2144 bytes
= inline_bl
.length();
2145 extents
= extent_map
.size();
2147 for (unsigned i
= si_begin
; i
< si_end
; ++i
) {
2148 bytes
+= shards
[i
].shard_info
->bytes
;
2149 extents
+= shards
[i
].extents
;
2152 unsigned target
= cct
->_conf
->bluestore_extent_map_shard_target_size
;
2153 unsigned slop
= target
*
2154 cct
->_conf
->bluestore_extent_map_shard_target_size_slop
;
2155 unsigned extent_avg
= bytes
/ MAX(1, extents
);
2156 dout(20) << __func__
<< " extent_avg " << extent_avg
<< ", target " << target
2157 << ", slop " << slop
<< dendl
;
2160 unsigned estimate
= 0;
2161 unsigned offset
= needs_reshard_begin
;
2162 vector
<bluestore_onode_t::shard_info
> new_shard_info
;
2163 unsigned max_blob_end
= 0;
2164 Extent
dummy(needs_reshard_begin
);
2165 for (auto e
= extent_map
.lower_bound(dummy
);
2166 e
!= extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << "  new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << "  new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << "  new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << "  new " << new_shard_info << dendl;
  dout(20) << __func__ << "  old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << "  fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);
    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << " shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }
      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning.  We prefer to cut the blob if we can.  Note
          // that we may have to split it multiple times--potentially at
          // every shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << "    splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << "    adding spanning " << *b << dendl;
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << "    un-spanning " << *e->blob << dendl;
        }
      }
    }
  }

  clear_needs_reshard();
}
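/*
 * Extent shard encoding (encode_some/decode_some below): each extent is
 * written as a varint blobid whose low four bits are flags, followed only
 * by the fields the flags do not already imply.  For example, an extent
 * that starts where the previous one ended, has blob_offset == 0, and
 * reuses the previous length sets CONTIGUOUS|ZEROOFFSET|SAMELENGTH and
 * costs just the blobid varint; a fully general extent also emits the
 * logical_offset delta, blob_offset, and length as low-zero varints.
 * A non-spanning blob is inlined the first time it is seen (tracked via
 * last_encoded_id) and referenced by position thereafter; spanning blobs
 * are referenced by their stable id with BLOBID_FLAG_SPANNING set.
 */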
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  auto cct = onode->c->store->cct; //used by dout
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1;  // so it is always non-zero
        include_blob = true;
        blobid = 0;  // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}
unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  auto cct = onode->c->store->cct; //used by dout
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << "  getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  assert(n == num);
  return num;
}
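/*
 * Spanning blobs (blobs referenced from more than one extent map shard)
 * cannot live inside any single shard, so they are serialized with the
 * onode itself.  The three helpers below bound-encode, encode, and decode
 * that per-onode (id, blob) list.
 */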
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}
void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}
void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}
void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}
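/*
 * Shard lifecycle: fault_range() loads the shards overlapping a logical
 * range from PREFIX_OBJ keys on demand and decodes them into the in-memory
 * extent map; dirty_range() marks already-loaded shards dirty so they get
 * re-encoded and written back.  Marking an unloaded shard dirty is a bug,
 * hence the assert in dirty_range().
 */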
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  assert(last >= start);
  string key;
  while (start <= last) {
    assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            assert(v.length() > 0);
          }
        }
      );
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      assert(p->dirty == false);
      assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}
void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);
  if (start < 0)
    return;

  assert(last >= start);
  while (start <= last) {
    assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
               << std::dec << " is not loaded, can't mark dirty" << dendl;
      assert(0 == "can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}

BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}
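/*
 * compress_extent_map() merges logically and physically adjacent extents
 * that share a blob: an extent and its successor are combined only when
 * the successor starts exactly at the predecessor's logical end, both
 * point at the same blob, their blob offsets are contiguous, and the
 * merge would not cross the next shard boundary (a merged extent must
 * stay within a single shard).
 */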
int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed && onode) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}
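/*
 * punch_hole() walks the lextents overlapping [offset, offset+length) and
 * handles four cases: an extent straddling both ends is split and its
 * middle dereferenced; one overlapping only the front keeps its head; one
 * fully inside the hole is dereferenced whole; and one overlapping only
 * the back keeps its tail.  Dereferenced pieces are queued on old_extents
 * so the caller can release blob space later.
 */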
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent *oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent *oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent *oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent *oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}
BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need to have completely initialized Blob to increment its ref counters.
  assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent from putting reused blob into
  // old_extents list if we overwrite the blob totally
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}
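/*
 * split_blob() cuts blob lb at blob_offset, moving the bytes at and past
 * the cut into a new right-hand blob rb.  Extents referencing the split
 * region are then either split themselves (when they straddle pos) or
 * simply re-pointed at rb with their blob_offset rebased.
 */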
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  auto cct = onode->c->store->cct; //used by dout

  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << "  split " << *ep << dendl;
      dout(30) << __func__ << "     to " << *ne << dendl;
    } else {
      // switch blob
      assert(ep->blob_offset >= blob_offset);

      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << "  adjusted " << *ep << dendl;
    }
  }
  return rb;
}

// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    std::unique_lock<std::mutex> l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}
// =======================================================

// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  assert((loffs % min_alloc_size) == 0);
  assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
      auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
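// Worked example for the check above: with min_alloc_size = 0x10000, a
// queued write at 0x12000~0x3000 is rounded out to [0x10000, 0x20000), so
// any new write touching that rounded interval on the same blob conflicts.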
// =======================================================

// DeferredBatch
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}
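/*
 * _discard() removes any queued deferred I/O overlapping [offset,
 * offset+length): a preceding entry extending into the range is truncated
 * (and a surviving tail re-inserted past the range), entries fully inside
 * are dropped, and a trailing overlap is re-keyed at offset+length.
 * seq_bytes is adjusted to keep per-sequence byte counts in sync; _audit()
 * verifies that invariant.
 */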
void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << "  keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << "  keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << "  truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << "  drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    assert(i->second >= 0);
    p = iomap.erase(p);
  }
}
void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0;  // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  assert(sb == seq_bytes);
}
// =======================================================

// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
  : store(ns),
    cache(c),
    cid(cid),
    lock("BlueStore::Collection::lock", true, false),
    exists(true),
    onode_map(c)
{
}

void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}
void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {
    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      assert(0 == "uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    bufferlist::iterator p = v.begin();
    ::decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}
void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}
uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}
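/*
 * Onode lookup: try the per-collection cache first, then fall back to a
 * PREFIX_OBJ fetch keyed by the escaped object key.  A miss with
 * create == false returns a null ref; otherwise a fresh in-memory onode is
 * created, or the stored one is decoded along with its spanning blobs and
 * (for unsharded onodes) the inline extent map.
 */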
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create)
{
  assert(create ? lock.is_wlocked() : lock.is_locked());

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  mempool::bluestore_cache_other::string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
  ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  Onode *on;
  if (v.length() == 0) {
    assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
        !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    assert(r >= 0);
    on = new Onode(this, oid, key);
    on->exists = true;
    bufferptr::iterator p = v.front().begin_deep();
    on->onode.decode(p);
    for (auto& i : on->onode.attrs) {
      i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }

    // initialize extent_map
    on->extent_map.decode_spanning_blobs(p);
    if (on->onode.extent_map_shards.empty()) {
      denc(on->extent_map.inline_bl, p);
      on->extent_map.decode_some(on->extent_map.inline_bl);
      on->extent_map.inline_bl.reassign_to_mempool(
        mempool::mempool_bluestore_cache_other);
    } else {
      on->extent_map.init_shards(false, false);
    }
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  // lock (one or both) cache shards
  std::lock(cache->lock, dest->cache->lock);
  std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
  std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ++p;
    } else {
      OnodeRef o = p->second;
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      cache->_rm_onode(p->second);
      p = onode_map.onode_map.erase(p);

      o->c = dest;
      dest->cache->_add_onode(o, 1);
      dest->onode_map.onode_map[o->oid] = o;
      dest->onode_map.cache = dest->cache;

      // move over shared blobs and buffers.  cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << "   moving " << *i.second
                                    << dendl;
              dest->cache->_move_buffer(cache, i.second.get());
            }
          }
        }
      }
    }
  }
}
// =======================================================

// MempoolThread
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "

void *BlueStore::MempoolThread::entry()
{
  Mutex::Locker l(lock);

  std::list<PriorityCache::PriCache *> caches;
  caches.push_back(store->db);
  caches.push_back(&meta_cache);
  caches.push_back(&data_cache);
  autotune_cache_size = store->osd_memory_cache_min;

  utime_t next_balance = ceph_clock_now();
  utime_t next_resize = ceph_clock_now();

  bool interval_stats_trim = false;
  bool interval_stats_resize = false;
  while (!stop) {
    _adjust_cache_settings();

    // Before we trim, check and see if it's time to rebalance/resize.
    double autotune_interval = store->cache_autotune_interval;
    double resize_interval = store->osd_memory_cache_resize_interval;

    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
      // Log events at 5 instead of 20 when balance happens.
      interval_stats_resize = true;
      interval_stats_trim = true;
      if (store->cache_autotune) {
        _balance_cache(caches);
      }

      next_balance = ceph_clock_now();
      next_balance += autotune_interval;
    }
    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
      if (ceph_using_tcmalloc() && store->cache_autotune) {
        _tune_cache_size(interval_stats_resize);
        interval_stats_resize = false;
      }
      next_resize = ceph_clock_now();
      next_resize += resize_interval;
    }

    // Now Trim
    _trim_shards(interval_stats_trim);
    interval_stats_trim = false;

    store->_update_cache_logger();

    utime_t wait;
    wait += store->cct->_conf->bluestore_cache_trim_interval;
    cond.WaitInterval(lock, wait);
  }
  stop = false;
  return NULL;
}
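/*
 * The helpers below implement the phases driven by entry():
 * _balance_cache() redistributes the autotuned budget across the kv, meta,
 * and data caches; _tune_cache_size() resizes the overall budget from
 * tcmalloc heap statistics; _trim_shards() applies the resulting per-shard
 * onode/buffer limits.
 */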
void BlueStore::MempoolThread::_adjust_cache_settings()
{
  store->db->set_cache_ratio(store->cache_kv_ratio);
  meta_cache.set_cache_ratio(store->cache_meta_ratio);
  data_cache.set_cache_ratio(store->cache_data_ratio);
}
void BlueStore::MempoolThread::_trim_shards(bool interval_stats)
{
  auto cct = store->cct;
  size_t num_shards = store->cache_shards.size();

  int64_t kv_used = store->db->get_cache_usage();
  int64_t meta_used = meta_cache._get_used_bytes();
  int64_t data_used = data_cache._get_used_bytes();

  uint64_t cache_size = store->cache_size;
  int64_t kv_alloc =
      static_cast<int64_t>(store->db->get_cache_ratio() * cache_size);
  int64_t meta_alloc =
      static_cast<int64_t>(meta_cache.get_cache_ratio() * cache_size);
  int64_t data_alloc =
      static_cast<int64_t>(data_cache.get_cache_ratio() * cache_size);

  if (store->cache_autotune) {
    cache_size = autotune_cache_size;

    kv_alloc = store->db->get_cache_bytes();
    meta_alloc = meta_cache.get_cache_bytes();
    data_alloc = data_cache.get_cache_bytes();
  }

  if (interval_stats) {
    ldout(cct, 5) << __func__ << " cache_size: " << cache_size
                  << " kv_alloc: " << kv_alloc
                  << " kv_used: " << kv_used
                  << " meta_alloc: " << meta_alloc
                  << " meta_used: " << meta_used
                  << " data_alloc: " << data_alloc
                  << " data_used: " << data_used << dendl;
  } else {
    ldout(cct, 20) << __func__ << " cache_size: " << cache_size
                   << " kv_alloc: " << kv_alloc
                   << " kv_used: " << kv_used
                   << " meta_alloc: " << meta_alloc
                   << " meta_used: " << meta_used
                   << " data_alloc: " << data_alloc
                   << " data_used: " << data_used << dendl;
  }

  uint64_t max_shard_onodes = static_cast<uint64_t>(
      (meta_alloc / (double) num_shards) / meta_cache.get_bytes_per_onode());
  uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);

  ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
                 << " max_shard_buffer: " << max_shard_buffer << dendl;

  for (auto i : store->cache_shards) {
    i->trim(max_shard_onodes, max_shard_buffer);
  }
}
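// Worked example for the resize step below: with osd_memory_target = 4 GiB
// and 3 GiB currently mapped, ratio = 1 - 3/4 = 0.25, so the cache grows by
// a quarter of the remaining headroom (cache_max - new_size); when mapped
// exceeds the target, the symmetric rule shrinks the cache toward
// cache_min.  Hence the comment below: approach slowly, bounce away fast.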
void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats)
{
  auto cct = store->cct;
  uint64_t target = store->osd_memory_target;
  uint64_t base = store->osd_memory_base;
  double fragmentation = store->osd_memory_expected_fragmentation;
  uint64_t cache_min = store->osd_memory_cache_min;
  uint64_t cache_max = cache_min;
  uint64_t limited_target = (1.0 - fragmentation) * target;
  if (limited_target > base + cache_min) {
    cache_max = limited_target - base;
  }

  size_t heap_size = 0;
  size_t unmapped = 0;
  uint64_t mapped = 0;

  ceph_heap_release_free_memory();
  ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
  ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
  mapped = heap_size - unmapped;

  uint64_t new_size = autotune_cache_size;
  new_size = (new_size < cache_max) ? new_size : cache_max;
  new_size = (new_size > cache_min) ? new_size : cache_min;

  // Approach the min/max slowly, but bounce away quickly.
  if ((uint64_t) mapped < target) {
    double ratio = 1 - ((double) mapped / target);
    new_size += ratio * (cache_max - new_size);
  } else {
    double ratio = 1 - ((double) target / mapped);
    new_size -= ratio * (new_size - cache_min);
  }

  if (interval_stats) {
    ldout(cct, 5) << __func__
                  << " target: " << target
                  << " heap: " << heap_size
                  << " unmapped: " << unmapped
                  << " mapped: " << mapped
                  << " old cache_size: " << autotune_cache_size
                  << " new cache size: " << new_size << dendl;
  } else {
    ldout(cct, 20) << __func__
                   << " target: " << target
                   << " heap: " << heap_size
                   << " unmapped: " << unmapped
                   << " mapped: " << mapped
                   << " old cache_size: " << autotune_cache_size
                   << " new cache size: " << new_size << dendl;
  }
  autotune_cache_size = new_size;
}
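/*
 * Cache balancing runs in two passes: first each PriorityCache priority
 * level is funded in order via _balance_cache_pri(), then any remainder is
 * split according to the caches' default ratios before the new sizes are
 * committed.
 */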
void BlueStore::MempoolThread::_balance_cache(
    const std::list<PriorityCache::PriCache *>& caches)
{
  int64_t mem_avail = autotune_cache_size;

  // Assign memory for each priority level
  for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
    ldout(store->cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;
    PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
    _balance_cache_pri(&mem_avail, caches, pri);
  }
  // Assign any leftover memory based on the default ratios.
  if (mem_avail > 0) {
    for (auto it = caches.begin(); it != caches.end(); it++) {
      int64_t fair_share =
          static_cast<int64_t>((*it)->get_cache_ratio() * mem_avail);
      if (fair_share > 0) {
        (*it)->add_cache_bytes(PriorityCache::Priority::LAST, fair_share);
      }
    }
  }
  // assert if we assigned more memory than is available.
  assert(mem_avail >= 0);

  // Finally commit the new cache sizes
  for (auto it = caches.begin(); it != caches.end(); it++) {
    (*it)->commit_cache_size();
  }
}
void BlueStore::MempoolThread::_balance_cache_pri(int64_t *mem_avail,
    const std::list<PriorityCache::PriCache *>& caches, PriorityCache::Priority pri)
{
  std::list<PriorityCache::PriCache *> tmp_caches = caches;
  double cur_ratios = 0;
  double new_ratios = 0;

  // Zero this priority's bytes, sum the initial ratios.
  for (auto it = tmp_caches.begin(); it != tmp_caches.end(); it++) {
    (*it)->set_cache_bytes(pri, 0);
    cur_ratios += (*it)->get_cache_ratio();
  }

  // For this priority, loop until caches are satisfied or we run out of memory.
  // Since we can't allocate fractional bytes, stop if we have fewer bytes left
  // than the number of participating caches.
  while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
    uint64_t total_assigned = 0;

    for (auto it = tmp_caches.begin(); it != tmp_caches.end(); ) {
      int64_t cache_wants = (*it)->request_cache_bytes(pri, store->cache_autotune_chunk_size);

      // Usually the ratio should be set to the fraction of the current caches'
      // assigned ratio compared to the total ratio of all caches that still
      // want memory.  There is a special case where the only caches left are
      // all assigned 0% ratios but still want memory.  In that case, give
      // them an equal shot at the remaining memory for this priority.
      double ratio = 1.0 / tmp_caches.size();
      if (cur_ratios > 0) {
        ratio = (*it)->get_cache_ratio() / cur_ratios;
      }
      int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);

      if (cache_wants > fair_share) {
        // If we want too much, take what we can get but stick around for more
        (*it)->add_cache_bytes(pri, fair_share);
        total_assigned += fair_share;

        new_ratios += (*it)->get_cache_ratio();
        ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
                              << " wanted: " << cache_wants << " fair_share: " << fair_share
                              << " mem_avail: " << *mem_avail
                              << " staying in list.  Size: " << tmp_caches.size()
                              << dendl;
        ++it;
      } else {
        // Otherwise assign only what we want
        if (cache_wants > 0) {
          (*it)->add_cache_bytes(pri, cache_wants);
          total_assigned += cache_wants;

          ldout(store->cct, 20) << __func__ << " " << (*it)->get_cache_name()
                                << " wanted: " << cache_wants << " fair_share: " << fair_share
                                << " mem_avail: " << *mem_avail
                                << " removing from list.  New size: " << tmp_caches.size() - 1
                                << dendl;
        }
        // Either the cache didn't want anything or got what it wanted, so remove it from the tmp list.
        it = tmp_caches.erase(it);
      }
    }
    // Reset the ratios
    *mem_avail -= total_assigned;
    cur_ratios = new_ratios;
    new_ratios = 0;
  }
}
// =======================================================

// OmapIteratorImpl
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    get_omap_key(o->onode.nid, string(), &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
  }
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    string key;
    get_omap_key(o->onode.nid, after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    string key;
    get_omap_key(o->onode.nid, to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  RWLock::RLocker l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second <= tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}

int BlueStore::OmapIteratorImpl::next(bool validate)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    it->next(validate);
    return 0;
  } else {
    return -1;
  }
}

string BlueStore::OmapIteratorImpl::key()
{
  RWLock::RLocker l(c->lock);
  assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  decode_omap_key(db_key, &user_key);
  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  RWLock::RLocker l(c->lock);
  assert(it->valid());
  return it->value();
}
// =====================================

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "

static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}

BlueStore::BlueStore(CephContext *cct, const string& path)
  : ObjectStore(cct, path),
    throttle_bytes(cct, "bluestore_throttle_bytes",
                   cct->_conf->bluestore_throttle_bytes),
    throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                            cct->_conf->bluestore_throttle_bytes +
                            cct->_conf->bluestore_throttle_deferred_bytes),
    deferred_finisher(cct, "defered_finisher", "dfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf->add_observer(this);
  set_cache_shards(1);
}

BlueStore::BlueStore(CephContext *cct,
  const string& path,
  uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle_bytes(cct, "bluestore_throttle_bytes",
                   cct->_conf->bluestore_throttle_bytes),
    throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                            cct->_conf->bluestore_throttle_bytes +
                            cct->_conf->bluestore_throttle_deferred_bytes),
    deferred_finisher(cct, "defered_finisher", "dfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf->add_observer(this);
  set_cache_shards(1);
}

BlueStore::~BlueStore()
{
  for (auto f : finishers) {
    delete f;
  }
  finishers.clear();

  cct->_conf->remove_observer(this);
  _shutdown_logger();
  assert(!mounted);
  assert(db == NULL);
  assert(bluefs == NULL);
  assert(fsid_fd < 0);
  assert(path_fd < 0);
  for (auto i : cache_shards) {
    delete i;
  }
  cache_shards.clear();
}
const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    NULL
  };
  return KEYS;
}

void BlueStore::handle_conf_change(const struct md_config_t *conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes")) {
    throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
    throttle_deferred_bytes.reset_max(
      conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
  }
  if (changed.count("bluestore_throttle_deferred_bytes")) {
    throttle_deferred_bytes.reset_max(
      conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
  }
}
void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
  }

  compressor = nullptr;

  if (comp_mode == Compressor::COMP_NONE) {
    dout(10) << __func__ << " compression mode set to 'none', "
             << "ignore other compression settings" << dendl;
    return;
  }

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
           << dendl;
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << dendl;
}
void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}
3913 if (cct
->_conf
->bluestore_throttle_cost_per_io
) {
3914 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io
;
3917 if (bdev
->is_rotational()) {
3918 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io_hdd
;
3920 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io_ssd
;
3924 dout(10) << __func__
<< " throttle_cost_per_io " << throttle_cost_per_io
void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}
void BlueStore::_set_finisher_num()
{
  if (cct->_conf->bluestore_shard_finishers) {
    if (cct->_conf->osd_op_num_shards) {
      m_finisher_num = cct->_conf->osd_op_num_shards;
    } else {
      assert(bdev);
      if (bdev->is_rotational()) {
        m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
      } else {
        m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
      }
    }
  }
  assert(m_finisher_num != 0);
}
int BlueStore::_set_cache_sizes()
{
  assert(bdev);
  cache_autotune = cct->_conf->get_val<bool>("bluestore_cache_autotune");
  cache_autotune_chunk_size =
      cct->_conf->get_val<uint64_t>("bluestore_cache_autotune_chunk_size");
  cache_autotune_interval =
      cct->_conf->get_val<double>("bluestore_cache_autotune_interval");
  osd_memory_target = cct->_conf->get_val<uint64_t>("osd_memory_target");
  osd_memory_base = cct->_conf->get_val<uint64_t>("osd_memory_base");
  osd_memory_expected_fragmentation =
      cct->_conf->get_val<double>("osd_memory_expected_fragmentation");
  osd_memory_cache_min = cct->_conf->get_val<uint64_t>("osd_memory_cache_min");
  osd_memory_cache_resize_interval =
      cct->_conf->get_val<double>("osd_memory_cache_resize_interval");

  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (bdev->is_rotational()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }

  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }

  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }

  cache_data_ratio =
    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }

  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}
int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  assert(r == 0);
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}
4054 void BlueStore::_init_logger()
4056 PerfCountersBuilder
b(cct
, "bluestore",
4057 l_bluestore_first
, l_bluestore_last
);
4058 b
.add_time_avg(l_bluestore_kv_flush_lat
, "kv_flush_lat",
4059 "Average kv_thread flush latency",
4060 "fl_l", PerfCountersBuilder::PRIO_INTERESTING
);
4061 b
.add_time_avg(l_bluestore_kv_commit_lat
, "kv_commit_lat",
4062 "Average kv_thread commit latency");
4063 b
.add_time_avg(l_bluestore_kv_lat
, "kv_lat",
4064 "Average kv_thread sync latency",
4065 "k_l", PerfCountersBuilder::PRIO_INTERESTING
);
4066 b
.add_time_avg(l_bluestore_state_prepare_lat
, "state_prepare_lat",
4067 "Average prepare state latency");
4068 b
.add_time_avg(l_bluestore_state_aio_wait_lat
, "state_aio_wait_lat",
4069 "Average aio_wait state latency",
4070 "io_l", PerfCountersBuilder::PRIO_INTERESTING
);
4071 b
.add_time_avg(l_bluestore_state_io_done_lat
, "state_io_done_lat",
4072 "Average io_done state latency");
4073 b
.add_time_avg(l_bluestore_state_kv_queued_lat
, "state_kv_queued_lat",
4074 "Average kv_queued state latency");
4075 b
.add_time_avg(l_bluestore_state_kv_committing_lat
, "state_kv_commiting_lat",
4076 "Average kv_commiting state latency");
4077 b
.add_time_avg(l_bluestore_state_kv_done_lat
, "state_kv_done_lat",
4078 "Average kv_done state latency");
4079 b
.add_time_avg(l_bluestore_state_deferred_queued_lat
, "state_deferred_queued_lat",
4080 "Average deferred_queued state latency");
4081 b
.add_time_avg(l_bluestore_state_deferred_aio_wait_lat
, "state_deferred_aio_wait_lat",
4082 "Average aio_wait state latency");
4083 b
.add_time_avg(l_bluestore_state_deferred_cleanup_lat
, "state_deferred_cleanup_lat",
4084 "Average cleanup state latency");
4085 b
.add_time_avg(l_bluestore_state_finishing_lat
, "state_finishing_lat",
4086 "Average finishing state latency");
4087 b
.add_time_avg(l_bluestore_state_done_lat
, "state_done_lat",
4088 "Average done state latency");
4089 b
.add_time_avg(l_bluestore_throttle_lat
, "throttle_lat",
4090 "Average submit throttle latency",
4091 "th_l", PerfCountersBuilder::PRIO_CRITICAL
);
4092 b
.add_time_avg(l_bluestore_submit_lat
, "submit_lat",
4093 "Average submit latency",
4094 "s_l", PerfCountersBuilder::PRIO_CRITICAL
);
4095 b
.add_time_avg(l_bluestore_commit_lat
, "commit_lat",
4096 "Average commit latency",
4097 "c_l", PerfCountersBuilder::PRIO_CRITICAL
);
4098 b
.add_time_avg(l_bluestore_read_lat
, "read_lat",
4099 "Average read latency",
4100 "r_l", PerfCountersBuilder::PRIO_CRITICAL
);
4101 b
.add_time_avg(l_bluestore_read_onode_meta_lat
, "read_onode_meta_lat",
4102 "Average read onode metadata latency");
4103 b
.add_time_avg(l_bluestore_read_wait_aio_lat
, "read_wait_aio_lat",
4104 "Average read latency");
4105 b
.add_time_avg(l_bluestore_compress_lat
, "compress_lat",
4106 "Average compress latency");
4107 b
.add_time_avg(l_bluestore_decompress_lat
, "decompress_lat",
4108 "Average decompress latency");
4109 b
.add_time_avg(l_bluestore_csum_lat
, "csum_lat",
4110 "Average checksum latency");
4111 b
.add_u64_counter(l_bluestore_compress_success_count
, "compress_success_count",
4112 "Sum for beneficial compress ops");
4113 b
.add_u64_counter(l_bluestore_compress_rejected_count
, "compress_rejected_count",
4114 "Sum for compress ops rejected due to low net gain of space");
4115 b
.add_u64_counter(l_bluestore_write_pad_bytes
, "write_pad_bytes",
4116 "Sum for write-op padded bytes", NULL
, 0, unit_t(BYTES
));
4117 b
.add_u64_counter(l_bluestore_deferred_write_ops
, "deferred_write_ops",
4118 "Sum for deferred write op");
4119 b
.add_u64_counter(l_bluestore_deferred_write_bytes
, "deferred_write_bytes",
4120 "Sum for deferred write bytes", "def", 0, unit_t(BYTES
));
4121 b
.add_u64_counter(l_bluestore_write_penalty_read_ops
, "write_penalty_read_ops",
4122 "Sum for write penalty read ops");
4123 b
.add_u64(l_bluestore_allocated
, "bluestore_allocated",
4124 "Sum for allocated bytes");
4125 b
.add_u64(l_bluestore_stored
, "bluestore_stored",
4126 "Sum for stored bytes");
4127 b
.add_u64(l_bluestore_compressed
, "bluestore_compressed",
4128 "Sum for stored compressed bytes");
4129 b
.add_u64(l_bluestore_compressed_allocated
, "bluestore_compressed_allocated",
4130 "Sum for bytes allocated for compressed data");
4131 b
.add_u64(l_bluestore_compressed_original
, "bluestore_compressed_original",
4132 "Sum for original bytes that were compressed");
4134 b
.add_u64(l_bluestore_onodes
, "bluestore_onodes",
4135 "Number of onodes in cache");
4136 b
.add_u64_counter(l_bluestore_onode_hits
, "bluestore_onode_hits",
4137 "Sum for onode-lookups hit in the cache");
4138 b
.add_u64_counter(l_bluestore_onode_misses
, "bluestore_onode_misses",
4139 "Sum for onode-lookups missed in the cache");
4140 b
.add_u64_counter(l_bluestore_onode_shard_hits
, "bluestore_onode_shard_hits",
4141 "Sum for onode-shard lookups hit in the cache");
4142 b
.add_u64_counter(l_bluestore_onode_shard_misses
,
4143 "bluestore_onode_shard_misses",
4144 "Sum for onode-shard lookups missed in the cache");
4145 b
.add_u64(l_bluestore_extents
, "bluestore_extents",
4146 "Number of extents in cache");
4147 b
.add_u64(l_bluestore_blobs
, "bluestore_blobs",
4148 "Number of blobs in cache");
4149 b
.add_u64(l_bluestore_buffers
, "bluestore_buffers",
4150 "Number of buffers in cache");
4151 b
.add_u64(l_bluestore_buffer_bytes
, "bluestore_buffer_bytes",
4152 "Number of buffer bytes in cache", NULL
, 0, unit_t(BYTES
));
4153 b
.add_u64_counter(l_bluestore_buffer_hit_bytes
, "bluestore_buffer_hit_bytes",
4154 "Sum for bytes of read hit in the cache", NULL
, 0, unit_t(BYTES
));
4155 b
.add_u64_counter(l_bluestore_buffer_miss_bytes
, "bluestore_buffer_miss_bytes",
4156 "Sum for bytes of read missed in the cache", NULL
, 0, unit_t(BYTES
));
4158 b
.add_u64_counter(l_bluestore_write_big
, "bluestore_write_big",
4159 "Large aligned writes into fresh blobs");
4160 b
.add_u64_counter(l_bluestore_write_big_bytes
, "bluestore_write_big_bytes",
4161 "Large aligned writes into fresh blobs (bytes)", NULL
, 0, unit_t(BYTES
));
4162 b
.add_u64_counter(l_bluestore_write_big_blobs
, "bluestore_write_big_blobs",
4163 "Large aligned writes into fresh blobs (blobs)");
4164 b
.add_u64_counter(l_bluestore_write_small
, "bluestore_write_small",
4165 "Small writes into existing or sparse small blobs");
4166 b
.add_u64_counter(l_bluestore_write_small_bytes
, "bluestore_write_small_bytes",
4167 "Small writes into existing or sparse small blobs (bytes)", NULL
, 0, unit_t(BYTES
));
4168 b
.add_u64_counter(l_bluestore_write_small_unused
,
4169 "bluestore_write_small_unused",
4170 "Small writes into unused portion of existing blob");
4171 b
.add_u64_counter(l_bluestore_write_small_deferred
,
4172 "bluestore_write_small_deferred",
4173 "Small overwrites using deferred");
4174 b
.add_u64_counter(l_bluestore_write_small_pre_read
,
4175 "bluestore_write_small_pre_read",
4176 "Small writes that required we read some data (possibly "
4177 "cached) to fill out the block");
4178 b
.add_u64_counter(l_bluestore_write_small_new
, "bluestore_write_small_new",
4179 "Small write into new (sparse) blob");
4181 b
.add_u64_counter(l_bluestore_txc
, "bluestore_txc", "Transactions committed");
4182 b
.add_u64_counter(l_bluestore_onode_reshard
, "bluestore_onode_reshard",
4183 "Onode extent map reshard events");
4184 b
.add_u64_counter(l_bluestore_blob_split
, "bluestore_blob_split",
4185 "Sum for blob splitting due to resharding");
4186 b
.add_u64_counter(l_bluestore_extent_compress
, "bluestore_extent_compress",
4187 "Sum for extents that have been removed due to compression");
4188 b
.add_u64_counter(l_bluestore_gc_merged
, "bluestore_gc_merged",
4189 "Sum for extents that have been merged due to garbage "
4191 b
.add_u64_counter(l_bluestore_read_eio
, "bluestore_read_eio",
4192 "Read EIO errors propagated to high level callers");
4193 b
.add_u64_counter(l_bluestore_reads_with_retries
, "bluestore_reads_with_retries",
4194 "Read operations that required at least one retry due to failed checksum validation");
4195 b
.add_u64(l_bluestore_fragmentation
, "bluestore_fragmentation_micros",
4196 "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
4197 logger
= b
.create_perf_counters();
4198 cct
->get_perfcounters_collection()->add(logger
);
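
// Illustrative note (not from the original source): once registered, these
// counters are exposed through the OSD admin socket, e.g.
//
//   ceph daemon osd.0 perf dump
//
// under the "bluestore" section.  The gauge-style values above (onodes,
// extents, blobs, buffers) are refreshed periodically from the cache shards
// by _update_cache_logger() further down in this file.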
int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;

  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.stored);
    logger->set(l_bluestore_compressed, store_statfs.compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}
int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
				     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}
int BlueStore::_open_path()
{
  // sanity check(s)
  if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
      4*1024*1024*1024ull) {
    derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has hard limit of 4GB." << dendl;
    return -EINVAL;
  }
  assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
	 << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}
int BlueStore::_write_bdev_label(CephContext *cct,
				 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  ::encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  ::encode(crc, bl);
  assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
	 << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
	 << ": " << cpp_strerror(r) << dendl;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
	 << ": " << cpp_strerror(r) << dendl;
  }
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}
int BlueStore::_read_bdev_label(CephContext* cct, string path,
				bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
	 << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
	 << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  bufferlist::iterator p = bl.begin();
  try {
    ::decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    ::decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
	    << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
	 << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
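
// Illustrative sketch (not from the original source) of the on-disk label
// format the two helpers above agree on; the block size comes from this
// file's BDEV_LABEL_BLOCK_SIZE:
//
//   byte 0                                          4096
//   +---------------------------+--------+----------+
//   | encoded                   | crc32c | zero     |
//   | bluestore_bdev_label_t    | (u32)  | padding  |
//   +---------------------------+--------+----------+
//
// _write_bdev_label() computes the crc over the encoded label only, appends
// it, and zero-pads to BDEV_LABEL_BLOCK_SIZE; _read_bdev_label() re-computes
// the crc over the same prefix (substr_of up to the decode offset) and
// compares it with the stored value.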
int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	       << " and fsid " << fsid << " check bypassed" << dendl;
    }
    else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
	   << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}
void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    if (bdev->is_rotational()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    if (bdev->is_rotational()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
	   << std::dec << " order " << min_alloc_size_order
	   << " max_alloc_size 0x" << std::hex << max_alloc_size
	   << " prefer_deferred_size 0x" << prefer_deferred_size
	   << std::dec
	   << " deferred_batch_ops " << deferred_batch_ops
	   << dendl;
}
int BlueStore::_open_bdev(bool create)
{
  assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  assert(block_size == 1u << block_size_order);
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}

void BlueStore::_close_bdev()
{
  assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}
int BlueStore::_open_fm(bool create)
{
  assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);

  if (create) {
    // initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    KeyValueDB::Transaction t = db->get_transaction();
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    // being able to allocate in units less than bdev block size
    // seems to be a bad idea.
    assert(cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
    fm->create(bdev->get_size(), (int64_t)min_alloc_size, t);

    // allocate superblock reserved space.  note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs_extents.
    uint64_t reserved = ROUND_UP_TO(MAX(SUPER_RESERVED, min_alloc_size),
				    min_alloc_size);
    fm->allocate(0, reserved, t);

    if (cct->_conf->bluestore_bluefs) {
      assert(bluefs_extents.num_intervals() == 1);
      interval_set<uint64_t>::iterator p = bluefs_extents.begin();
      reserved = ROUND_UP_TO(p.get_start() + p.get_len(), min_alloc_size);
      dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
	       << " for bluefs" << dendl;
      bufferlist bl;
      ::encode(bluefs_extents, bl);
      t->set(PREFIX_SUPER, "bluefs_extents", bl);
      dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
	       << std::dec << dendl;
    }

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
	      << cct->_conf->bluestore_debug_prefill << " with max free extent "
	      << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
	uint64_t l = (rand() % max_b + 1) * min_alloc_size;
	if (start + l > end) {
	  l = end - start;
	  l = P2ALIGN(l, min_alloc_size);
	}
	assert(start + l <= end);

	uint64_t u = 1 + (uint64_t)(r * (double)l);
	u = P2ROUNDUP(u, min_alloc_size);
	if (start + l + u > end) {
	  u = end - (start + l);
	  // trim to align so we don't overflow again
	  u = P2ALIGN(u, min_alloc_size);
	  stop = true;
	}
	assert(start + l + u <= end);

	dout(20) << "  free 0x" << std::hex << start << "~" << l
		 << " use 0x" << u << std::dec << dendl;

	// break if u has been trimmed to nothing
	if (!u)
	  break;

	fm->allocate(start + l, u, t);
	start += l + u;
      }
    }
    db->submit_transaction_sync(t);
  }

  int r = fm->init(bdev->get_size());
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  return 0;
}

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}
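
// Worked example for the debug prefill loop above (hypothetical numbers,
// for illustration only): with bluestore_debug_prefill = 0.2 the loop
// computes r = 0.2 / (1 - 0.2) = 0.25, so for every free run of length l it
// marks roughly u = 0.25 * l as allocated; the used fraction of each
// (l + u) window is then u / (l + u) = 0.2, matching the requested prefill
// ratio, with extent sizes randomized up to
// bluestore_debug_prefragment_max.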
int BlueStore::_open_alloc()
{
  assert(alloc == NULL);
  assert(bdev->get_size());
  alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
			    bdev->get_size(),
			    min_alloc_size);
  if (!alloc) {
    lderr(cct) << __func__ << " Allocator::unknown alloc type "
	       << cct->_conf->bluestore_allocator
	       << dendl;
    return -EINVAL;
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(&offset, &length)) {
    alloc->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();
  dout(1) << __func__ << " loaded " << byte_u_t(bytes)
	  << " in " << num << " extents"
	  << dendl;

  // also mark bluefs space as allocated
  for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
    alloc->init_rm_free(e.get_start(), e.get_len());
  }
  dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
	   << bluefs_extents << std::dec << " as allocated" << dendl;

  return 0;
}

void BlueStore::_close_alloc()
{
  assert(alloc);
  alloc->shutdown();
  delete alloc;
  alloc = NULL;
}
int BlueStore::_open_fsid(bool create)
{
  assert(fsid_fd < 0);
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}
int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}
int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}
int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
	 << " (is another ceph-osd still running?)"
	 << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
	    << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}
bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist).  only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}
int BlueStore::_open_db(bool create)
{
  int r;
  assert(!db);
  string fn = path + "/db";
  string options;
  stringstream err;
  ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  if (create) {
    do_bluefs = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      do_bluefs = true;
    } else if (s == "0") {
      do_bluefs = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
	   << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;

  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }
    bluefs = new BlueFS(cct);

    string bfn;
    struct stat st;

    bfn = path + "/block.db";
    if (::stat(bfn.c_str(), &st) == 0) {
      r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
      if (r < 0) {
	derr << __func__ << " add block device(" << bfn << ") returned: "
	     << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }

      if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
	r = _check_or_set_bdev_label(
	  bfn,
	  bluefs->get_block_device_size(BlueFS::BDEV_DB),
	  "bluefs db", create);
	if (r < 0) {
	  derr << __func__
	       << " check block device(" << bfn << ") label returned: "
	       << cpp_strerror(r) << dendl;
	  goto free_bluefs;
	}
      }
      if (create) {
	bluefs->add_block_extent(
	  BlueFS::BDEV_DB,
	  SUPER_RESERVED,
	  bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
      }
      bluefs_shared_bdev = BlueFS::BDEV_SLOW;
      bluefs_single_shared_device = false;
    } else {
      r = -errno;
      if (::lstat(bfn.c_str(), &st) == -1) {
	r = 0;
	bluefs_shared_bdev = BlueFS::BDEV_DB;
      } else {
	derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	     << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }

    // shared device
    bfn = path + "/block";
    r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
	   << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
    if (create) {
      // note: we always leave the first SUPER_RESERVED (8k) of the device unused
      uint64_t initial =
	bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
			    cct->_conf->bluestore_bluefs_gift_ratio);
      initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
      if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
	derr << __func__ << " bluefs_alloc_size 0x" << std::hex
	     << cct->_conf->bluefs_alloc_size << " is not a multiple of "
	     << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
	r = -EINVAL;
	goto free_bluefs;
      }
      // align to bluefs's alloc_size
      initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
      // put bluefs in the middle of the device in case it is an HDD
      uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
			       cct->_conf->bluefs_alloc_size);
      bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
      bluefs_extents.insert(start, initial);
    }

    bfn = path + "/block.wal";
    if (::stat(bfn.c_str(), &st) == 0) {
      r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
      if (r < 0) {
	derr << __func__ << " add block device(" << bfn << ") returned: "
	     << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }

      if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
	r = _check_or_set_bdev_label(
	  bfn,
	  bluefs->get_block_device_size(BlueFS::BDEV_WAL),
	  "bluefs wal", create);
	if (r < 0) {
	  derr << __func__ << " check block device(" << bfn
	       << ") label returned: " << cpp_strerror(r) << dendl;
	  goto free_bluefs;
	}
      }

      if (create) {
	bluefs->add_block_extent(
	  BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
	  bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
	  BDEV_LABEL_BLOCK_SIZE);
      }
      cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
      bluefs_single_shared_device = false;
    } else {
      r = -errno;
      if (::lstat(bfn.c_str(), &st) == -1) {
	r = 0;
	cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
      } else {
	derr << __func__ << " " << bfn << " symlink exists but target unusable: "
	     << cpp_strerror(r) << dendl;
	goto free_bluefs;
      }
    }

    if (create) {
      bluefs->mkfs(fsid);
    }
    r = bluefs->mount();
    if (r < 0) {
      derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env *a = new BlueRocksEnv(bluefs);
      rocksdb::Env *b = rocksdb::Env::Default();
      if (create) {
	string cmd = "rm -rf " + path + "/db " +
	  path + "/db.slow " +
	  path + "/db.wal";
	int r = system(cmd.c_str());
	(void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }

    if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
      // we have both block.db and block; tell rocksdb!
      // note: the second (last) size value doesn't really matter
      ostringstream db_paths;
      uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
      uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
      db_paths << fn << ","
	       << (uint64_t)(db_size * 95 / 100) << " "
	       << fn + ".slow" << ","
	       << (uint64_t)(slow_size * 95 / 100);
      cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
      dout(10) << __func__ << " set rocksdb_db_paths to "
	       << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
    }

    if (create) {
      env->CreateDir(fn);
      if (cct->_conf->rocksdb_separate_wal_dir)
	env->CreateDir(fn + ".wal");
      if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
	env->CreateDir(fn + ".slow");
    }
  } else if (create) {
    int r = ::mkdir(fn.c_str(), 0755);
    if (r < 0)
      r = -errno;
    if (r < 0 && r != -EEXIST) {
      derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    // wal_dir, too!
    if (cct->_conf->rocksdb_separate_wal_dir) {
      string walfn = path + "/db.wal";
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << walfn
	     << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }
    }
  }

  db = KeyValueDB::create(cct,
			  kv_backend,
			  fn,
			  static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      bluefs->umount();
      delete bluefs;
      bluefs = NULL;
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);

  if (kv_backend == "rocksdb")
    options = cct->_conf->bluestore_rocksdb_options;
  db->init(options);
  if (create)
    r = db->create_and_open(err);
  else
    r = db->open(err);
  if (r) {
    derr << __func__ << " erroring opening db: " << err.str() << dendl;
    _close_db();
    return -EIO;
  }
  dout(1) << __func__ << " opened " << kv_backend
	  << " path " << fn << " options " << options << dendl;
  return 0;

 free_bluefs:
  assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}
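
// Illustrative example (hypothetical device sizes): with a 10 GB block.db
// and a 100 GB shared main device, the rocksdb_db_paths string built above
// would come out roughly as
//
//   db,10200547328 db.slow,102005473280
//
// i.e. each path is capped at 95% of its device so rocksdb spills from the
// fast DB device to the slow (shared) device instead of filling it up.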
void BlueStore::_close_db()
{
  assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    bluefs->umount();
    delete bluefs;
    bluefs = NULL;
  }
}
int BlueStore::_reconcile_bluefs_freespace()
{
  dout(10) << __func__ << dendl;
  interval_set<uint64_t> bset;
  int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
  assert(r == 0);
  if (bset == bluefs_extents) {
    dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
	     << std::dec << dendl;
    return 0;
  }
  dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
	   << dendl;
  dout(10) << __func__ << " super says  0x" << std::hex << bluefs_extents
	   << std::dec << dendl;

  interval_set<uint64_t> overlap;
  overlap.intersection_of(bset, bluefs_extents);

  bset.subtract(overlap);
  if (!bset.empty()) {
    derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
	 << dendl;
    return -EIO;
  }

  interval_set<uint64_t> super_extra;
  super_extra = bluefs_extents;
  super_extra.subtract(overlap);
  if (!super_extra.empty()) {
    // This is normal: it can happen if we commit to give extents to
    // bluefs and we crash before bluefs commits that it owns them.
    dout(10) << __func__ << " super extra " << super_extra << dendl;
    for (interval_set<uint64_t>::iterator p = super_extra.begin();
	 p != super_extra.end();
	 ++p) {
      bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
    }
  }

  return 0;
}
void BlueStore::_dump_alloc_on_rebalance_failure()
{
  auto dump_interval =
    cct->_conf->bluestore_bluefs_balance_failure_dump_interval;
  if (dump_interval > 0 &&
      next_dump_on_bluefs_balance_failure <= ceph_clock_now()) {
    alloc->dump();
    next_dump_on_bluefs_balance_failure = ceph_clock_now();
    next_dump_on_bluefs_balance_failure += dump_interval;
  }
}
*extents
)
5102 vector
<pair
<uint64_t,uint64_t>> bluefs_usage
; // <free, total> ...
5103 bluefs
->get_usage(&bluefs_usage
);
5104 assert(bluefs_usage
.size() > bluefs_shared_bdev
);
5106 // fixme: look at primary bdev only for now
5107 uint64_t bluefs_free
= bluefs_usage
[bluefs_shared_bdev
].first
;
5108 uint64_t bluefs_total
= bluefs_usage
[bluefs_shared_bdev
].second
;
5109 float bluefs_free_ratio
= (float)bluefs_free
/ (float)bluefs_total
;
5111 uint64_t my_free
= alloc
->get_free();
5112 uint64_t total
= bdev
->get_size();
5113 float my_free_ratio
= (float)my_free
/ (float)total
;
5115 uint64_t total_free
= bluefs_free
+ my_free
;
5117 float bluefs_ratio
= (float)bluefs_free
/ (float)total_free
;
5119 dout(10) << __func__
5120 << " bluefs " << byte_u_t(bluefs_free
)
5121 << " free (" << bluefs_free_ratio
5122 << ") bluestore " << byte_u_t(my_free
)
5123 << " free (" << my_free_ratio
5124 << "), bluefs_ratio " << bluefs_ratio
5128 uint64_t reclaim
= 0;
5129 if (bluefs_ratio
< cct
->_conf
->bluestore_bluefs_min_ratio
) {
5130 gift
= cct
->_conf
->bluestore_bluefs_gift_ratio
* total_free
;
5131 dout(10) << __func__
<< " bluefs_ratio " << bluefs_ratio
5132 << " < min_ratio " << cct
->_conf
->bluestore_bluefs_min_ratio
5133 << ", should gift " << byte_u_t(gift
) << dendl
;
5134 } else if (bluefs_ratio
> cct
->_conf
->bluestore_bluefs_max_ratio
) {
5135 reclaim
= cct
->_conf
->bluestore_bluefs_reclaim_ratio
* total_free
;
5136 if (bluefs_total
- reclaim
< cct
->_conf
->bluestore_bluefs_min
)
5137 reclaim
= bluefs_total
- cct
->_conf
->bluestore_bluefs_min
;
5138 dout(10) << __func__
<< " bluefs_ratio " << bluefs_ratio
5139 << " > max_ratio " << cct
->_conf
->bluestore_bluefs_max_ratio
5140 << ", should reclaim " << byte_u_t(reclaim
) << dendl
;
5143 // don't take over too much of the freespace
5144 uint64_t free_cap
= cct
->_conf
->bluestore_bluefs_max_ratio
* total_free
;
5145 if (bluefs_total
< cct
->_conf
->bluestore_bluefs_min
&&
5146 cct
->_conf
->bluestore_bluefs_min
< free_cap
) {
5147 uint64_t g
= cct
->_conf
->bluestore_bluefs_min
- bluefs_total
;
5148 dout(10) << __func__
<< " bluefs_total " << bluefs_total
5149 << " < min " << cct
->_conf
->bluestore_bluefs_min
5150 << ", should gift " << byte_u_t(g
) << dendl
;
5155 uint64_t min_free
= cct
->_conf
->get_val
<uint64_t>("bluestore_bluefs_min_free");
5156 if (bluefs_free
< min_free
&&
5157 min_free
< free_cap
) {
5158 uint64_t g
= min_free
- bluefs_free
;
5159 dout(10) << __func__
<< " bluefs_free " << bluefs_total
5160 << " < min " << min_free
5161 << ", should gift " << byte_u_t(g
) << dendl
;
5168 // round up to alloc size
5169 gift
= P2ROUNDUP(gift
, cct
->_conf
->bluefs_alloc_size
);
5171 // hard cap to fit into 32 bits
5172 gift
= MIN(gift
, 1ull<<31);
5173 dout(10) << __func__
<< " gifting " << gift
5174 << " (" << byte_u_t(gift
) << ")" << dendl
;
5176 int64_t alloc_len
= alloc
->allocate(gift
, cct
->_conf
->bluefs_alloc_size
,
5179 if (alloc_len
<= 0) {
5180 dout(0) << __func__
<< " no allocate on 0x" << std::hex
<< gift
5181 << " min_alloc_size 0x" << min_alloc_size
<< std::dec
<< dendl
;
5182 _dump_alloc_on_rebalance_failure();
5184 } else if (alloc_len
< (int64_t)gift
) {
5185 dout(0) << __func__
<< " insufficient allocate on 0x" << std::hex
<< gift
5186 << " min_alloc_size 0x" << min_alloc_size
5187 << " allocated 0x" << alloc_len
5188 << std::dec
<< dendl
;
5189 _dump_alloc_on_rebalance_failure();
5191 for (auto& e
: *extents
) {
5192 dout(1) << __func__
<< " gifting " << e
<< " to bluefs" << dendl
;
5199 // reclaim from bluefs?
5201 // round up to alloc size
5202 reclaim
= P2ROUNDUP(reclaim
, cct
->_conf
->bluefs_alloc_size
);
5204 // hard cap to fit into 32 bits
5205 reclaim
= MIN(reclaim
, 1ull<<31);
5206 dout(10) << __func__
<< " reclaiming " << reclaim
5207 << " (" << byte_u_t(reclaim
) << ")" << dendl
;
5209 while (reclaim
> 0) {
5210 // NOTE: this will block and do IO.
5211 PExtentVector extents
;
5212 int r
= bluefs
->reclaim_blocks(bluefs_shared_bdev
, reclaim
,
5215 derr
<< __func__
<< " failed to reclaim space from bluefs"
5219 for (auto e
: extents
) {
5220 bluefs_extents
.erase(e
.offset
, e
.length
);
5221 bluefs_extents_reclaiming
.insert(e
.offset
, e
.length
);
5222 reclaim
-= e
.length
;
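
// Worked example for the balance logic above (hypothetical config values,
// for illustration only): suppose total_free = 1 TB and bluefs_free =
// 10 GB, so bluefs_ratio ~= 0.0098.  With bluestore_bluefs_min_ratio =
// 0.02 that is too low, and the gift is gift_ratio * total_free (e.g.
// 0.02 * 1 TB ~= 20 GB), rounded up to bluefs_alloc_size and hard-capped at
// 2 GB (1ull << 31) per pass.  The reverse path reclaims reclaim_ratio *
// total_free once bluefs_ratio exceeds bluestore_bluefs_max_ratio, but
// never shrinks bluefs below bluestore_bluefs_min.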
void BlueStore::_commit_bluefs_freespace(
  const PExtentVector& bluefs_gift_extents)
{
  dout(10) << __func__ << dendl;
  for (auto& p : bluefs_gift_extents) {
    bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
  }
}
int BlueStore::_open_collections(int *errors)
{
  dout(10) << __func__ << dendl;
  assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      CollectionRef c(
	new Collection(
	  this,
	  cache_shards[cid.hash_to_shard(cache_shards.size())],
	  cid));
      bufferlist bl = it->value();
      bufferlist::iterator p = bl.begin();
      try {
	::decode(c->cnode, p);
      } catch (buffer::error& e) {
	derr << __func__ << " failed to decode cnode, key:"
	     << pretty_binary_string(it->key()) << dendl;
	return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c
	       << " " << c->cnode << dendl;
      coll_map[cid] = c;
    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      if (errors)
	(*errors)++;
    }
  }
  return 0;
}
void BlueStore::_open_statfs()
{
  bufferlist bl;
  int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
  if (r >= 0) {
    if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
      auto it = bl.begin();
      vstatfs.decode(it);
    } else {
      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
    }
  }
  else {
    dout(10) << __func__ << " store_statfs missed, using empty" << dendl;
  }
}
int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
	   << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR|O_CLOEXEC;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
	   << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
	r = -errno;
	derr << __func__ << " failed to open " << epath << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
      string serial_number = epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, serial_number.c_str(), serial_number.size());
      assert(r == (int)serial_number.size());
      dout(1) << __func__ << " created " << name << " symlink to "
	      << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (size) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
	  S_ISREG(st.st_mode) &&   // if it is a regular file
	  st.st_size == 0) {       // and is 0 bytes
	r = ::ftruncate(fd, size);
	if (r < 0) {
	  r = -errno;
	  derr << __func__ << " failed to resize " << name << " file to "
	       << size << ": " << cpp_strerror(r) << dendl;
	  VOID_TEMP_FAILURE_RETRY(::close(fd));
	  return r;
	}

	if (cct->_conf->bluestore_block_preallocate_file) {
	  r = ::ceph_posix_fallocate(fd, 0, size);
	  if (r > 0) {
	    derr << __func__ << " failed to prefallocate " << name << " file to "
		 << size << ": " << cpp_strerror(r) << dendl;
	    VOID_TEMP_FAILURE_RETRY(::close(fd));
	    return -r;
	  }
	}
	dout(1) << __func__ << " resized " << name << " file to "
		<< byte_u_t(size) << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
	derr << __func__ << " failed to open " << name << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
    }
  }
  return 0;
}
int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
	r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
	if (r < 0) {
	  derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
	       << dendl;
	  return r;
	}
	if (r > 0) {
	  derr << __func__ << " fsck found " << r << " errors" << dendl;
	  r = -EIO;
	}
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
	derr << __func__ << " expected bluestore, but type is " << type << dendl;
	return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
	return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
	   << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
				   cct->_conf->bluestore_block_size,
				   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
				     cct->_conf->bluestore_block_wal_size,
				     cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
				     cct->_conf->bluestore_block_db_size,
				     cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  // choose min_alloc_size
  if (cct->_conf->bluestore_min_alloc_size) {
    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
  } else {
    if (bdev->is_rotational()) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
    } else {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
    }
  }

  // make sure min_alloc_size is power of 2 aligned.
  if (!ISP2(min_alloc_size)) {
    derr << __func__ << " min_alloc_size 0x"
	 << std::hex << min_alloc_size << std::dec
	 << " is not power of 2 aligned!"
	 << dendl;
    r = -EINVAL;
    goto out_close_bdev;
  }

  r = _open_db(true);
  if (r < 0)
    goto out_close_bdev;

  r = _open_fm(true);
  if (r < 0)
    goto out_close_db;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    {
      bufferlist bl;
      ::encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }
    {
      bufferlist bl;
      ::encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }
    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_fm;

  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
  if (r < 0)
    goto out_close_fm;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_fm;
    }
  }

 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db();
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }

  if (r == 0) {
    // indicate success by writing the 'mkfs_done' file
    r = write_meta("mkfs_done", "yes");
  }

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }
  return r;
}
void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t old = cache_shards.size();
  assert(num >= old);
  cache_shards.resize(num);
  for (unsigned i = old; i < num; ++i) {
    cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
				    logger);
  }
}
int BlueStore::_mount(bool kv_only)
{
  dout(1) << __func__ << " path " << path << dendl;

  _kv_only = kv_only;

  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  r = _open_db(false);
  if (r < 0)
    goto out_bdev;

  if (kv_only)
    return 0;

  r = _open_super_meta();
  if (r < 0)
    goto out_db;

  r = _open_fm(false);
  if (r < 0)
    goto out_db;

  r = _open_alloc();
  if (r < 0)
    goto out_fm;

  r = _open_collections();
  if (r < 0)
    goto out_alloc;

  r = _reload_logger();
  if (r < 0)
    goto out_coll;

  if (bluefs) {
    r = _reconcile_bluefs_freespace();
    if (r < 0)
      goto out_coll;
  }

  _kv_start();

  r = _deferred_replay();
  if (r < 0)
    goto out_stop;

  mempool_thread.init();

  mounted = true;
  return 0;

 out_stop:
  _kv_stop();
 out_coll:
  _flush_cache();
 out_alloc:
  _close_alloc();
 out_fm:
  _close_fm();
 out_db:
  _close_db();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
int BlueStore::umount()
{
  assert(_kv_only || mounted);
  dout(1) << __func__ << dendl;

  _osr_drain_all();
  _osr_unregister_all();

  mounted = false;
  if (!_kv_only) {
    mempool_thread.shutdown();
    dout(20) << __func__ << " stopping kv thread" << dendl;
    _kv_stop();
    _reap_collections();
    _flush_cache();
    dout(20) << __func__ << " closing" << dendl;

    _close_alloc();
    _close_fm();
  }
  _close_db();
  _close_bdev();
  _close_fsid();
  _close_path();

  if (cct->_conf->bluestore_fsck_on_umount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }
  return 0;
}
static void apply(uint64_t off,
		  uint64_t len,
		  uint64_t granularity,
		  BlueStore::mempool_dynamic_bitset &bitset,
		  std::function<void(uint64_t,
				     BlueStore::mempool_dynamic_bitset &)> f) {
  auto end = ROUND_UP_TO(off + len, granularity);
  while (off < end) {
    uint64_t pos = off / granularity;
    f(pos, bitset);
    off += granularity;
  }
}
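
// Minimal usage sketch for apply() above (illustrative only): mark a 1 MB
// range as used in a bitmap with a 64 KB allocation unit; the callback is
// invoked once per position in [off/64K, roundup(off+len, 64K)/64K):
//
//   apply(off, 1048576, 65536, used_blocks,
//         [&](uint64_t pos, BlueStore::mempool_dynamic_bitset &bs) {
//           bs.set(pos);
//         });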
int BlueStore::_fsck_check_extents(
  const ghobject_t& oid,
  const PExtentVector& extents,
  bool compressed,
  mempool_dynamic_bitset &used_blocks,
  uint64_t granularity,
  store_statfs_t& expected_statfs)
{
  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
  int errors = 0;
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.compressed_allocated += e.length;
    }
    bool already = false;
    apply(
      e.offset, e.length, granularity, used_blocks,
      [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	assert(pos < bs.size());
	if (bs.test(pos))
	  already = true;
	else
	  bs.set(pos);
      });
    if (already) {
      derr << " " << oid << " extent " << e
	   << " or a subset is already allocated" << dendl;
      ++errors;
    }
    if (e.end() > bdev->get_size()) {
      derr << " " << oid << " extent " << e
	   << " past end of block device" << dendl;
      ++errors;
    }
  }
  return errors;
}
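
// Note (added commentary): _fsck_check_extents() doubles as the
// allocation-consistency pass of fsck.  The first reference to an
// allocation unit sets its bit in used_blocks; any later reference that
// finds the bit already set is reported as a (possibly partial) double
// allocation, and the same bitmap is compared against the freelist at the
// end of _fsck() to detect leaked extents.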
int BlueStore::_fsck(bool deep, bool repair)
{
  dout(1) << __func__
	  << (repair ? " repair" : " fsck")
	  << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
  int errors = 0;
  int repaired = 0;

  typedef btree::btree_set<
    uint64_t,std::less<uint64_t>,
    mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
  uint64_t_btree_t used_nids;
  uint64_t_btree_t used_omap_head;
  uint64_t_btree_t used_sbids;

  mempool_dynamic_bitset used_blocks;
  KeyValueDB::Iterator it;
  store_statfs_t expected_statfs, actual_statfs;
  struct sb_info_t {
    list<ghobject_t> oids;
    SharedBlobRef sb;
    bluestore_extent_ref_map_t ref_map;
    bool compressed = false;
  };
  mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;

  uint64_t num_objects = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_spanning_blobs = 0;
  uint64_t num_shared_blobs = 0;
  uint64_t num_sharded_objects = 0;
  uint64_t num_object_shards = 0;

  utime_t start = ceph_clock_now();

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  r = _open_db(false);
  if (r < 0)
    goto out_bdev;

  r = _open_super_meta();
  if (r < 0)
    goto out_db;

  r = _open_fm(false);
  if (r < 0)
    goto out_db;

  r = _open_alloc();
  if (r < 0)
    goto out_fm;

  r = _open_collections(&errors);
  if (r < 0)
    goto out_alloc;

  mempool_thread.init();

  // we need finishers and kv_{sync,finalize}_thread *just* for replay
  _kv_start();
  r = _deferred_replay();
  _kv_stop();
  if (r < 0)
    goto out_scan;

  used_blocks.resize(fm->get_alloc_units());
  apply(
    0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
      assert(pos < bs.size());
      bs.set(pos);
    });

  if (bluefs) {
    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply(
	e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  assert(pos < bs.size());
	  bs.set(pos);
	});
    }
    r = bluefs->fsck();
    if (r < 0) {
      goto out_scan;
    }
    if (r > 0)
      errors += r;
  }

  // get expected statfs; fill unaffected fields to be able to compare
  // deltas only
  statfs(&actual_statfs);
  expected_statfs.total = actual_statfs.total;
  expected_statfs.available = actual_statfs.available;

  // walk PREFIX_OBJ
  dout(1) << __func__ << " walking object keyspace" << dendl;
  it = db->get_iterator(PREFIX_OBJ);
  if (it) {
    CollectionRef c;
    spg_t pgid;
    mempool::bluestore_fsck::list<string> expecting_shards;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      if (g_conf->bluestore_debug_fsck_abort) {
	goto out_scan;
      }
      dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
      if (is_extent_shard_key(it->key())) {
	while (!expecting_shards.empty() &&
	       expecting_shards.front() < it->key()) {
	  derr << "fsck error: missing shard key "
	       << pretty_binary_string(expecting_shards.front())
	       << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	}
	if (!expecting_shards.empty() &&
	    expecting_shards.front() == it->key()) {
	  // all good
	  expecting_shards.pop_front();
	  continue;
	}

	uint32_t offset;
	string okey;
	get_key_extent_shard(it->key(), &okey, &offset);
	derr << "fsck error: stray shard 0x" << std::hex << offset
	     << std::dec << dendl;
	if (expecting_shards.empty()) {
	  derr << "fsck error: " << pretty_binary_string(it->key())
	       << " is unexpected" << dendl;
	  ++errors;
	  continue;
	}
	while (expecting_shards.front() > it->key()) {
	  derr << "fsck error:   saw " << pretty_binary_string(it->key())
	       << dendl;
	  derr << "fsck error:   exp "
	       << pretty_binary_string(expecting_shards.front()) << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	  if (expecting_shards.empty()) {
	    break;
	  }
	}
	continue;
      }

      ghobject_t oid;
      int r = get_key_object(it->key(), &oid);
      if (r < 0) {
	derr << "fsck error: bad object key "
	     << pretty_binary_string(it->key()) << dendl;
	++errors;
	continue;
      }
      if (!c ||
	  oid.shard_id != pgid.shard ||
	  oid.hobj.pool != (int64_t)pgid.pool() ||
	  !c->contains(oid)) {
	c = nullptr;
	for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
	       coll_map.begin();
	     p != coll_map.end();
	     ++p) {
	  if (p->second->contains(oid)) {
	    c = p->second;
	    break;
	  }
	}
	if (!c) {
	  derr << "fsck error: stray object " << oid
	       << " not owned by any collection" << dendl;
	  ++errors;
	  continue;
	}
	c->cid.is_pg(&pgid);
	dout(20) << __func__ << "  collection " << c->cid << " " << c->cnode
		 << dendl;
      }

      if (!expecting_shards.empty()) {
	for (auto &k : expecting_shards) {
	  derr << "fsck error: missing shard key "
	       << pretty_binary_string(k) << dendl;
	}
	++errors;
	expecting_shards.clear();
      }

      dout(10) << __func__ << "  " << oid << dendl;
      RWLock::RLocker l(c->lock);
      OnodeRef o = c->get_onode(oid, false);
      if (o->onode.nid) {
	if (o->onode.nid > nid_max) {
	  derr << "fsck error: " << oid << " nid " << o->onode.nid
	       << " > nid_max " << nid_max << dendl;
	  ++errors;
	}
	if (used_nids.count(o->onode.nid)) {
	  derr << "fsck error: " << oid << " nid " << o->onode.nid
	       << " already in use" << dendl;
	  ++errors;
	  continue; // go for next object
	}
	used_nids.insert(o->onode.nid);
      }
      ++num_objects;
      num_spanning_blobs += o->extent_map.spanning_blob_map.size();
      o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
      // shards
      if (!o->extent_map.shards.empty()) {
	++num_sharded_objects;
	num_object_shards += o->extent_map.shards.size();
      }
      for (auto& s : o->extent_map.shards) {
	dout(20) << __func__ << "    shard " << *s.shard_info << dendl;
	expecting_shards.push_back(string());
	get_extent_shard_key(o->key, s.shard_info->offset,
			     &expecting_shards.back());
	if (s.shard_info->offset >= o->onode.size) {
	  derr << "fsck error: " << oid << " shard 0x" << std::hex
	       << s.shard_info->offset << " past EOF at 0x" << o->onode.size
	       << std::dec << dendl;
	  ++errors;
	}
      }
      // lextents
      map<BlobRef,bluestore_blob_t::unused_t> referenced;
      uint64_t pos = 0;
      mempool::bluestore_fsck::map<BlobRef,
				   bluestore_blob_use_tracker_t> ref_map;
      for (auto& l : o->extent_map.extent_map) {
	dout(20) << __func__ << "    " << l << dendl;
	if (l.logical_offset < pos) {
	  derr << "fsck error: " << oid << " lextent at 0x"
	       << std::hex << l.logical_offset
	       << " overlaps with the previous, which ends at 0x" << pos
	       << std::dec << dendl;
	  ++errors;
	}
	if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
	  derr << "fsck error: " << oid << " lextent at 0x"
	       << std::hex << l.logical_offset << "~" << l.length
	       << " spans a shard boundary"
	       << std::dec << dendl;
	  ++errors;
	}
	pos = l.logical_offset + l.length;
	expected_statfs.stored += l.length;
	const bluestore_blob_t& blob = l.blob->get_blob();

	auto& ref = ref_map[l.blob];
	if (ref.is_empty()) {
	  uint32_t min_release_size = blob.get_release_size(min_alloc_size);
	  uint32_t l = blob.get_logical_length();
	  ref.init(l, min_release_size);
	}
	ref.get(l.blob_offset, l.length);
	++num_extents;
	if (blob.has_unused()) {
	  auto p = referenced.find(l.blob);
	  bluestore_blob_t::unused_t *pu;
	  if (p == referenced.end()) {
	    pu = &referenced[l.blob];
	  } else {
	    pu = &p->second;
	  }
	  uint64_t blob_len = blob.get_logical_length();
	  assert((blob_len % (sizeof(*pu)*8)) == 0);
	  assert(l.blob_offset + l.length <= blob_len);
	  uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
	  uint64_t start = l.blob_offset / chunk_size;
	  uint64_t end =
	    ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
	  for (auto i = start; i < end; ++i) {
	    (*pu) |= (1u << i);
	  }
	}
      }
      for (auto &i : referenced) {
	dout(20) << __func__ << "    referenced 0x" << std::hex << i.second
		 << std::dec << " for " << *i.first << dendl;
	const bluestore_blob_t& blob = i.first->get_blob();
	if (i.second & blob.unused) {
	  derr << "fsck error: " << oid << " blob claims unused 0x"
	       << std::hex << blob.unused
	       << " but extents reference 0x" << i.second
	       << " on blob " << *i.first << dendl;
	  ++errors;
	}
	if (blob.has_csum()) {
	  uint64_t blob_len = blob.get_logical_length();
	  uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
	  unsigned csum_count = blob.get_csum_count();
	  unsigned csum_chunk_size = blob.get_csum_chunk_size();
	  for (unsigned p = 0; p < csum_count; ++p) {
	    unsigned pos = p * csum_chunk_size;
	    unsigned firstbit = pos / unused_chunk_size;    // [firstbit,lastbit]
	    unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
	    unsigned mask = 1u << firstbit;
	    for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
	      mask |= 1u << b;
	    }
	    if ((blob.unused & mask) == mask) {
	      // this csum chunk region is marked unused
	      if (blob.get_csum_item(p) != 0) {
		derr << "fsck error: " << oid
		     << " blob claims csum chunk 0x" << std::hex << pos
		     << "~" << csum_chunk_size
		     << " is unused (mask 0x" << mask << " of unused 0x"
		     << blob.unused << ") but csum is non-zero 0x"
		     << blob.get_csum_item(p) << std::dec << " on blob "
		     << *i.first << dendl;
		++errors;
	      }
	    }
	  }
	}
      }
      for (auto &i : ref_map) {
	++num_blobs;
	const bluestore_blob_t& blob = i.first->get_blob();
	bool equal = i.first->get_blob_use_tracker().equal(i.second);
	if (!equal) {
	  derr << "fsck error: " << oid << " blob " << *i.first
	       << " doesn't match expected ref_map " << i.second << dendl;
	  ++errors;
	}
	if (blob.is_compressed()) {
	  expected_statfs.compressed += blob.get_compressed_payload_length();
	  expected_statfs.compressed_original +=
	    i.first->get_referenced_bytes();
	}
	if (blob.is_shared()) {
	  if (i.first->shared_blob->get_sbid() > blobid_max) {
	    derr << "fsck error: " << oid << " blob " << blob
		 << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
		 << blobid_max << dendl;
	    ++errors;
	  } else if (i.first->shared_blob->get_sbid() == 0) {
	    derr << "fsck error: " << oid << " blob " << blob
		 << " marked as shared but has uninitialized sbid"
		 << dendl;
	    ++errors;
	  }
	  sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
	  sbi.sb = i.first->shared_blob;
	  sbi.oids.push_back(oid);
	  sbi.compressed = blob.is_compressed();
	  for (auto e : blob.get_extents()) {
	    if (e.is_valid()) {
	      sbi.ref_map.get(e.offset, e.length);
	    }
	  }
	} else {
	  errors += _fsck_check_extents(oid, blob.get_extents(),
					blob.is_compressed(),
					used_blocks,
					fm->get_alloc_size(),
					expected_statfs);
	}
      }
      if (deep) {
	bufferlist bl;
	uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
	uint64_t offset = 0;
	do {
	  uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
	  int r = _do_read(c.get(), o, offset, l, bl,
			   CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
	  if (r < 0) {
	    ++errors;
	    derr << "fsck error: " << oid << std::hex
		 << " error during read: "
		 << " " << offset << "~" << l
		 << " " << cpp_strerror(r) << std::dec
		 << dendl;
	    break;
	  }
	  offset += l;
	} while (offset < o->onode.size);
      }
      // omap
      if (o->onode.has_omap()) {
	if (used_omap_head.count(o->onode.nid)) {
	  derr << "fsck error: " << oid << " omap_head " << o->onode.nid
	       << " already in use" << dendl;
	  ++errors;
	} else {
	  used_omap_head.insert(o->onode.nid);
	}
      }
    }
  }
  dout(1) << __func__ << " checking shared_blobs" << dendl;
  it = db->get_iterator(PREFIX_SHARED_BLOB);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      uint64_t sbid;
      if (get_key_shared_blob(key, &sbid)) {
	derr << "fsck error: bad key '" << key
	     << "' in shared blob namespace" << dendl;
	++errors;
	continue;
      }
      auto p = sb_info.find(sbid);
      if (p == sb_info.end()) {
	derr << "fsck error: found stray shared blob data for sbid 0x"
	     << std::hex << sbid << std::dec << dendl;
	++errors;
      } else {
	++num_shared_blobs;
	sb_info_t& sbi = p->second;
	bluestore_shared_blob_t shared_blob(sbid);
	bufferlist bl = it->value();
	bufferlist::iterator blp = bl.begin();
	::decode(shared_blob, blp);
	dout(20) << __func__ << "  " << *sbi.sb << " " << shared_blob << dendl;
	if (shared_blob.ref_map != sbi.ref_map) {
	  derr << "fsck error: shared blob 0x" << std::hex << sbid
	       << std::dec << " ref_map " << shared_blob.ref_map
	       << " != expected " << sbi.ref_map << dendl;
	  ++errors;
	}
	PExtentVector extents;
	for (auto &r : shared_blob.ref_map.ref_map) {
	  extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
	}
	errors += _fsck_check_extents(p->second.oids.front(),
				      extents,
				      p->second.compressed,
				      used_blocks,
				      fm->get_alloc_size(),
				      expected_statfs);
	sb_info.erase(p);
      }
    }
  }
  for (auto &p : sb_info) {
    derr << "fsck error: shared_blob 0x" << p.first
	 << " key is missing (" << *p.second.sb << ")" << dendl;
    ++errors;
  }
  if (!(actual_statfs == expected_statfs)) {
    derr << "fsck error: actual " << actual_statfs
	 << " != expected " << expected_statfs << dendl;
    ++errors;
  }

  dout(1) << __func__ << " checking for stray omap data" << dendl;
  it = db->get_iterator(PREFIX_OMAP);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      uint64_t omap_head;
      _key_decode_u64(it->key().c_str(), &omap_head);
      if (used_omap_head.count(omap_head) == 0) {
	derr << "fsck error: found stray omap data on omap_head "
	     << omap_head << dendl;
	++errors;
      }
    }
  }

  dout(1) << __func__ << " checking deferred events" << dendl;
  it = db->get_iterator(PREFIX_DEFERRED);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      bufferlist bl = it->value();
      bufferlist::iterator p = bl.begin();
      bluestore_deferred_transaction_t wt;
      try {
	::decode(wt, p);
      } catch (buffer::error& e) {
	derr << "fsck error: failed to decode deferred txn "
	     << pretty_binary_string(it->key()) << dendl;
	r = -EIO;
	goto out_scan;
      }
      dout(20) << __func__ << "  deferred " << wt.seq
	       << " ops " << wt.ops.size()
	       << " released 0x" << std::hex << wt.released << std::dec << dendl;
      for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
	apply(
	  e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
	  [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	    assert(pos < bs.size());
	    bs.set(pos);
	  });
      }
    }
  }

  dout(1) << __func__ << " checking freelist vs allocated" << dendl;
  {
    // remove bluefs_extents from used set since the freelist doesn't
    // know they are allocated.
    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply(
	e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  assert(pos < bs.size());
	  bs.reset(pos);
	});
    }
    fm->enumerate_reset();
    uint64_t offset, length;
    while (fm->enumerate_next(&offset, &length)) {
      bool intersects = false;
      apply(
	offset, length, fm->get_alloc_size(), used_blocks,
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  assert(pos < bs.size());
	  if (bs.test(pos)) {
	    intersects = true;
	  } else {
	    bs.set(pos);
	  }
	});
      if (intersects) {
	if (offset == SUPER_RESERVED &&
	    length == min_alloc_size - SUPER_RESERVED) {
	  // this is due to the change just after luminous to min_alloc_size
	  // granularity allocations, and our baked in assumption at the top
	  // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
	  // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)).  harmless,
	  // since we will never allocate this region below min_alloc_size.
	  dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
		   << " and min_alloc_size, 0x" << std::hex << offset << "~"
		   << length << std::dec << dendl;
	} else {
	  derr << "fsck error: free extent 0x" << std::hex << offset
	       << "~" << length << std::dec
	       << " intersects allocated blocks" << dendl;
	  ++errors;
	}
      }
    }
    fm->enumerate_reset();
    size_t count = used_blocks.count();
    if (used_blocks.size() != count) {
      assert(used_blocks.size() > count);
      used_blocks.flip();
      size_t start = used_blocks.find_first();
      while (start != decltype(used_blocks)::npos) {
	size_t cur = start;
	while (true) {
	  size_t next = used_blocks.find_next(cur);
	  if (next != cur + 1) {
	    ++errors;
	    derr << "fsck error: leaked extent 0x" << std::hex
		 << ((uint64_t)start * fm->get_alloc_size()) << "~"
		 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
		 << dendl;
	    start = next;
	    break;
	  }
	  cur = next;
	}
      }
      used_blocks.flip();
    }
  }

 out_scan:
  mempool_thread.shutdown();
  _flush_cache();
 out_alloc:
  _close_alloc();
 out_fm:
  _close_fm();
 out_db:
  it.reset();  // before db is closed
  _close_db();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();

  // fatal errors take precedence
  if (r < 0)
    return r;

  dout(2) << __func__ << " " << num_objects << " objects, "
	  << num_sharded_objects << " of them sharded.  "
	  << dendl;
  dout(2) << __func__ << " " << num_extents << " extents to "
	  << num_blobs << " blobs, "
	  << num_spanning_blobs << " spanning, "
	  << num_shared_blobs << " shared."
	  << dendl;

  utime_t duration = ceph_clock_now() - start;
  dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
	  << " repaired, " << (errors - repaired) << " remaining in "
	  << duration << " seconds" << dendl;
  return errors - repaired;
}
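
// Note on the return convention above (added commentary): _fsck() returns
// errors - repaired, so 0 means the store is clean (or everything found was
// repaired), a positive value is the count of unrepaired fsck errors, and a
// negative value is a fatal environment error (failed open/replay/decode)
// that takes precedence over any error count.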
void BlueStore::collect_metadata(map<string,string> *pm)
{
  dout(10) << __func__ << dendl;
  bdev->collect_metadata("bluestore_bdev_", pm);
  if (bluefs) {
    (*pm)["bluefs"] = "1";
    (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
    bluefs->collect_metadata(pm);
  } else {
    (*pm)["bluefs"] = "0";
  }
}
int BlueStore::statfs(struct store_statfs_t *buf)
{
  buf->reset();
  buf->total = bdev->get_size();
  buf->available = alloc->get_free();

  if (bluefs) {
    // part of our shared device is "free" according to BlueFS, but we
    // can't touch bluestore_bluefs_min of it.
    int64_t shared_available = std::min(
      bluefs->get_free(bluefs_shared_bdev),
      bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
    if (shared_available > 0) {
      buf->available += shared_available;
    }
  }

  {
    std::lock_guard<std::mutex> l(vstatfs_lock);

    buf->allocated = vstatfs.allocated();
    buf->stored = vstatfs.stored();
    buf->compressed = vstatfs.compressed();
    buf->compressed_original = vstatfs.compressed_original();
    buf->compressed_allocated = vstatfs.compressed_allocated();
  }

  dout(20) << __func__ << *buf << dendl;
  return 0;
}
BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
{
  RWLock::RLocker l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}

void BlueStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
  // _reap_collections and this in the same thread,
  // so no need a lock.
  removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
  list<CollectionRef> removed_colls;
  {
    // _queue_reap_collection and this in the same thread.
    // So no need a lock.
    if (!removed_collections.empty())
      removed_colls.swap(removed_collections);
    else
      return;
  }

  list<CollectionRef>::iterator p = removed_colls.begin();
  while (p != removed_colls.end()) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
    if (c->onode_map.map_any([&](OnodeRef o) {
	  assert(!o->exists);
	  if (o->flushing_count.load()) {
	    dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
		     << " flush_txns " << o->flushing_count << dendl;
	    return true;
	  }
	  return false;
	})) {
      ++p;
      continue;
    }
    c->onode_map.clear();
    p = removed_colls.erase(p);
    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
  }
  if (removed_colls.empty()) {
    dout(10) << __func__ << " all reaped" << dendl;
  } else {
    removed_collections.splice(removed_collections.begin(), removed_colls);
  }
}
void BlueStore::_update_cache_logger()
{
  uint64_t num_onodes = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_buffers = 0;
  uint64_t num_buffer_bytes = 0;
  for (auto c : cache_shards) {
    c->add_stats(&num_onodes, &num_extents, &num_blobs,
		 &num_buffers, &num_buffer_bytes);
  }
  logger->set(l_bluestore_onodes, num_onodes);
  logger->set(l_bluestore_extents, num_extents);
  logger->set(l_bluestore_blobs, num_blobs);
  logger->set(l_bluestore_buffers, num_buffers);
  logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
}
ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
{
  return _get_collection(cid);
}

bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return false;
  return exists(c, oid);
}

bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return false;

  bool r = true;

  {
    RWLock::RLocker l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      r = false;
  }

  return r;
}
int BlueStore::stat(
  const coll_t& cid,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return stat(c, oid, st, allow_eio);
}

int BlueStore::stat(
  CollectionHandle &c_,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;

  {
    RWLock::RLocker l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      return -ENOENT;
    st->st_size = o->onode.size;
    st->st_blksize = 4096;
    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
    st->st_nlink = 1;
  }

  int r = 0;
  if (_debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  return r;
}
int BlueStore::set_collection_opts(
  const coll_t& cid,
  const pool_opts_t& opts)
{
  CollectionHandle ch = _get_collection(cid);
  if (!ch)
    return -ENOENT;
  Collection *c = static_cast<Collection *>(ch.get());
  dout(15) << __func__ << " " << cid << " options " << opts << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::WLocker l(c->lock);
  c->pool_opts = opts;
  return 0;
}
int BlueStore::read(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return read(c, oid, offset, length, bl, op_flags);
}

int BlueStore::read(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  utime_t start = ceph_clock_now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    RWLock::RLocker l(c->lock);
    utime_t start1 = ceph_clock_now();
    OnodeRef o = c->get_onode(oid, false);
    logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (offset == length && offset == 0)
      length = o->onode.size;

    r = _do_read(c, o, offset, length, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r >= 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (cct->_conf->bluestore_debug_random_read_err &&
	     (rand() % (int)(cct->_conf->bluestore_debug_random_read_err * 100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
  return r;
}
6703 // intermediate data structures used while reading
6705 uint64_t logical_offset
;
6706 uint64_t blob_xoffset
; //region offset within the blob
6710 // used later in read process
6714 region_t(uint64_t offset
, uint64_t b_offs
, uint64_t len
)
6715 : logical_offset(offset
),
6716 blob_xoffset(b_offs
),
6718 region_t(const region_t
& from
)
6719 : logical_offset(from
.logical_offset
),
6720 blob_xoffset(from
.blob_xoffset
),
6721 length(from
.length
){}
6723 friend ostream
& operator<<(ostream
& out
, const region_t
& r
) {
6724 return out
<< "0x" << std::hex
<< r
.logical_offset
<< ":"
6725 << r
.blob_xoffset
<< "~" << r
.length
<< std::dec
;
6729 typedef list
<region_t
> regions2read_t
;
6730 typedef map
<BlueStore::BlobRef
, regions2read_t
> blobs2read_t
;
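
// For illustration (hypothetical layout, not data from a real store): a
// logical read of 0x0~0x3000 that spans two blobs could end up as
//   blobs2read = { blobA -> [ 0x0:0x0~0x2000 ],
//                  blobB -> [ 0x2000:0x0~0x1000 ] }
// i.e. each blob maps to the list of (logical offset : blob offset ~ length)
// regions that still have to be fetched from disk after the cache lookup
// performed in _do_read() below.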
int BlueStore::_do_read(
  Collection *c,
  OnodeRef o,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  uint64_t retry_count)
{
  int r = 0;
  int read_cache_policy = 0; // do not bypass clean or dirty cache

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;
  bl.clear();

  if (offset >= o->onode.size) {
    return r;
  }

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  utime_t start = ceph_clock_now();
  o->extent_map.fault_range(db, offset, length);
  logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);

  ready_regions_t ready_regions;

  // for deep-scrub, we only read dirty cache and bypass clean cache in
  // order to read underlying block device in case there are silent disk errors.
  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
  }

  // build blob-wise list of stuff to read (that isn't cached)
  blobs2read_t blobs2read;
  unsigned left = length;
  uint64_t pos = offset;
  unsigned num_regions = 0;
  auto lp = o->extent_map.seek_lextent(offset);
  while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
        break;
      }
      dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
               << std::dec << dendl;
      pos += hole;
      left -= hole;
    }
    BlobRef& bptr = lp->blob;
    unsigned l_off = pos - lp->logical_offset;
    unsigned b_off = l_off + lp->blob_offset;
    unsigned b_len = std::min(left, lp->length - l_off);

    ready_regions_t cache_res;
    interval_set<uint32_t> cache_interval;
    bptr->shared_blob->bc.read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
      read_cache_policy);
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need 0x" << b_off << "~" << b_len
             << " cache has 0x" << cache_interval
             << std::dec << dendl;

    auto pc = cache_res.begin();
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
          pc->first == b_off) {
        l = pc->second.length();
        ready_regions[pos].claim(pc->second);
        dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        ++pc;
      } else {
        l = b_len;
        if (pc != cache_res.end()) {
          assert(pc->first > b_off);
          l = pc->first - b_off;
        }
        dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
        ++num_regions;
      }
      pos += l;
      b_off += l;
      left -= l;
      b_len -= l;
    }
    ++lp;
  }
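
  // Example of the split performed by the loop above (hypothetical numbers):
  // for a blob range b_off=0x0~0x4000 where the cache holds only
  // 0x1000~0x1000, the loop emits
  //   read  0x0~0x1000      (miss -> queued in blobs2read)
  //   cache 0x1000~0x1000   (hit  -> claimed into ready_regions)
  //   read  0x2000~0x2000   (miss -> queued in blobs2read)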
  // read raw blob data. use aio if we have >1 blobs to read.
  start = ceph_clock_now(); // for the sake of simplicity
                            // measure the whole block below.
                            // The error isn't that much...
  vector<bufferlist> compressed_blob_bls;
  IOContext ioc(cct, NULL, true); // allow EIO
  for (auto& p : blobs2read) {
    const BlobRef& bptr = p.first;
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need " << p.second << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
      if (compressed_blob_bls.empty()) {
        // ensure we avoid any reallocation on subsequent blobs
        compressed_blob_bls.reserve(blobs2read.size());
      }
      compressed_blob_bls.push_back(bufferlist());
      bufferlist& bl = compressed_blob_bls.back();
      r = bptr->get_blob().map(
        0, bptr->get_blob().get_ondisk_length(),
        [&](uint64_t offset, uint64_t length) {
          int r;
          // use aio if there are more regions to read than those in this blob
          if (num_regions > p.second.size()) {
            r = bdev->aio_read(offset, length, &bl, &ioc);
          } else {
            r = bdev->read(offset, length, &bl, &ioc, false);
          }
          if (r < 0)
            return r;
          return 0;
        });
      if (r < 0) {
        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
        if (r == -EIO) {
          // propagate EIO to caller
          return r;
        }
        assert(r == 0);
      }
    } else {
      // read the pieces
      for (auto& reg : p.second) {
        // determine how much of the blob to read
        uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
        reg.r_off = reg.blob_xoffset;
        uint64_t r_len = reg.length;
        reg.front = reg.r_off % chunk_size;
        if (reg.front) {
          reg.r_off -= reg.front;
          r_len += reg.front;
        }
        unsigned tail = r_len % chunk_size;
        if (tail) {
          r_len += chunk_size - tail;
        }
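
        // Worked example (hypothetical): with chunk_size=0x1000, a region at
        // blob_xoffset=0x1800 with length=0x400 becomes
        //   front = 0x1800 % 0x1000 = 0x800
        //   r_off = 0x1800 - 0x800  = 0x1000
        //   r_len = 0x400 + 0x800   = 0xc00, rounded up to 0x1000
        // so we read one full checksum chunk and trim 'front' bytes later.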
        dout(20) << __func__ << " region 0x" << std::hex
                 << reg.logical_offset
                 << ": 0x" << reg.blob_xoffset << "~" << reg.length
                 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
                 << dendl;

        // read it
        r = bptr->get_blob().map(
          reg.r_off, r_len,
          [&](uint64_t offset, uint64_t length) {
            int r;
            // use aio if there is more than one region to read
            if (num_regions > 1) {
              r = bdev->aio_read(offset, length, &reg.bl, &ioc);
            } else {
              r = bdev->read(offset, length, &reg.bl, &ioc, false);
            }
            if (r < 0)
              return r;
            return 0;
          });
        if (r < 0) {
          derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
               << dendl;
          if (r == -EIO) {
            // propagate EIO to caller
            return r;
          }
          assert(r == 0);
        }
        assert(reg.bl.length() == r_len);
      }
    }
  }
  if (ioc.has_pending_aios()) {
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    const BlobRef& bptr = b2r_it->first;
    dout(20) << __func__ << " blob " << *bptr << std::hex
             << " need 0x" << b2r_it->second << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
                       b2r_it->second.front().logical_offset) < 0) {
        // Handles spurious read errors caused by a kernel bug.
        // We sometimes get all-zero pages as a result of the read under
        // high memory pressure. Retrying the failing read succeeds in most cases.
        // See also: http://tracker.ceph.com/issues/22464
        if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
          return -EIO;
        }
        return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
      }
      bufferlist raw_bl;
      r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
        return r;
      if (buffered) {
        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
                                       raw_bl);
      }
      for (auto& i : b2r_it->second) {
        ready_regions[i.logical_offset].substr_of(
          raw_bl, i.blob_xoffset, i.length);
      }
    } else {
      for (auto& reg : b2r_it->second) {
        if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
                         reg.logical_offset) < 0) {
          // Handles spurious read errors caused by a kernel bug.
          // We sometimes get all-zero pages as a result of the read under
          // high memory pressure. Retrying the failing read succeeds in most cases.
          // See also: http://tracker.ceph.com/issues/22464
          if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
            return -EIO;
          }
          return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
        }
        if (buffered) {
          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
                                         reg.r_off, reg.bl);
        }

        // prune and keep result
        ready_regions[reg.logical_offset].substr_of(
          reg.bl, reg.front, reg.length);
      }
    }
    ++b2r_it;
  }

  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
  uint64_t pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": data from 0x" << pr->first << "~" << pr->second.length()
               << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
    } else {
      uint64_t l = length - pos;
      if (pr != pr_end) {
        assert(pr->first > pos + offset);
        l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": zeros for 0x" << (pos + offset) << "~" << l
               << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
  }
  assert(bl.length() == length);
  assert(pos == length);
  assert(pr == pr_end);
  r = bl.length();
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
    dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
            << " failed " << std::dec << retry_count
            << " times before succeeding" << dendl;
  }
  return r;
}
int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
                            uint64_t logical_offset) const
{
  int bad;
  uint64_t bad_csum;
  utime_t start = ceph_clock_now();
  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
  if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
      (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
    derr << __func__ << " injecting bluestore checksum verification error" << dendl;
    bad = blob_xoffset;
    r = -1;
    bad_csum = 0xDEADBEEF;
  }
  if (r < 0) {
    if (r == -1) {
      PExtentVector pex;
      blob->map(
        bad,
        blob->get_csum_chunk_size(),
        [&](uint64_t offset, uint64_t length) {
          pex.emplace_back(bluestore_pextent_t(offset, length));
          return 0;
        });
      derr << __func__ << " bad "
           << Checksummer::get_csum_type_string(blob->csum_type)
           << "/0x" << std::hex << blob->get_csum_chunk_size()
           << " checksum at blob offset 0x" << bad
           << ", got 0x" << bad_csum << ", expected 0x"
           << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
           << ", device location " << pex
           << ", logical extent 0x" << std::hex
           << (logical_offset + bad - blob_xoffset) << "~"
           << blob->get_csum_chunk_size() << std::dec
           << ", object " << o->oid
           << dendl;
    } else {
      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
    }
  }
  logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
  return r;
}
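
// Worked example of the mismatch report above (hypothetical numbers): with
// csum_chunk_size=0x1000 and verify_csum() flagging blob offset bad=0x3000,
// the expected value printed is csum item bad / 0x1000 = 3, and the damaged
// logical extent starts at logical_offset + bad - blob_xoffset.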
int BlueStore::_decompress(bufferlist& source, bufferlist* result)
{
  int r = 0;
  utime_t start = ceph_clock_now();
  bufferlist::iterator i = source.begin();
  bluestore_compression_header_t chdr;
  ::decode(chdr, i);
  int alg = int(chdr.type);
  CompressorRef cp = compressor;
  if (!cp || (int)cp->get_type() != alg) {
    cp = Compressor::create(cct, alg);
  }

  if (!cp.get()) {
    // if compressor isn't available - error, because cannot return
    // decompressed data?
    derr << __func__ << " can't load decompressor " << alg << dendl;
    r = -EIO;
  } else {
    r = cp->decompress(i, chdr.length, *result);
    if (r < 0) {
      derr << __func__ << " decompression failed with exit code " << r << dendl;
      r = -EIO;
    }
  }
  logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
  return r;
}
// this stores fiemap into interval_set, other variations
// use it internally
int BlueStore::_fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  interval_set<uint64_t>& destset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  {
    RWLock::RLocker l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      return -ENOENT;
    }
    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
             << " size 0x" << o->onode.size << std::dec << dendl;

    boost::intrusive::set<Extent>::iterator ep, eend;
    if (offset >= o->onode.size)
      goto out;

    if (offset + length > o->onode.size) {
      length = o->onode.size - offset;
    }

    o->extent_map.fault_range(db, offset, length);
    eend = o->extent_map.extent_map.end();
    ep = o->extent_map.seek_lextent(offset);
    while (length > 0) {
      dout(20) << __func__ << " offset " << offset << dendl;
      if (ep != eend && ep->logical_offset + ep->length <= offset) {
        ++ep;
        continue;
      }

      uint64_t x_len = length;
      if (ep != eend && ep->logical_offset <= offset) {
        uint64_t x_off = offset - ep->logical_offset;
        x_len = MIN(x_len, ep->length - x_off);
        dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
                 << x_len << std::dec << " blob " << ep->blob << dendl;
        destset.insert(offset, x_len);
        length -= x_len;
        offset += x_len;
        if (x_off + x_len == ep->length)
          ++ep;
        continue;
      }
      if (ep != eend &&
          ep->logical_offset > offset &&
          ep->logical_offset - offset < x_len) {
        x_len = ep->logical_offset - offset;
      }
      offset += x_len;
      length -= x_len;
    }
  }

 out:
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size = 0x(" << destset << ")" << std::dec << dendl;
  return 0;
}
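
// Example result (hypothetical): an object of size 0x4000 with data written
// only at 0x0~0x1000 and 0x3000~0x1000 yields
//   destset = { 0x0~0x1000, 0x3000~0x1000 }
// the hole in the middle is simply absent from the interval set.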
int BlueStore::fiemap(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t len,
  bufferlist& bl)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return fiemap(c, oid, offset, len, bl);
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    ::encode(m, bl);
  }
  return r;
}

int BlueStore::fiemap(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t len,
  map<uint64_t, uint64_t>& destmap)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return fiemap(c, oid, offset, len, destmap);
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  map<uint64_t, uint64_t>& destmap)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    m.move_into(destmap);
  }
  return r;
}
int BlueStore::getattr(
  const coll_t& cid,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return getattr(c, oid, name, value);
}

int BlueStore::getattr(
  CollectionHandle &c_,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    RWLock::RLocker l(c->lock);
    mempool::bluestore_cache_other::string k(name);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (!o->onode.attrs.count(k)) {
      r = -ENODATA;
      goto out;
    }
    value = o->onode.attrs[k];
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
           << " = " << r << dendl;
  return r;
}
int BlueStore::getattrs(
  const coll_t& cid,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return getattrs(c, oid, aset);
}

int BlueStore::getattrs(
  CollectionHandle &c_,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    RWLock::RLocker l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    for (auto& i : o->onode.attrs) {
      aset.emplace(i.first.c_str(), i.second);
    }
    r = 0;
  }

 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::list_collections(vector<coll_t>& ls)
{
  RWLock::RLocker l(coll_lock);
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}

bool BlueStore::collection_exists(const coll_t& c)
{
  RWLock::RLocker l(coll_lock);
  return coll_map.count(c);
}
int BlueStore::collection_empty(const coll_t& cid, bool *empty)
{
  dout(15) << __func__ << " " << cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
                          &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
         << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
  return 0;
}
int BlueStore::collection_bits(const coll_t& cid)
{
  dout(15) << __func__ << " " << cid << dendl;
  CollectionRef c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}
int BlueStore::collection_list(
  const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return collection_list(c, start, end, max, ls, pnext);
}

int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    RWLock::RLocker l(c->lock);
    r = _collection_list(c, start, end, max, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  if (!c->exists)
    return -ENOENT;

  int r = 0;
  ghobject_t static_next;
  KeyValueDB::Iterator it;
  string temp_start_key, temp_end_key;
  string start_key, end_key;
  bool set_next = false;
  string pend;
  bool temp;

  if (!pnext)
    pnext = &static_next;

  if (start == ghobject_t::get_max() ||
      start.hobj.is_max()) {
    goto out;
  }
  get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
                     &start_key, &end_key);
  dout(20) << __func__
           << " range " << pretty_binary_string(temp_start_key)
           << " to " << pretty_binary_string(temp_end_key)
           << " and " << pretty_binary_string(start_key)
           << " to " << pretty_binary_string(end_key)
           << " start " << start << dendl;
  it = db->get_iterator(PREFIX_OBJ);
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(temp_start_key);
    temp = true;
  } else {
    string k;
    get_object_key(cct, start, &k);
    if (start.hobj.is_temp()) {
      temp = true;
      assert(k >= temp_start_key && k < temp_end_key);
    } else {
      temp = false;
      assert(k >= start_key && k < end_key);
    }
    dout(20) << " start from " << pretty_binary_string(k)
             << " temp=" << (int)temp << dendl;
    it->lower_bound(k);
  }
  if (end.hobj.is_max()) {
    pend = temp ? temp_end_key : end_key;
  } else {
    get_object_key(cct, end, &end_key);
    if (end.hobj.is_temp()) {
      if (temp)
        pend = end_key;
      else
        goto out;
    } else {
      pend = temp ? temp_end_key : end_key;
    }
  }
  dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
  while (true) {
    if (!it->valid() || it->key() >= pend) {
      if (!it->valid())
        dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
        dout(20) << __func__ << " key " << pretty_binary_string(it->key())
                 << " >= " << end << dendl;
      if (temp) {
        if (end.hobj.is_temp()) {
          break;
        }
        dout(30) << __func__ << " switch to non-temp namespace" << dendl;
        temp = false;
        it->upper_bound(start_key);
        pend = end_key;
        dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
        continue;
      }
      break;
    }
    dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
    if (is_extent_shard_key(it->key())) {
      it->next();
      continue;
    }
    ghobject_t oid;
    int r = get_key_object(it->key(), &oid);
    assert(r == 0);
    dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = oid;
      set_next = true;
      break;
    }
    ls->push_back(oid);
    it->next();
  }
 out:
  if (!set_next) {
    *pnext = ghobject_t::get_max();
  }
  return r;
}
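
// Note on the scan above: a collection's objects live in two contiguous key
// ranges, the "temp" namespace first and the normal namespace after it. A
// listing that begins in the temp range therefore falls through to the
// normal range once the temp keys are exhausted; that is the
// upper_bound(start_key) switch inside the loop.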
int BlueStore::omap_get(
  const coll_t& cid,              ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  bufferlist *header,             ///< [out] omap header
  map<string, bufferlist> *out    ///< [out] Key to value map
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get(c, oid, header, out);
}

int BlueStore::omap_get(
  CollectionHandle &c_,           ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  bufferlist *header,             ///< [out] omap header
  map<string, bufferlist> *out    ///< [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_header(o->onode.nid, &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
        dout(30) << __func__ << " got header" << dendl;
        *header = it->value();
      } else if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        string user_key;
        decode_omap_key(it->key(), &user_key);
        dout(30) << __func__ << " got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        (*out)[user_key] = it->value();
      }
      it->next();
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_header(
  const coll_t& cid,              ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  bufferlist *header,             ///< [out] omap header
  bool allow_eio                  ///< [in] don't assert on eio
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_header(c, oid, header, allow_eio);
}

int BlueStore::omap_get_header(
  CollectionHandle &c_,           ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  bufferlist *header,             ///< [out] omap header
  bool allow_eio                  ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    string head;
    get_omap_header(o->onode.nid, &head);
    if (db->get(PREFIX_OMAP, head, header) >= 0) {
      dout(30) << __func__ << " got header" << dendl;
    } else {
      dout(30) << __func__ << " no header" << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_keys(
  const coll_t& cid,              ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  set<string> *keys               ///< [out] Keys defined on oid
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_keys(c, oid, keys);
}

int BlueStore::omap_get_keys(
  CollectionHandle &c_,           ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  set<string> *keys               ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_key(o->onode.nid, string(), &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      }
      string user_key;
      decode_omap_key(it->key(), &user_key);
      dout(30) << __func__ << " got " << pretty_binary_string(it->key())
               << " -> " << user_key << dendl;
      keys->insert(user_key);
      it->next();
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_values(
  const coll_t& cid,              ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  const set<string> &keys,        ///< [in] Keys to get
  map<string, bufferlist> *out    ///< [out] Returned keys and values
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_values(c, oid, keys, out);
}

int BlueStore::omap_get_values(
  CollectionHandle &c_,           ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  const set<string> &keys,        ///< [in] Keys to get
  map<string, bufferlist> *out    ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
    final_key.resize(9); // keep prefix
    final_key += *p;
    bufferlist val;
    if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
      dout(30) << __func__ << " got " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
      out->insert(make_pair(*p, val));
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_check_keys(
  const coll_t& cid,              ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  const set<string> &keys,        ///< [in] Keys to check
  set<string> *out                ///< [out] Subset of keys defined on oid
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_check_keys(c, oid, keys, out);
}

int BlueStore::omap_check_keys(
  CollectionHandle &c_,           ///< [in] Collection containing oid
  const ghobject_t &oid,          ///< [in] Object containing omap
  const set<string> &keys,        ///< [in] Keys to check
  set<string> *out                ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
    final_key.resize(9); // keep prefix
    final_key += *p;
    bufferlist val;
    if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
      dout(30) << __func__ << " have " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
      out->insert(*p);
    } else {
      dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  const coll_t& cid,              ///< [in] collection
  const ghobject_t &oid           ///< [in] object
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c) {
    dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  return get_omap_iterator(c, oid);
}

ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,           ///< [in] collection
  const ghobject_t &oid           ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  RWLock::RLocker l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  o->flush();
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}
// -----------------
// write helpers

void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    ::encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    ::encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}
int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      uint64_t v;
      ::decode(v, p);
      nid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      uint64_t v;
      ::decode(v, p);
      blobid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
      dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
    } else {
      assert("Not Support extent freelist manager" == 0);
    }
  }

  // bluefs alloc
  if (cct->_conf->bluestore_bluefs) {
    bluefs_extents.clear();
    bufferlist bl;
    db->get(PREFIX_SUPER, "bluefs_extents", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      ::decode(bluefs_extents, p);
    }
    catch (buffer::error& e) {
      derr << __func__ << " unable to read bluefs_extents" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
             << std::dec << dendl;
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.begin();
      try {
        ::decode(ondisk_format, p);
      } catch (buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        auto p = bl.begin();
        try {
          ::decode(compat_ondisk_format, p);
        } catch (buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(10) << __func__ << " ondisk_format " << ondisk_format
             << " compat_ondisk_format " << compat_ondisk_format
             << dendl;
  }

  if (latest_ondisk_format < compat_ondisk_format) {
    derr << __func__ << " compat_ondisk_format is "
         << compat_ondisk_format << " but we only understand version "
         << latest_ondisk_format << dendl;
    return -EPERM;
  }
  if (ondisk_format < latest_ondisk_format) {
    int r = _upgrade_super();
    if (r < 0) {
      return r;
    }
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.begin();
    try {
      uint64_t val;
      ::decode(val, p);
      min_alloc_size = val;
      min_alloc_size_order = ctz(val);
      assert(min_alloc_size == 1u << min_alloc_size_order);
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
             << std::dec << dendl;
  }

  _set_throttle_params();
  _set_finisher_num();

  return 0;
}
int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  assert(ondisk_format > 0);
  assert(ondisk_format < latest_ondisk_format);

  if (ondisk_format == 1) {
    // changes:
    // - super: added ondisk_format
    // - super: added min_readable_ondisk_format
    // - super: added min_compat_ondisk_format
    // - super: added min_alloc_size
    // - super: removed min_min_alloc_size
    KeyValueDB::Transaction t = db->get_transaction();
    {
      bufferlist bl;
      db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
      auto p = bl.begin();
      try {
        uint64_t val;
        ::decode(val, p);
        min_alloc_size = val;
      } catch (buffer::error& e) {
        derr << __func__ << " failed to read min_min_alloc_size" << dendl;
        return -EIO;
      }
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
      t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
    }
    ondisk_format = 2;
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    assert(r == 0);
  }

  // done
  dout(1) << __func__ << " done" << dendl;
  return 0;
}
void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid)
    return;
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
}

uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}

void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}
BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
{
  TransContext *txc = new TransContext(cct, osr);
  txc->t = db->get_transaction();
  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}
void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // this is about the simplest model for transaction cost you can
  // imagine. there is some fixed overhead cost by saying there is a
  // minimum of one "io". and then we have some cost per "io" that is
  // a configurable (with different hdd and ssd defaults), and add
  // that to the bytes value.
  int ios = 1; // one "io" for the kv commit
  for (auto& p : txc->ioc.pending_aios) {
    ios += p.iov.size();
  }
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}
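
// Worked example of the model above (hypothetical numbers, not defaults):
// with throttle_cost_per_io = 670000 and a txc whose pending aios carry 4
// iovecs in total plus 8192 dirty bytes:
//   ios  = 1 + 4 = 5
//   cost = 5 * 670000 + 8192 = 3358192
// i.e. the io count, not the byte count, tends to dominate the throttle.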
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());

  {
    std::lock_guard<std::mutex> l(vstatfs_lock);
    vstatfs += txc->statfs_delta;
  }

  bufferlist bl;
  txc->statfs_delta.encode(bl);

  txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
  txc->statfs_delta.reset();
}
void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
             << " " << txc->get_state_name() << dendl;
    switch (txc->state) {
    case TransContext::STATE_PREPARE:
      txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
        txc->state = TransContext::STATE_AIO_WAIT;
        txc->had_ios = true;
        _txc_aio_submit(txc);
        return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      //assert(txc->osr->qlock.is_locked());  // see _txc_finish_io
      if (txc->had_ios) {
        ++txc->osr->txc_with_unstable_io;
      }
      txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
      txc->state = TransContext::STATE_KV_QUEUED;
      if (cct->_conf->bluestore_sync_submit_transaction) {
        if (txc->last_nid >= nid_max ||
            txc->last_blobid >= blobid_max) {
          dout(20) << __func__
                   << " last_{nid,blobid} exceeds max, submit via kv thread"
                   << dendl;
        } else if (txc->osr->kv_committing_serially) {
          dout(20) << __func__ << " prior txc submitted via kv thread, us too"
                   << dendl;
          // note: this is starvation-prone. once we have a txc in a busy
          // sequencer that is committing serially it is possible to keep
          // submitting new transactions fast enough that we get stuck doing
          // so. the alternative is to block here... fixme?
        } else if (txc->osr->txc_with_unstable_io) {
          dout(20) << __func__ << " prior txc(s) with unstable ios "
                   << txc->osr->txc_with_unstable_io.load() << dendl;
        } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
                   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
                   == 0) {
          dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
                   << dendl;
        } else {
          txc->state = TransContext::STATE_KV_SUBMITTED;
          int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
          assert(r == 0);
          _txc_applied_kv(txc);
        }
      }
      {
        std::lock_guard<std::mutex> l(kv_lock);
        kv_queue.push_back(txc);
        kv_cond.notify_one();
        if (txc->state != TransContext::STATE_KV_SUBMITTED) {
          kv_queue_unsubmitted.push_back(txc);
          ++txc->osr->kv_committing_serially;
        }
        if (txc->had_ios)
          kv_ios++;
        kv_throttle_costs += txc->cost;
      }
      return;

    case TransContext::STATE_KV_SUBMITTED:
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
        txc->state = TransContext::STATE_DEFERRED_QUEUED;
        _deferred_queue(txc);
        return;
      }
      txc->state = TransContext::STATE_FINISHING;
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
      txc->state = TransContext::STATE_FINISHING;
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
           << " state " << txc->get_state_name() << dendl;
      assert(0 == "unexpected txc state");
      return;
    }
  }
}
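
// State flow implemented above (happy path, no deferred writes):
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED
//           -> KV_DONE -> FINISHING -> done
// With a deferred_txn attached, KV_DONE instead detours through
// DEFERRED_QUEUED and DEFERRED_CLEANUP before reaching FINISHING.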
void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard<std::mutex> l(osr->qlock);
  txc->state = TransContext::STATE_IO_DONE;

  // release aio contexts (including pinned buffers).
  txc->ioc.running_aios.clear();

  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->state < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
               << p->get_state_name() << dendl;
      return;
    }
    if (p->state > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
           p->state == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters &&
      osr->_is_all_kv_submitted()) {
    osr->qcond.notify_all();
  }
}
void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
           << " onodes " << txc->onodes
           << " shared_blobs " << txc->shared_blobs
           << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    // finalize extent_map shards
    o->extent_map.update(t, false);
    if (o->extent_map.needs_reshard()) {
      o->extent_map.reshard(db, t);
      o->extent_map.update(t, true);
      if (o->extent_map.needs_reshard()) {
        dout(20) << __func__ << " warning: still wants reshard, check options?"
                 << dendl;
        o->extent_map.clear_needs_reshard();
      }
      logger->inc(l_bluestore_onode_reshard);
    }

    // bound encode
    size_t bound = 0;
    denc(o->onode, bound);
    o->extent_map.bound_encode_spanning_blobs(bound);
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, bound);
    }

    // encode
    bufferlist bl;
    unsigned onode_part, blob_part, extent_part;
    {
      auto p = bl.get_contiguous_appender(bound, true);
      denc(o->onode, p);
      onode_part = p.get_logical_offset();
      o->extent_map.encode_spanning_blobs(p);
      blob_part = p.get_logical_offset() - onode_part;
      if (o->onode.extent_map_shards.empty()) {
        denc(o->extent_map.inline_bl, p);
      }
      extent_part = p.get_logical_offset() - onode_part - blob_part;
    }

    dout(20) << " onode " << o->oid << " is " << bl.length()
             << " (" << onode_part << " bytes onode + "
             << blob_part << " bytes spanning blobs + "
             << extent_part << " bytes inline extents)"
             << dendl;
    t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
               << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      ::encode(*(sb->persistent), bl);
      dout(20) << " shared_blob 0x" << std::hex << sbid << std::dec
               << " is " << bl.length() << " " << *sb << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}
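
// Example of the size breakdown logged above (hypothetical numbers): an
// onode that encodes to 706 bytes might split into 180 bytes of onode
// metadata + 96 bytes of spanning blobs + 430 bytes of inline extents; the
// inline extent part disappears once the extent map is spilled into
// separate shard keys.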
void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency.consume_next(
    logger.get_tavg_ms(
      l_bluestore_commit_lat));
  os_apply_latency.consume_next(
    logger.get_tavg_ms(
      l_bluestore_commit_lat));
}
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
           << " allocated 0x" << txc->allocated
           << " released 0x" << txc->released
           << std::dec << dendl;

  // We have to handle the case where we allocate *and* deallocate the
  // same region in this transaction. The freelist doesn't like that.
  // (Actually, the only thing that cares is the BitmapFreelistManager
  // debug check. But that's important.)
  interval_set<uint64_t> tmp_allocated, tmp_released;
  interval_set<uint64_t> *pallocated = &txc->allocated;
  interval_set<uint64_t> *preleased = &txc->released;
  if (!txc->allocated.empty() && !txc->released.empty()) {
    interval_set<uint64_t> overlap;
    overlap.intersection_of(txc->allocated, txc->released);
    if (!overlap.empty()) {
      tmp_allocated = txc->allocated;
      tmp_allocated.subtract(overlap);
      tmp_released = txc->released;
      tmp_released.subtract(overlap);
      dout(20) << __func__ << " overlap 0x" << std::hex << overlap
               << ", new allocated 0x" << tmp_allocated
               << " released 0x" << tmp_released << std::dec
               << dendl;
      pallocated = &tmp_allocated;
      preleased = &tmp_released;
    }
  }

  // update freelist with non-overlap sets
  for (interval_set<uint64_t>::iterator p = pallocated->begin();
       p != pallocated->end();
       ++p) {
    fm->allocate(p.get_start(), p.get_len(), t);
  }
  for (interval_set<uint64_t>::iterator p = preleased->begin();
       p != preleased->end();
       ++p) {
    dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
             << "~" << p.get_len() << std::dec << dendl;
    fm->release(p.get_start(), p.get_len(), t);
  }

  _txc_update_store_statfs(txc);
}
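
// Worked example of the overlap handling above (hypothetical):
//   allocated = { [0x0..0x2000) },  released = { [0x1000..0x3000) }
//   overlap   = { [0x1000..0x2000) }
// after subtracting the overlap we record allocate([0x0..0x1000)) and
// release([0x2000..0x3000)) only, so the freelist never sees the same
// region both allocated and released within one transaction.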
void BlueStore::_txc_applied_kv(TransContext *txc)
{
  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
               << dendl;
      if (--o->flushing_count == 0) {
        std::lock_guard<std::mutex> l(o->flush_lock);
        o->flush_cond.notify_all();
      }
    }
  }
}
void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;

  // warning: we're calling onreadable_sync inside the sequencer lock
  if (txc->onreadable_sync) {
    txc->onreadable_sync->complete(0);
    txc->onreadable_sync = NULL;
  }
  unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
  if (txc->oncommit) {
    logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
    finishers[n]->queue(txc->oncommit);
    txc->oncommit = NULL;
  }
  if (txc->onreadable) {
    finishers[n]->queue(txc->onreadable);
    txc->onreadable = NULL;
  }

  {
    std::lock_guard<std::mutex> l(txc->osr->qlock);
    txc->state = TransContext::STATE_KV_DONE;
    if (!txc->oncommits.empty()) {
      finishers[n]->queue(txc->oncommits);
    }
  }
  txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
}
void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  assert(txc->state == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->finish_write(txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  bool empty = false;
  bool submit_deferred = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard<std::mutex> l(osr->qlock);
    txc->state = TransContext::STATE_DONE;
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
               << dendl;
      if (txc->state != TransContext::STATE_DONE) {
        if (txc->state == TransContext::STATE_PREPARE &&
            deferred_aggressive) {
          // for _osr_drain_preceding()
          notify = true;
        }
        if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
            osr->q.size() > g_conf->bluestore_max_deferred_txc) {
          submit_deferred = true;
        }
        break;
      }

      osr->q.pop_front();
      releasing_txc.push_back(*txc);
      notify = true;
    }
    if (notify) {
      osr->qcond.notify_all();
    }
    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }
  }
  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    txc->log_state_latency(logger, l_bluestore_state_done_lat);
    delete txc;
  }

  if (submit_deferred) {
    // we're pinning memory; flush! we could be more fine-grained here but
    // i'm not sure it's worth the bother.
    deferred_try_submit();
  }

  if (empty && osr->zombie) {
    dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    osr->_unregister();
  }
  logger->set(l_bluestore_fragmentation,
              (uint64_t)(alloc->get_fragmentation(min_alloc_size) * 1000));
}
void BlueStore::_txc_release_alloc(TransContext *txc)
{
  interval_set<uint64_t> bulk_release_extents;
  // it's expected we're called with lazy_release_lock already taken!
  if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
    dout(10) << __func__ << " " << txc << " " << std::hex
             << txc->released << std::dec << dendl;
    // interval_set seems to be too costly for inserting things in
    // bstore_kv_final. We could serialize in simpler format and perform
    // the merge separately, maybe even in a dedicated thread.
    bulk_release_extents.insert(txc->released);
  }

  alloc->release(bulk_release_extents);
  txc->allocated.clear();
  txc->released.clear();
}
void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}
void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  {
    std::lock_guard<std::mutex> l(osr_lock);
    s = osr_set;
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard<std::mutex> l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  dout(10) << __func__ << " done" << dendl;
}
void BlueStore::_osr_unregister_all()
{
  set<OpSequencerRef> s;
  {
    std::lock_guard<std::mutex> l(osr_lock);
    s = osr_set;
  }
  dout(10) << __func__ << " " << s << dendl;
  for (auto osr : s) {
    osr->_unregister();

    if (!osr->zombie) {
      // break link from Sequencer to us so that this OpSequencer
      // instance can die with this mount/umount cycle. note that
      // we assume umount() will not race against ~Sequencer.
      assert(osr->parent);
      osr->parent->p.reset();
    }
  }
  // nobody should be creating sequencers during umount either.
  {
    std::lock_guard<std::mutex> l(osr_lock);
    assert(osr_set.empty());
  }
}
void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  for (int i = 0; i < m_finisher_num; ++i) {
    ostringstream oss;
    oss << "finisher-" << i;
    Finisher *f = new Finisher(cct, oss.str(), "finisher");
    finishers.push_back(f);
  }

  deferred_finisher.start();
  for (auto f : finishers) {
    f->start();
  }
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}
void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock<std::mutex> l(kv_lock);
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock<std::mutex> l(kv_finalize_lock);
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  assert(removed_collections.empty());
  {
    std::lock_guard<std::mutex> l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard<std::mutex> l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  deferred_finisher.wait_for_empty();
  deferred_finisher.stop();
  for (auto f : finishers) {
    f->wait_for_empty();
    f->stop();
  }
  dout(10) << __func__ << " stopped" << dendl;
}
void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock<std::mutex> l(kv_lock);
  assert(!kv_sync_started);
  bool bluefs_do_check_balance = false;
  kv_sync_started = true;
  kv_cond.notify_all();
  while (true) {
    assert(kv_committing.empty());
    if (kv_queue.empty() &&
        ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
         !deferred_aggressive) &&
        (bluefs_do_check_balance == false)) {
      if (kv_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      std::cv_status status = kv_cond.wait_for(l,
        std::chrono::milliseconds(int64_t(cct->_conf->bluestore_bluefs_balance_interval * 1000)));
      dout(20) << __func__ << " wake" << dendl;
      if (status == std::cv_status::timeout) {
        bluefs_do_check_balance = true;
      }
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      uint64_t aios = 0, costs = 0;

      dout(20) << __func__ << " committing " << kv_queue.size()
               << " submitting " << kv_queue_unsubmitted.size()
               << " deferred done " << deferred_done_queue.size()
               << " stable " << deferred_stable_queue.size()
               << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      aios = kv_ios;
      costs = kv_throttle_costs;
      kv_ios = 0;
      kv_throttle_costs = 0;
      utime_t start = ceph_clock_now();
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable. that means that if we do have done deferred
      // txcs AND we are not on a single device, we need to force a flush.
      if (bluefs_single_shared_device && bluefs) {
        if (aios) {
          force_flush = true;
        } else if (kv_committing.empty() && kv_submitting.empty() &&
                   deferred_stable.empty()) {
          force_flush = true;  // there's nothing else to commit!
        } else if (deferred_aggressive) {
          force_flush = true;
        }
      } else {
        force_flush = true;
      }

      if (force_flush) {
        dout(20) << __func__ << " num_aios=" << aios
                 << " force_flush=" << (int)force_flush
                 << ", flushing, deferred done->stable" << dendl;
        // flush/barrier on block device
        bdev->flush();

        // if we flush then deferred done are now deferred stable
        deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
                               deferred_done.end());
        deferred_done.clear();
      }
      utime_t after_flush = ceph_clock_now();

      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max? note that this covers both the
      // case where we are approaching the max and the case we passed
      // it. in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
        bufferlist bl;
        ::encode(new_nid_max, bl);
        t->set(PREFIX_SUPER, "nid_max", bl);
        dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
        bufferlist bl;
        ::encode(new_blobid_max, bl);
        t->set(PREFIX_SUPER, "blobid_max", bl);
        dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }
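
      // Example (hypothetical config): with bluestore_nid_prealloc = 1024,
      // once nid_last crosses nid_max - 512 we persist a new nid_max of
      // nid_last + 1024 in the earliest transaction that will commit, so the
      // in-memory counter can never outrun what is durable on disk.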
8804 for (auto txc
: kv_committing
) {
8805 if (txc
->state
== TransContext::STATE_KV_QUEUED
) {
8806 txc
->log_state_latency(logger
, l_bluestore_state_kv_queued_lat
);
8807 int r
= cct
->_conf
->bluestore_debug_omit_kv_commit
? 0 : db
->submit_transaction(txc
->t
);
8809 _txc_applied_kv(txc
);
8810 --txc
->osr
->kv_committing_serially
;
8811 txc
->state
= TransContext::STATE_KV_SUBMITTED
;
8812 if (txc
->osr
->kv_submitted_waiters
) {
8813 std::lock_guard
<std::mutex
> l(txc
->osr
->qlock
);
8814 if (txc
->osr
->_is_all_kv_submitted()) {
8815 txc
->osr
->qcond
.notify_all();
8820 assert(txc
->state
== TransContext::STATE_KV_SUBMITTED
);
8821 txc
->log_state_latency(logger
, l_bluestore_state_kv_queued_lat
);
8824 --txc
->osr
->txc_with_unstable_io
;
8828 // release throttle *before* we commit. this allows new ops
8829 // to be prepared and enter pipeline while we are waiting on
8830 // the kv commit sync/flush. then hopefully on the next
8831 // iteration there will already be ops awake. otherwise, we
8832 // end up going to sleep, and then wake up when the very first
8833 // transaction is ready for commit.
8834 throttle_bytes
.put(costs
);
8836 PExtentVector bluefs_gift_extents
;
8838 after_flush
- bluefs_last_balance
>
8839 cct
->_conf
->bluestore_bluefs_balance_interval
) {
8840 bluefs_last_balance
= after_flush
;
8841 int r
= _balance_bluefs_freespace(&bluefs_gift_extents
);
8844 for (auto& p
: bluefs_gift_extents
) {
8845 bluefs_extents
.insert(p
.offset
, p
.length
);
8848 ::encode(bluefs_extents
, bl
);
8849 dout(10) << __func__
<< " bluefs_extents now 0x" << std::hex
8850 << bluefs_extents
<< std::dec
<< dendl
;
8851 synct
->set(PREFIX_SUPER
, "bluefs_extents", bl
);
8854 bluefs_do_check_balance
= false;
8856 // cleanup sync deferred keys
8857 for (auto b
: deferred_stable
) {
8858 for (auto& txc
: b
->txcs
) {
8859 bluestore_deferred_transaction_t
& wt
= *txc
.deferred_txn
;
8860 if (!wt
.released
.empty()) {
8861 // kraken replay compat only
8862 txc
.released
= wt
.released
;
8863 dout(10) << __func__
<< " deferred txn has released "
8865 << " (we just upgraded from kraken) on " << &txc
<< dendl
;
8866 _txc_finalize_kv(&txc
, synct
);
8868 // cleanup the deferred
8870 get_deferred_key(wt
.seq
, &key
);
8871 synct
->rm_single_key(PREFIX_DEFERRED
, key
);
8875 // submit synct synchronously (block and wait for it to commit)
8876 int r
= cct
->_conf
->bluestore_debug_omit_kv_commit
? 0 : db
->submit_transaction_sync(synct
);
8880 nid_max
= new_nid_max
;
8881 dout(10) << __func__
<< " nid_max now " << nid_max
<< dendl
;
8883 if (new_blobid_max
) {
8884 blobid_max
= new_blobid_max
;
8885 dout(10) << __func__
<< " blobid_max now " << blobid_max
<< dendl
;
8889 utime_t finish
= ceph_clock_now();
8890 utime_t dur_flush
= after_flush
- start
;
8891 utime_t dur_kv
= finish
- after_flush
;
8892 utime_t dur
= finish
- start
;
8893 dout(20) << __func__
<< " committed " << kv_committing
.size()
8894 << " cleaned " << deferred_stable
.size()
8896 << " (" << dur_flush
<< " flush + " << dur_kv
<< " kv commit)"
8898 logger
->tinc(l_bluestore_kv_flush_lat
, dur_flush
);
8899 logger
->tinc(l_bluestore_kv_commit_lat
, dur_kv
);
8900 logger
->tinc(l_bluestore_kv_lat
, dur
);
8904 if (!bluefs_gift_extents
.empty()) {
8905 _commit_bluefs_freespace(bluefs_gift_extents
);
8907 dout(20) << __func__
<< " releasing old bluefs 0x" << std::hex
8908 << bluefs_extents_reclaiming
<< std::dec
<< dendl
;
8909 alloc
->release(bluefs_extents_reclaiming
);
8910 bluefs_extents_reclaiming
.clear();
8914 std::unique_lock
<std::mutex
> m(kv_finalize_lock
);
8915 if (kv_committing_to_finalize
.empty()) {
8916 kv_committing_to_finalize
.swap(kv_committing
);
8918 kv_committing_to_finalize
.insert(
8919 kv_committing_to_finalize
.end(),
8920 kv_committing
.begin(),
8921 kv_committing
.end());
8922 kv_committing
.clear();
8924 if (deferred_stable_to_finalize
.empty()) {
8925 deferred_stable_to_finalize
.swap(deferred_stable
);
8927 deferred_stable_to_finalize
.insert(
8928 deferred_stable_to_finalize
.end(),
8929 deferred_stable
.begin(),
8930 deferred_stable
.end());
8931 deferred_stable
.clear();
8933 kv_finalize_cond
.notify_one();
8937 // previously deferred "done" are now "stable" by virtue of this
8939 deferred_stable_queue
.swap(deferred_done
);
8942 dout(10) << __func__
<< " finish" << dendl
;
8943 kv_sync_started
= false;
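
// Editorial note (not in the original source): _kv_sync_thread() above and
// _kv_finalize_thread() below form a two-stage pipeline.  The sync thread
// batches KV submissions and pays for one expensive
// submit_transaction_sync() per cycle; the finalize thread then runs the
// (potentially slow) per-txc state machine callbacks off the critical
// path.  A minimal sketch of the hand-off, using the member names above:
//
//   // producer (sync thread), under kv_finalize_lock:
//   kv_committing_to_finalize.swap(kv_committing);
//   kv_finalize_cond.notify_one();
//
//   // consumer (finalize thread), under the same lock:
//   kv_committed.swap(kv_committing_to_finalize);
//   for (auto txc : kv_committed) _txc_state_proc(txc);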
void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock<std::mutex> l(kv_finalize_lock);
  assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    assert(kv_committed.empty());
    assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
	deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
	break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      while (!kv_committed.empty()) {
	TransContext *txc = kv_committed.front();
	assert(txc->state == TransContext::STATE_KV_SUBMITTED);
	_txc_state_proc(txc);
	kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
	auto p = b->txcs.begin();
	while (p != b->txcs.end()) {
	  TransContext *txc = &*p;
	  p = b->txcs.erase(p); // unlink here because
	  _txc_state_proc(txc); // this may destroy txc
	}
	delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
	if (deferred_queue_size >= deferred_batch_ops.load() ||
	    throttle_deferred_bytes.past_midpoint()) {
	  deferred_try_submit();
	}
      }

      // this is as good a place as any ...
      _reap_collections();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}
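
// Editorial note (not in the original source): the deferred-write plumbing
// below passes one DeferredBatch through four hands:
//   _deferred_queue()         - a txc's writes are staged on osr->deferred_pending
//   _deferred_submit_unlock() - pending becomes deferred_running; aios issued
//   _deferred_aio_finish()    - aio complete; batch pushed to deferred_done_queue
//   _kv_sync_thread()         - after a device flush, done -> stable; keys cleaned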
bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc, OnodeRef o)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
  deferred_lock.lock();
  if (!txc->osr->deferred_pending &&
      !txc->osr->deferred_running) {
    deferred_queue.push_back(*txc->osr);
  }
  if (!txc->osr->deferred_pending) {
    txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
  }
  ++deferred_queue_size;
  txc->osr->deferred_pending->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      txc->osr->deferred_pending->prepare_write(
	cct, wt.seq, e.offset, e.length, p);
    }
  }
  if (deferred_aggressive &&
      !txc->osr->deferred_running) {
    _deferred_submit_unlock(txc->osr.get());
  } else {
    deferred_lock.unlock();
  }
}
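
// Illustrative sketch (an assumption about DeferredBatch internals, not
// text from the original file): a deferred op whose data spans two
// physical extents is staged into the batch keyed by device offset, e.g.
//
//   op.extents = {0x2000~0x1000, 0x9000~0x1000}, op.data = 8 KiB
//   => iomap[0x2000] = first 4 KiB, iomap[0x9000] = last 4 KiB
//
// so that _deferred_submit_unlock() can later merge contiguous entries
// into single aio_write() calls.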
void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
	   << deferred_queue_size << " txcs" << dendl;
  std::lock_guard<std::mutex> l(deferred_lock);
  vector<OpSequencerRef> osrs;
  osrs.reserve(deferred_queue.size());
  for (auto& osr : deferred_queue) {
    osrs.push_back(&osr);
  }
  for (auto& osr : osrs) {
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
	_deferred_submit_unlock(osr.get());
	deferred_lock.lock();
      } else {
	dout(20) << __func__ << "  osr " << osr << " already has running"
		 << dendl;
      }
    } else {
      dout(20) << __func__ << "  osr " << osr << " has no pending" << dendl;
    }
  }
}
void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
	   << " " << osr->deferred_pending->iomap.size() << " ios pending "
	   << dendl;
  assert(osr->deferred_pending);
  assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
	dout(20) << __func__ << " write 0x" << std::hex
		 << start << "~" << bl.length()
		 << " crc " << bl.crc32c(-1) << std::dec << dendl;
	if (!g_conf->bluestore_debug_omit_block_device_write) {
	  logger->inc(l_bluestore_deferred_write_ops);
	  logger->inc(l_bluestore_deferred_write_bytes, bl.length());
	  int r = bdev->aio_write(start, bl, &b->ioc, false);
	  assert(r == 0);
	}
      }
      if (i == b->iomap.end()) {
	break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << "   seq " << i->second.seq << " 0x"
	     << std::hex << pos << "~" << i->second.bl.length() << std::dec
	     << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  deferred_lock.unlock();
  bdev->aio_submit(&b->ioc);
}
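
// Worked example for the merge loop above (illustrative only): with iomap
// entries {0x1000: 4 KiB, 0x2000: 4 KiB, 0x8000: 4 KiB}, the scan appends
// 0x1000 and 0x2000 into one bufferlist (i->first == pos) and issues a
// single 8 KiB aio_write at 0x1000; the gap before 0x8000 breaks the run,
// so 0x8000 goes out as a separate 4 KiB write.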
struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    std::lock_guard<std::mutex> l(deferred_lock);
    assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      auto q = deferred_queue.iterator_to(*osr);
      deferred_queue.erase(q);
    } else if (deferred_aggressive) {
      dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
      deferred_finisher.queue(new C_DeferredTrySubmit(this));
    } else {
      dout(20) << __func__ << " leaving queued, more pending" << dendl;
    }
  }

  {
    uint64_t costs = 0;
    std::lock_guard<std::mutex> l2(osr->qlock);
    for (auto& i : b->txcs) {
      TransContext *txc = &i;
      txc->state = TransContext::STATE_DEFERRED_CLEANUP;
      costs += txc->cost;
    }
    osr->qcond.notify_all();
    throttle_deferred_bytes.put(costs);
    std::lock_guard<std::mutex> l(kv_lock);
    deferred_done_queue.emplace_back(b);
  }

  // in the normal case, do not bother waking up the kv thread; it will
  // catch us on the next commit anyway.
  if (deferred_aggressive) {
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
}
int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  OpSequencerRef osr = new OpSequencer(cct, this);
  int count = 0;
  int r = 0;
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
	     << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    bufferlist::iterator p = bl.begin();
    try {
      ::decode(*deferred_txn, p);
    } catch (buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
	   << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(osr.get());
    txc->deferred_txn = deferred_txn;
    txc->state = TransContext::STATE_KV_DONE;
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_drain_all();
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}
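
// Editorial note (not in the original source): _deferred_replay() runs at
// mount, before the store accepts new transactions.  Every key still in
// PREFIX_DEFERRED ("L") describes a write that committed to the KV store
// but may not have reached its final block location, so each one is
// re-queued through the normal deferred path and the OpSequencer is
// drained before mount proceeds.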
// ---------------------------
// transactions

int BlueStore::queue_transactions(
  Sequencer *posr,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  FUNCTRACE();
  Context *onreadable;
  Context *ondisk;
  Context *onreadable_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &onreadable, &ondisk, &onreadable_sync);

  if (cct->_conf->objectstore_blackhole) {
    dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
	    << dendl;
    delete ondisk;
    delete onreadable;
    delete onreadable_sync;
    return 0;
  }
  utime_t start = ceph_clock_now();
  // set up the sequencer
  OpSequencer *osr;
  assert(posr);
  if (posr->p) {
    osr = static_cast<OpSequencer *>(posr->p.get());
    dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
  } else {
    osr = new OpSequencer(cct, this);
    osr->parent = posr;
    posr->p = osr;
    dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
  }

  // prepare
  TransContext *txc = _txc_create(osr);
  txc->onreadable = onreadable;
  txc->onreadable_sync = onreadable_sync;
  txc->oncommit = ondisk;

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    (*p).set_osr(osr);
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    ::encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);
  if (handle)
    handle->suspend_tp_timeout();

  utime_t tstart = ceph_clock_now();
  throttle_bytes.get(txc->cost);
  if (txc->deferred_txn) {
    // ensure we do not block here because of deferred writes
    if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
      dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
	       << dendl;
      ++deferred_aggressive;
      deferred_try_submit();
      {
	// wake up any previously finished deferred events
	std::lock_guard<std::mutex> l(kv_lock);
	kv_cond.notify_one();
      }
      throttle_deferred_bytes.get(txc->cost);
      --deferred_aggressive;
    }
  }
  utime_t tend = ceph_clock_now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
  logger->tinc(l_bluestore_throttle_lat, tend - tstart);
  return 0;
}
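
// Editorial note (not in the original source): the two throttles above
// implement backpressure.  throttle_bytes is taken unconditionally and may
// block; throttle_deferred_bytes is first tried with get_or_fail(), so a
// full deferred pipeline triggers aggressive submission
// (deferred_aggressive) rather than a silent stall, and only then blocks
// on get().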
void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}
void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction(t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }
  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];
    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
	const coll_t &cid = i.get_cid(op->cid);
	r = _remove_collection(txc, cid, &c);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
	assert(!c);
	const coll_t &cid = i.get_cid(op->cid);
	r = _create_collection(txc, cid, op->split_bits, &c);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
	uint32_t bits = op->split_bits;
	uint32_t rem = op->split_rem;
	r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
	uint32_t type = op->hint_type;
	bufferlist hint;
	i.decode_bl(hint);
	bufferlist::iterator hiter = hint.begin();
	if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
	  uint32_t pg_num;
	  uint64_t num_objs;
	  ::decode(pg_num, hiter);
	  ::decode(num_objs, hiter);
	  dout(10) << __func__ << " collection hint objects is a no-op, "
		   << " pg_num " << pg_num << " num_objects " << num_objs
		   << dendl;
	} else {
	  // Ignore the hint
	  dout(10) << __func__ << " unknown collection hint " << type << dendl;
	}
	continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      assert(0 == "not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
	   << " not handled on operation " << op->op
	   << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction(t, 0);
      assert(0 == "unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
	op->op == Transaction::OP_WRITE ||
	op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    RWLock::WLocker l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
	       << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
	uint64_t off = op->off;
	uint64_t len = op->len;
	uint32_t fadvise_flags = i.get_fadvise_flags();
	bufferlist bl;
	i.decode_bl(bl);
	r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
	uint64_t off = op->off;
	uint64_t len = op->len;
	r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
	// deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
	uint64_t off = op->off;
	r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
	r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
	string name = i.decode_string();
	bufferptr bp;
	i.decode_bp(bp);
	r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
	map<string, bufferptr> aset;
	i.decode_attrset(aset);
	r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
	string name = i.decode_string();
	r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
	r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  const ghobject_t& noid = i.get_oid(op->dest_oid);
	  no = c->get_onode(noid, true);
	}
	r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  const ghobject_t& noid = i.get_oid(op->dest_oid);
	  no = c->get_onode(noid, true);
	}
	uint64_t srcoff = op->off;
	uint64_t len = op->len;
	uint64_t dstoff = op->dest_off;
	r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      assert(0 == "not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      assert(0 == "not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
	assert(op->cid == op->dest_cid);
	const ghobject_t& noid = i.get_oid(op->dest_oid);
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  no = c->get_onode(noid, false);
	}
	r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
	r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
	bufferlist aset_bl;
	i.decode_attrset_bl(&aset_bl);
	r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
	bufferlist keys_bl;
	i.decode_keyset_bl(&keys_bl);
	r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
	string first, last;
	first = i.decode_string();
	last = i.decode_string();
	r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
	bufferlist bl;
	i.decode_bl(bl);
	r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
	r = _set_alloc_hint(txc, c, o,
			    op->expected_object_size,
			    op->expected_write_size,
			    op->alloc_hint_flags);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
			    op->op == Transaction::OP_CLONE ||
			    op->op == Transaction::OP_CLONERANGE2 ||
			    op->op == Transaction::OP_COLL_ADD ||
			    op->op == Transaction::OP_SETATTR ||
			    op->op == Transaction::OP_SETATTRS ||
			    op->op == Transaction::OP_RMATTR ||
			    op->op == Transaction::OP_OMAP_SETKEYS ||
			    op->op == Transaction::OP_OMAP_RMKEYS ||
			    op->op == Transaction::OP_OMAP_RMKEYRANGE ||
			    op->op == Transaction::OP_OMAP_SETHEADER))
	// -ENOENT is usually okay
	ok = true;
      if (r == -ENODATA)
	ok = true;

      if (!ok) {
	const char *msg = "unexpected error code";

	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
			     op->op == Transaction::OP_CLONE ||
			     op->op == Transaction::OP_CLONERANGE2))
	  msg = "ENOENT on clone suggests osd bug";

	if (r == -ENOSPC)
	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
	  // by partially applying transactions.
	  msg = "ENOSPC from bluestore, misconfigured cluster";

	if (r == -ENOTEMPTY) {
	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
	}

	derr << __func__ << " error " << cpp_strerror(r)
	     << " not handled on operation " << op->op
	     << " (op " << pos << ", counting from 0)"
	     << dendl;
	derr << msg << dendl;
	_dump_transaction(t, 0);
	assert(0 == "unexpected error");
      }
    }
  }
}
// -----------------
// write helpers

int BlueStore::_touch(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
{
  if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
    return;
  dout(log_level) << __func__ << " " << o << " " << o->oid
		  << " nid " << o->onode.nid
		  << " size 0x" << std::hex << o->onode.size
		  << " (" << std::dec << o->onode.size << ")"
		  << " expected_object_size " << o->onode.expected_object_size
		  << " expected_write_size " << o->onode.expected_write_size
		  << " in " << o->onode.extent_map_shards.size() << " shards"
		  << ", " << o->extent_map.spanning_blob_map.size()
		  << " spanning blobs"
		  << dendl;
  for (auto p = o->onode.attrs.begin();
       p != o->onode.attrs.end();
       ++p) {
    dout(log_level) << __func__ << "  attr " << p->first
		    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map(o->extent_map, log_level);
}
void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(log_level) << __func__ << "  shard " << *s.shard_info
		    << (s.loaded ? " (loaded)" : "")
		    << (s.dirty ? " (dirty)" : "")
		    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(log_level) << __func__ << "  " << e << dendl;
    assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
	v.push_back(blob.get_csum_item(i));
      dout(log_level) << __func__ << "      csum: " << std::hex << v << std::dec
		      << dendl;
    }
    std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(log_level) << __func__ << "       0x" << std::hex << i.first
		      << "~" << i.second->length << std::dec
		      << " " << *i.second << dendl;
    }
  }
}
void BlueStore::_dump_transaction(Transaction *t, int log_level)
{
  dout(log_level) << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
	   << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = MIN(chunk_size - front_pad, length);
    bufferptr z = buffer::create_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->copy(0, front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->copy(length - back_copy, back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
	   << back_pad << " on front/back, now 0x" << *offset << "~"
	   << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  assert(bl->length() == length);
}
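
// Worked example for _pad_zeros() (illustrative only): chunk_size=0x1000,
// *offset=0x1800, length=0x400.  front_pad = 0x1800 % 0x1000 = 0x800, so
// the data is shifted into a zeroed chunk; the 0x400 of data plus the
// front pad still fit inside one chunk, so the same chunk is also padded
// at the back with 0x400 zeros.  Result: *offset becomes 0x1000 and the
// buffer is one full chunk-aligned extent 0x1000~0x1000, with 0xc00 pad
// bytes accounted to l_bluestore_write_pad_bytes.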
void BlueStore::_do_write_small(
  TransContext *txc,
  CollectionRef &c,
  OnodeRef o,
  uint64_t offset, uint64_t length,
  bufferlist::iterator& blp,
  WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << std::dec << dendl;
  assert(length < min_alloc_size);
  uint64_t end_offs = offset + length;

  logger->inc(l_bluestore_write_small);
  logger->inc(l_bluestore_write_small_bytes, length);

  bufferlist bl;
  blp.copy(length, bl);

  // Look for an existing mutable blob we can use.
  auto begin = o->extent_map.extent_map.begin();
  auto end = o->extent_map.extent_map.end();
  auto ep = o->extent_map.seek_lextent(offset);
  if (ep != begin) {
    --ep;
    if (ep->blob_end() <= offset) {
      ++ep;
    }
  }
  auto prev_ep = ep;
  if (prev_ep != begin) {
    --prev_ep;
  } else {
    prev_ep = end; // to avoid this extent check as it's a duplicate
  }

  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
  uint32_t alloc_len = min_alloc_size;
  auto offset0 = P2ALIGN(offset, alloc_len);

  bool any_change;

  // search suitable extent in both forward and reverse direction in
  // [offset - target_max_blob_size, offset + target_max_blob_size] range
  // then check if blob can be reused via can_reuse_blob func or apply
  // direct/deferred write (the latter for extents including or higher
  // than 'offset' only).
  do {
    any_change = false;

    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      auto bstart = ep->blob_start();
      dout(20) << __func__ << " considering " << *b
	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
	dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
	dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
		 ep->blob_offset % min_alloc_size) {
	dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
	uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
	// can we pad our head/tail out with zeros?
	uint64_t head_pad, tail_pad;
	head_pad = P2PHASE(offset, chunk_size);
	tail_pad = P2NPHASE(end_offs, chunk_size);
	if (head_pad || tail_pad) {
	  o->extent_map.fault_range(db, offset - head_pad,
				    end_offs - offset + head_pad + tail_pad);
	}
	if (head_pad &&
	    o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
	  head_pad = 0;
	}
	if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
	  tail_pad = 0;
	}

	uint64_t b_off = offset - head_pad - bstart;
	uint64_t b_len = length + head_pad + tail_pad;

	// direct write into unused blocks of an existing mutable blob?
	if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
	    b->get_blob().get_ondisk_length() >= b_off + b_len &&
	    b->get_blob().is_unused(b_off, b_len) &&
	    b->get_blob().is_allocated(b_off, b_len)) {
	  _apply_padding(head_pad, tail_pad, bl);

	  dout(20) << __func__ << "  write to unused 0x" << std::hex
		   << b_off << "~" << b_len
		   << " pad 0x" << head_pad << " + 0x" << tail_pad
		   << std::dec << " of mutable " << *b << dendl;
	  _buffer_cache_write(txc, b, b_off, bl,
			      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

	  if (!g_conf->bluestore_debug_omit_block_device_write) {
	    if (b_len <= prefer_deferred_size) {
	      dout(20) << __func__ << " deferring small 0x" << std::hex
		       << b_len << std::dec << " unused write via deferred" << dendl;
	      bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
	      op->op = bluestore_deferred_op_t::OP_WRITE;
	      int r = b->get_blob().map(
		b_off, b_len,
		[&](uint64_t offset, uint64_t length) {
		  op->extents.emplace_back(bluestore_pextent_t(offset, length));
		  return 0;
		});
	      assert(r == 0);
	      op->data = bl;
	    } else {
	      b->get_blob().map_bl(
		b_off, bl,
		[&](uint64_t offset, bufferlist& t) {
		  bdev->aio_write(offset, t,
				  &txc->ioc, wctx->buffered);
		});
	    }
	  }
	  b->dirty_blob().calc_csum(b_off, bl);
	  dout(20) << __func__ << "  lex old " << *ep << dendl;
	  Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
						 b,
						 &wctx->old_extents);
	  b->dirty_blob().mark_used(le->blob_offset, le->length);
	  txc->statfs_delta.stored() += le->length;
	  dout(20) << __func__ << "  lex " << *le << dendl;
	  logger->inc(l_bluestore_write_small_unused);
	  return;
	}
	// read some data to fill out the chunk?
	uint64_t head_read = P2PHASE(b_off, chunk_size);
	uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
	if ((head_read || tail_read) &&
	    (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
	    head_read + tail_read < min_alloc_size) {
	  b_off -= head_read;
	  b_len += head_read + tail_read;
	} else {
	  head_read = tail_read = 0;
	}

	// chunk-aligned deferred overwrite?
	if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
	    b_off % chunk_size == 0 &&
	    b_len % chunk_size == 0 &&
	    b->get_blob().is_allocated(b_off, b_len)) {

	  _apply_padding(head_pad, tail_pad, bl);

	  dout(20) << __func__ << "  reading head 0x" << std::hex << head_read
		   << " and tail 0x" << tail_read << std::dec << dendl;
	  if (head_read) {
	    bufferlist head_bl;
	    int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
			     head_bl, 0);
	    assert(r >= 0 && r <= (int)head_read);
	    size_t zlen = head_read - r;
	    if (zlen) {
	      head_bl.append_zero(zlen);
	      logger->inc(l_bluestore_write_pad_bytes, zlen);
	    }
	    bl.claim_prepend(head_bl);
	    logger->inc(l_bluestore_write_penalty_read_ops);
	  }
	  if (tail_read) {
	    bufferlist tail_bl;
	    int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
			     tail_bl, 0);
	    assert(r >= 0 && r <= (int)tail_read);
	    size_t zlen = tail_read - r;
	    if (zlen) {
	      tail_bl.append_zero(zlen);
	      logger->inc(l_bluestore_write_pad_bytes, zlen);
	    }
	    bl.claim_append(tail_bl);
	    logger->inc(l_bluestore_write_penalty_read_ops);
	  }
	  logger->inc(l_bluestore_write_small_pre_read);

	  bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
	  op->op = bluestore_deferred_op_t::OP_WRITE;
	  _buffer_cache_write(txc, b, b_off, bl,
			      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

	  int r = b->get_blob().map(
	    b_off, b_len,
	    [&](uint64_t offset, uint64_t length) {
	      op->extents.emplace_back(bluestore_pextent_t(offset, length));
	      return 0;
	    });
	  assert(r == 0);
	  if (b->get_blob().csum_type) {
	    b->dirty_blob().calc_csum(b_off, bl);
	  }
	  op->data.claim(bl);
	  dout(20) << __func__ << "  deferred write 0x" << std::hex << b_off << "~"
		   << b_len << std::dec << " of mutable " << *b
		   << " at " << op->extents << dendl;
	  Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
						 b, &wctx->old_extents);
	  b->dirty_blob().mark_used(le->blob_offset, le->length);
	  txc->statfs_delta.stored() += le->length;
	  dout(20) << __func__ << "  lex " << *le << dendl;
	  logger->inc(l_bluestore_write_small_deferred);
	  return;
	}
	// try to reuse blob if we can
	if (b->can_reuse_blob(min_alloc_size,
			      max_bsize,
			      offset0 - bstart,
			      &alloc_len)) {
	  assert(alloc_len == min_alloc_size); // expecting data always
					       // fit into reused blob
	  // Need to check for pending writes desiring to
	  // reuse the same pextent. The rationale is that during GC two chunks
	  // from garbage blobs(compressed?) can share logical space within the same
	  // AU. That's in turn might be caused by unaligned len in clone_range2.
	  // Hence the second write will fail in an attempt to reuse blob at
	  // do_alloc_write().
	  if (!wctx->has_conflict(b,
				  offset0,
				  offset0 + alloc_len,
				  min_alloc_size)) {

	    // we can't reuse pad_head/pad_tail since they might be truncated
	    // due to existent extents
	    uint64_t b_off = offset - bstart;
	    uint64_t b_off0 = b_off;
	    _pad_zeros(&bl, &b_off0, chunk_size);

	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off0 << "~" << bl.length() << ")"
		     << " (0x" << b_off << "~" << length << ")"
		     << std::dec << dendl;

	    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
	    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
			false, false);
	    logger->inc(l_bluestore_write_small_unused);
	    return;
	  }
	}
      }
      ++ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)

    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
			    max_bsize,
			    offset0 - bstart,
			    &alloc_len)) {
	assert(alloc_len == min_alloc_size); // expecting data always
					     // fit into reused blob
	// Need to check for pending writes desiring to
	// reuse the same pextent. The rationale is that during GC two chunks
	// from garbage blobs(compressed?) can share logical space within the same
	// AU. That's in turn might be caused by unaligned len in clone_range2.
	// Hence the second write will fail in an attempt to reuse blob at
	// do_alloc_write().
	if (!wctx->has_conflict(b,
				offset0,
				offset0 + alloc_len,
				min_alloc_size)) {

	  uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
	  uint64_t b_off = offset - bstart;
	  uint64_t b_off0 = b_off;
	  _pad_zeros(&bl, &b_off0, chunk_size);

	  dout(20) << __func__ << " reuse blob " << *b << std::hex
		   << " (0x" << b_off0 << "~" << bl.length() << ")"
		   << " (0x" << b_off << "~" << length << ")"
		   << std::dec << dendl;

	  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
	  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
		      false, false);
	  logger->inc(l_bluestore_write_small_unused);
	  return;
	}
      }
      if (prev_ep != begin) {
	--prev_ep;
	any_change = true;
      } else {
	prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);

  // new blob.
  BlobRef b = c->new_blob();
  uint64_t b_off = P2PHASE(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
  logger->inc(l_bluestore_write_small_new);

  return;
}
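
// Editorial note (not in the original source): decision order in
// _do_write_small(), in descending preference:
//   1. direct write into never-used, already-allocated blocks of a
//      mutable blob (no read, optionally deferred);
//   2. chunk-aligned deferred overwrite, padding/reading head and tail as
//      needed (read-modify-write, ordered through the deferred queue);
//   3. reuse of a nearby mutable blob via can_reuse_blob();
//   4. fall through to a brand new min_alloc_size blob.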
void BlueStore::_do_write_big(
  TransContext *txc,
  CollectionRef &c,
  OnodeRef o,
  uint64_t offset, uint64_t length,
  bufferlist::iterator& blp,
  WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << " target_blob_size 0x" << wctx->target_blob_size << std::dec
	   << " compress " << (int)wctx->compress
	   << dendl;
  logger->inc(l_bluestore_write_big);
  logger->inc(l_bluestore_write_big_bytes, length);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  while (length > 0) {
    bool new_blob = false;
    uint32_t l = MIN(max_bsize, length);
    BlobRef b;
    uint32_t b_off = 0;

    //attempting to reuse existing blob
    if (!wctx->compress) {
      // look for an existing mutable blob we can reuse
      auto begin = o->extent_map.extent_map.begin();
      auto end = o->extent_map.extent_map.end();
      auto ep = o->extent_map.seek_lextent(offset);
      auto prev_ep = ep;
      if (prev_ep != begin) {
	--prev_ep;
      } else {
	prev_ep = end; // to avoid this extent check as it's a duplicate
      }
      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
      // search suitable extent in both forward and reverse direction in
      // [offset - target_max_blob_size, offset + target_max_blob_size] range
      // then check if blob can be reused via can_reuse_blob func.
      bool any_change;
      do {
	any_change = false;
	if (ep != end && ep->logical_offset < offset + max_bsize) {
	  if (offset >= ep->blob_start() &&
	      ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
				       offset - ep->blob_start(),
				       &l)) {
	    b = ep->blob;
	    b_off = offset - ep->blob_start();
	    prev_ep = end; // to avoid check below
	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
	  } else {
	    ++ep;
	    any_change = true;
	  }
	}

	if (prev_ep != end && prev_ep->logical_offset >= min_off) {
	  if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
					    offset - prev_ep->blob_start(),
					    &l)) {
	    b = prev_ep->blob;
	    b_off = offset - prev_ep->blob_start();
	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
	  } else if (prev_ep != begin) {
	    --prev_ep;
	    any_change = true;
	  } else {
	    prev_ep = end; // to avoid useless first extent re-check
	  }
	}
      } while (b == nullptr && any_change);
    }
    if (b == nullptr) {
      b = c->new_blob();
      b_off = 0;
      new_blob = true;
    }

    bufferlist t;
    blp.copy(l, t);
    wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
    offset += l;
    length -= l;
    logger->inc(l_bluestore_write_big_blobs);
  }
}
int BlueStore::_do_alloc_write(
  TransContext *txc,
  CollectionRef coll,
  OnodeRef o,
  WriteContext *wctx)
{
  dout(20) << __func__ << " txc " << txc
	   << " " << wctx->writes.size() << " blobs"
	   << dendl;
  if (wctx->writes.empty()) {
    return 0;
  }

  CompressorRef c;
  double crr = 0;
  if (wctx->compress) {
    c = select_option(
      "compression_algorithm",
      compressor,
      [&]() {
	string val;
	if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
	  CompressorRef cp = compressor;
	  if (!cp || cp->get_type_name() != val) {
	    cp = Compressor::create(cct, val);
	  }
	  return boost::optional<CompressorRef>(cp);
	}
	return boost::optional<CompressorRef>();
      });

    crr = select_option(
      "compression_required_ratio",
      cct->_conf->bluestore_compression_required_ratio,
      [&]() {
	double val;
	if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
	  return boost::optional<double>(val);
	}
	return boost::optional<double>();
      });
  }

  // checksum
  int csum = csum_type.load();
  csum = select_option(
    "csum_type",
    csum,
    [&]() {
      int val;
      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
	return boost::optional<int>(val);
      }
      return boost::optional<int>();
    });

  // compress (as needed) and calc needed space
  uint64_t need = 0;
  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  for (auto& wi : wctx->writes) {
    if (c && wi.blob_length > min_alloc_size) {
      utime_t start = ceph_clock_now();

      // compress
      assert(wi.b_off == 0);
      assert(wi.blob_length == wi.bl.length());

      // FIXME: memory alignment here is bad
      bufferlist t;
      int r = c->compress(wi.bl, t);
      uint64_t want_len_raw = wi.blob_length * crr;
      uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
      bool rejected = false;
      uint64_t compressed_len = t.length();
      // do an approximate (fast) estimation for resulting blob size
      // that doesn't take header overhead into account
      uint64_t result_len = P2ROUNDUP(compressed_len, min_alloc_size);
      if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
	bluestore_compression_header_t chdr;
	chdr.type = c->get_type();
	chdr.length = t.length();
	encode(chdr, wi.compressed_bl);
	wi.compressed_bl.claim_append(t);

	compressed_len = wi.compressed_bl.length();
	result_len = P2ROUNDUP(compressed_len, min_alloc_size);
	if (result_len <= want_len && result_len < wi.blob_length) {
	  // Cool. We compressed at least as much as we were hoping to.
	  // pad out to min_alloc_size
	  wi.compressed_bl.append_zero(result_len - compressed_len);
	  wi.compressed_len = compressed_len;
	  wi.compressed = true;
	  logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
	  dout(20) << __func__ << std::hex << "  compressed 0x" << wi.blob_length
		   << " -> 0x" << compressed_len << " => 0x" << result_len
		   << " with " << c->get_type()
		   << std::dec << dendl;
	  txc->statfs_delta.compressed() += compressed_len;
	  txc->statfs_delta.compressed_original() += wi.blob_length;
	  txc->statfs_delta.compressed_allocated() += result_len;
	  logger->inc(l_bluestore_compress_success_count);
	  need += result_len;
	} else {
	  rejected = true;
	}
      } else if (r != 0) {
	dout(5) << __func__ << std::hex << "  0x" << wi.blob_length
		<< " bytes compressed using " << c->get_type_name()
		<< std::dec
		<< " failed with errcode = " << r
		<< ", leaving uncompressed"
		<< dendl;
	logger->inc(l_bluestore_compress_rejected_count);
	need += wi.blob_length;
      } else {
	rejected = true;
      }

      if (rejected) {
	dout(20) << __func__ << std::hex << "  0x" << wi.blob_length
		 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
		 << " with " << c->get_type()
		 << ", which is more than required 0x" << want_len_raw
		 << " -> 0x" << want_len
		 << ", leaving uncompressed"
		 << std::dec << dendl;
	logger->inc(l_bluestore_compress_rejected_count);
	need += wi.blob_length;
      }
      logger->tinc(l_bluestore_compress_lat,
		   ceph_clock_now() - start);
    } else {
      need += wi.blob_length;
    }
  }
  PExtentVector prealloc;
  prealloc.reserve(2 * wctx->writes.size());
  int prealloc_left = 0;
  prealloc_left = alloc->allocate(
    need, min_alloc_size, need,
    0, &prealloc);
  if (prealloc_left < 0) {
    derr << __func__ << " failed to allocate 0x" << std::hex << need << std::dec
	 << dendl;
    return -ENOSPC;
  }
  assert(prealloc_left == (int64_t)need);

  dout(20) << __func__ << " prealloc " << prealloc << dendl;
  auto prealloc_pos = prealloc.begin();

  for (auto& wi : wctx->writes) {
    BlobRef b = wi.b;
    bluestore_blob_t& dblob = b->dirty_blob();
    uint64_t b_off = wi.b_off;
    bufferlist *l = &wi.bl;
    uint64_t final_length = wi.blob_length;
    uint64_t csum_length = wi.blob_length;
    unsigned csum_order = block_size_order;
    if (wi.compressed) {
      final_length = wi.compressed_bl.length();
      csum_length = final_length;
      csum_order = ctz(csum_length);
      l = &wi.compressed_bl;
      dblob.set_compressed(wi.blob_length, wi.compressed_len);
    } else if (wi.new_blob) {
      // initialize newly created blob only
      assert(dblob.is_mutable());
      if (l->length() != wi.blob_length) {
	// hrm, maybe we could do better here, but let's not bother.
	dout(20) << __func__ << " forcing csum_order to block_size_order "
		 << block_size_order << dendl;
	csum_order = block_size_order;
      } else {
	csum_order = std::min(wctx->csum_order, ctz(l->length()));
      }
      // try to align blob with max_blob_size to improve
      // its reuse ratio, e.g. in case of reverse write
      uint32_t suggested_boff =
	(wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
      if ((suggested_boff % (1 << csum_order)) == 0 &&
	  suggested_boff + final_length <= max_bsize &&
	  suggested_boff > b_off) {
	dout(20) << __func__ << " forcing blob_offset to 0x"
		 << std::hex << suggested_boff << std::dec << dendl;
	assert(suggested_boff >= b_off);
	csum_length += suggested_boff - b_off;
	b_off = suggested_boff;
      }
      if (csum != Checksummer::CSUM_NONE) {
	dout(20) << __func__ << " initialize csum setting for new blob " << *b
		 << " csum_type " << Checksummer::get_csum_type_string(csum)
		 << " csum_order " << csum_order
		 << " csum_length 0x" << std::hex << csum_length << std::dec
		 << dendl;
	dblob.init_csum(csum, csum_order, csum_length);
      }
    }

    PExtentVector extents;
    int64_t left = final_length;
    while (left > 0) {
      assert(prealloc_left > 0);
      if (prealloc_pos->length <= left) {
	prealloc_left -= prealloc_pos->length;
	left -= prealloc_pos->length;
	txc->statfs_delta.allocated() += prealloc_pos->length;
	extents.push_back(*prealloc_pos);
	++prealloc_pos;
      } else {
	extents.emplace_back(prealloc_pos->offset, left);
	prealloc_pos->offset += left;
	prealloc_pos->length -= left;
	prealloc_left -= left;
	txc->statfs_delta.allocated() += left;
	left = 0;
	break;
      }
    }
    for (auto& p : extents) {
      txc->allocated.insert(p.offset, p.length);
    }
    dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);

    dout(20) << __func__ << " blob " << *b << dendl;
    if (dblob.has_csum()) {
      dblob.calc_csum(b_off, *l);
    }

    if (wi.mark_unused) {
      auto b_end = b_off + wi.bl.length();
      if (b_off) {
	dblob.add_unused(0, b_off);
      }
      if (b_end < wi.blob_length) {
	dblob.add_unused(b_end, wi.blob_length - b_end);
      }
    }

    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
					   b_off + (wi.b_off0 - wi.b_off),
					   wi.length0,
					   wi.b,
					   nullptr);
    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
    txc->statfs_delta.stored() += le->length;
    dout(20) << __func__ << "  lex " << *le << dendl;
    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
			wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

    // queue io
    if (!g_conf->bluestore_debug_omit_block_device_write) {
      if (l->length() <= prefer_deferred_size.load()) {
	dout(20) << __func__ << " deferring small 0x" << std::hex
		 << l->length() << std::dec << " write via deferred" << dendl;
	bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
	op->op = bluestore_deferred_op_t::OP_WRITE;
	int r = b->get_blob().map(
	  b_off, l->length(),
	  [&](uint64_t offset, uint64_t length) {
	    op->extents.emplace_back(bluestore_pextent_t(offset, length));
	    return 0;
	  });
	assert(r == 0);
	op->data = *l;
      } else {
	b->get_blob().map_bl(
	  b_off, *l,
	  [&](uint64_t offset, bufferlist& t) {
	    bdev->aio_write(offset, t, &txc->ioc, false);
	  });
      }
    }
  }
  assert(prealloc_pos == prealloc.end());
  assert(prealloc_left == 0);
  return 0;
}
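
// Worked example for the compression gate above (illustrative numbers,
// not from the original file): min_alloc_size=0x10000 (64 KiB),
// blob_length=0x40000 (256 KiB), and
// bluestore_compression_required_ratio=0.875 give
//   want_len = P2ROUNDUP(0x40000 * 0.875, 0x10000) = 0x38000.
// A 150 KiB compressed result rounds up to result_len=0x30000, which is
// <= want_len and < blob_length, so it is accepted and padded to 0x30000;
// a 230 KiB result rounds to 0x40000, exceeds want_len, and is rejected
// as not worth it.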
void BlueStore::_wctx_finish(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  WriteContext *wctx,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  auto oep = wctx->old_extents.begin();
  while (oep != wctx->old_extents.end()) {
    auto &lo = *oep;
    oep = wctx->old_extents.erase(oep);
    dout(20) << __func__ << " lex_old " << lo.e << dendl;
    BlobRef b = lo.e.blob;
    const bluestore_blob_t& blob = b->get_blob();
    if (blob.is_compressed()) {
      if (lo.blob_empty) {
	txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
      }
      txc->statfs_delta.compressed_original() -= lo.e.length;
    }
    auto& r = lo.r;
    txc->statfs_delta.stored() -= lo.e.length;
    if (!r.empty()) {
      dout(20) << __func__ << "  blob release " << r << dendl;
      if (blob.is_shared()) {
	PExtentVector final;
	c->load_shared_blob(b->shared_blob);
	for (auto e : r) {
	  b->shared_blob->put_ref(
	    e.offset, e.length, &final,
	    b->is_referenced() ? nullptr : maybe_unshared_blobs);
	}
	dout(20) << __func__ << "  shared_blob release " << final
		 << " from " << *b->shared_blob << dendl;
	txc->write_shared_blob(b->shared_blob);
	r.clear();
	r.swap(final);
      }
    }
    // we can't invalidate our logical extents as we drop them because
    // other lextents (either in our onode or others) may still
    // reference them.  but we can throw out anything that is no
    // longer allocated.  Note that this will leave behind edge bits
    // that are no longer referenced but not deallocated (until they
    // age out of the cache naturally).
    b->discard_unallocated(c.get());
    for (auto e : r) {
      dout(20) << __func__ << "  release " << e << dendl;
      txc->released.insert(e.offset, e.length);
      txc->statfs_delta.allocated() -= e.length;
      if (blob.is_compressed()) {
	txc->statfs_delta.compressed_allocated() -= e.length;
      }
    }

    if (b->is_spanning() && !b->is_referenced()) {
      dout(20) << __func__ << "  spanning_blob_map removing empty " << *b
	       << dendl;
      o->extent_map.spanning_blob_map.erase(b->id);
    }
  }
}
void BlueStore::_do_write_data(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  WriteContext *wctx)
{
  uint64_t end = offset + length;
  bufferlist::iterator p = bl.begin();

  if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
      (length != min_alloc_size)) {
    // we fall within the same block
    _do_write_small(txc, c, o, offset, length, p, wctx);
  } else {
    uint64_t head_offset, head_length;
    uint64_t middle_offset, middle_length;
    uint64_t tail_offset, tail_length;

    head_offset = offset;
    head_length = P2NPHASE(offset, min_alloc_size);

    tail_offset = P2ALIGN(end, min_alloc_size);
    tail_length = P2PHASE(end, min_alloc_size);

    middle_offset = head_offset + head_length;
    middle_length = length - head_length - tail_length;

    if (head_length) {
      _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
    }

    if (middle_length) {
      _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
    }

    if (tail_length) {
      _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
    }
  }
}
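
// Worked example for the head/middle/tail split above (illustrative
// only): with min_alloc_size=0x10000, a write 0x1c000~0x29000 (end
// 0x45000) splits into
//   head   = 0x1c000~0x4000  (up to the 0x20000 AU boundary) -> small path
//   middle = 0x20000~0x20000 (two whole AUs)                 -> big path
//   tail   = 0x40000~0x5000                                  -> small path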
void BlueStore::_choose_write_options(
  CollectionRef& c,
  OnodeRef o,
  uint32_t fadvise_flags,
  WriteContext *wctx)
{
  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered write" << dendl;
    wctx->buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_write &&
	     (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
			       CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered write" << dendl;
    wctx->buffered = true;
  }

  // apply basic csum block size
  wctx->csum_order = block_size_order;

  // compression parameters
  unsigned alloc_hints = o->onode.alloc_hint_flags;
  auto cm = select_option(
    "compression_mode",
    comp_mode.load(),
    [&]() {
      string val;
      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
	return boost::optional<Compressor::CompressionMode>(
	  Compressor::get_comp_mode_type(val));
      }
      return boost::optional<Compressor::CompressionMode>();
    });

  wctx->compress = (cm != Compressor::COMP_NONE) &&
    ((cm == Compressor::COMP_FORCE) ||
     (cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));

  if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
      (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
		      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {

    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;

    if (o->onode.expected_write_size) {
      wctx->csum_order = std::max(min_alloc_size_order,
				  (uint8_t)ctz(o->onode.expected_write_size));
    } else {
      wctx->csum_order = min_alloc_size_order;
    }

    if (wctx->compress) {
      wctx->target_blob_size = select_option(
	"compression_max_blob_size",
	comp_max_blob_size.load(),
	[&]() {
	  int val;
	  if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
	    return boost::optional<uint64_t>((uint64_t)val);
	  }
	  return boost::optional<uint64_t>();
	});
    }
  } else {
    if (wctx->compress) {
      wctx->target_blob_size = select_option(
	"compression_min_blob_size",
	comp_min_blob_size.load(),
	[&]() {
	  int val;
	  if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
	    return boost::optional<uint64_t>((uint64_t)val);
	  }
	  return boost::optional<uint64_t>();
	});
    }
  }

  uint64_t max_bsize = max_blob_size.load();
  if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
    wctx->target_blob_size = max_bsize;
  }

  // set the min blob size floor at 2x the min_alloc_size, or else we
  // won't be able to allocate a smaller extent for the compressed
  // data.
  if (wctx->compress &&
      wctx->target_blob_size < min_alloc_size * 2) {
    wctx->target_blob_size = min_alloc_size * 2;
  }

  dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
	   << " target_blob_size 0x" << std::hex << wctx->target_blob_size
	   << " compress=" << (int)wctx->compress
	   << " buffered=" << (int)wctx->buffered
	   << std::dec << dendl;
}
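
// Editorial note (not in the original source): the resulting policy is
//   COMP_NONE       - never compress
//   COMP_PASSIVE    - compress only if the client hinted COMPRESSIBLE
//   COMP_AGGRESSIVE - compress unless the client hinted INCOMPRESSIBLE
//   COMP_FORCE      - always compress
// with target_blob_size clamped to max_blob_size and floored at
// 2 * min_alloc_size when compressing.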
int BlueStore::_do_gc(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  const GarbageCollector& gc,
  const WriteContext& wctx,
  uint64_t *dirty_start,
  uint64_t *dirty_end)
{
  auto& extents_to_collect = gc.get_extents_to_collect();

  bool dirty_range_updated = false;
  WriteContext wctx_gc;
  wctx_gc.fork(wctx); // make a clone for garbage collection

  for (auto it = extents_to_collect.begin();
       it != extents_to_collect.end();
       ++it) {
    bufferlist bl;
    int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
    assert(r == (int)it->length);

    o->extent_map.fault_range(db, it->offset, it->length);
    _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
    logger->inc(l_bluestore_gc_merged, it->length);

    if (*dirty_start > it->offset) {
      *dirty_start = it->offset;
      dirty_range_updated = true;
    }

    if (*dirty_end < it->offset + it->length) {
      *dirty_end = it->offset + it->length;
      dirty_range_updated = true;
    }
  }
  if (dirty_range_updated) {
    o->extent_map.fault_range(db, *dirty_start, *dirty_end);
  }

  dout(30) << __func__ << " alloc write" << dendl;
  int r = _do_alloc_write(txc, c, o, &wctx_gc);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
	 << dendl;
    return r;
  }

  _wctx_finish(txc, c, o, &wctx_gc);
  return 0;
}
int BlueStore::_do_write(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  uint32_t fadvise_flags)
{
  int r = 0;

  dout(20) << __func__
	   << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length
	   << " - have 0x" << o->onode.size
	   << " (" << std::dec << o->onode.size << ")"
	   << " bytes"
	   << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
	   << dendl;
  _dump_onode(o, 30);

  if (length == 0) {
    return 0;
  }

  uint64_t end = offset + length;

  GarbageCollector gc(c->store->cct);
  int64_t benefit;
  auto dirty_start = offset;
  auto dirty_end = end;

  WriteContext wctx;
  _choose_write_options(c, o, fadvise_flags, &wctx);
  o->extent_map.fault_range(db, offset, length);
  _do_write_data(txc, c, o, offset, length, bl, &wctx);
  r = _do_alloc_write(txc, c, o, &wctx);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
	 << dendl;
    goto out;
  }

  // NB: _wctx_finish() will empty old_extents
  // so we must do gc estimation before that
  benefit = gc.estimate(offset,
			length,
			o->extent_map,
			wctx.old_extents,
			min_alloc_size);

  _wctx_finish(txc, c, o, &wctx);
  if (end > o->onode.size) {
    dout(20) << __func__ << " extending size to 0x" << std::hex << end
	     << std::dec << dendl;
    o->onode.size = end;
  }

  if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
    if (!gc.get_extents_to_collect().empty()) {
      dout(20) << __func__ << " perform garbage collection, "
	       << "expected benefit = " << benefit << " AUs" << dendl;
      r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
      if (r < 0) {
	derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
	     << dendl;
	goto out;
      }
      dout(20) << __func__ << " gc range is " << std::hex << dirty_start
	       << "~" << dirty_end - dirty_start << std::dec << dendl;
    }
  }

  o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
  o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);

  r = 0;

 out:
  return r;
}
int BlueStore::_write(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef& o,
		      uint64_t offset, size_t length,
		      bufferlist& bl,
		      uint32_t fadvise_flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_zero(TransContext *txc,
		     CollectionRef& c,
		     OnodeRef& o,
		     uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_zero(txc, c, o, offset, length);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_do_zero(TransContext *txc,
			CollectionRef& c,
			OnodeRef& o,
			uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;

  _dump_onode(o, 30);

  WriteContext wctx;
  o->extent_map.fault_range(db, offset, length);
  o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
  o->extent_map.dirty_range(offset, length);
  _wctx_finish(txc, c, o, &wctx);

  if (length > 0 && offset + length > o->onode.size) {
    o->onode.size = offset + length;
    dout(20) << __func__ << " extending size to " << offset + length
	     << dendl;
  }
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
void BlueStore::_do_truncate(
  TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << std::dec << dendl;

  _dump_onode(o, 30);

  if (offset == o->onode.size)
    return;

  if (offset < o->onode.size) {
    WriteContext wctx;
    uint64_t length = o->onode.size - offset;
    o->extent_map.fault_range(db, offset, length);
    o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
    o->extent_map.dirty_range(offset, length);
    _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);

    // if we have shards past EOF, ask for a reshard
    if (!o->onode.extent_map_shards.empty() &&
	o->onode.extent_map_shards.back().offset >= offset) {
      dout(10) << __func__ << "  request reshard past EOF" << dendl;
      if (offset) {
	o->extent_map.request_reshard(offset - 1, offset + length);
      } else {
	o->extent_map.request_reshard(0, length);
      }
    }
  }

  o->onode.size = offset;

  txc->write_onode(o);
}
int BlueStore::_truncate(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << dendl;
  int r = 0;
  if (offset >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _do_truncate(txc, c, o, offset);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << " = " << r << dendl;
  return r;
}
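// Removing a clone (generation) object may leave shared blobs that are
// now referenced only by the head.  The pass below rebuilds the
// expected ref_map for each candidate SharedBlob from the head's extent
// map; when it matches the persistent ref_map exactly, all remaining
// references belong to the head, so the blob can drop FLAG_SHARED and
// its PREFIX_SHARED_BLOB record.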
int BlueStore::_do_remove(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o)
{
  set<SharedBlob*> maybe_unshared_blobs;
  bool is_gen = !o->oid.is_no_gen();
  _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o->onode.nid);
  }
  o->exists = false;
  string key;
  for (auto &s : o->extent_map.shards) {
    dout(20) << __func__ << " removing shard 0x" << std::hex
             << s.shard_info->offset << std::dec << dendl;
    generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
      [&](const string& final_key) {
        txc->t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }
  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
  txc->note_removed_object(o);
  o->extent_map.clear();
  o->onode = bluestore_onode_t();
  _debug_obj_on_delete(o->oid);

  if (!is_gen || maybe_unshared_blobs.empty()) {
    return 0;
  }

  // see if we can unshare blobs still referenced by the head
  dout(10) << __func__ << " gen and maybe_unshared_blobs "
           << maybe_unshared_blobs << dendl;
  ghobject_t nogen = o->oid;
  nogen.generation = ghobject_t::NO_GEN;
  OnodeRef h = c->onode_map.lookup(nogen);

  if (!h || !h->exists) {
    return 0;
  }

  dout(20) << __func__ << " checking for unshareable blobs on " << h
           << " " << h->oid << dendl;
  map<SharedBlob*,bluestore_extent_ref_map_t> expect;
  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        sb->loaded &&
        maybe_unshared_blobs.count(sb)) {
      if (b.is_compressed()) {
        expect[sb].get(0, b.get_ondisk_length());
      } else {
        b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
            expect[sb].get(off, len);
            return 0;
          });
      }
    }
  }

  vector<SharedBlob*> unshared_blobs;
  unshared_blobs.reserve(maybe_unshared_blobs.size());
  for (auto& p : expect) {
    dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
    if (p.first->persistent->ref_map == p.second) {
      SharedBlob *sb = p.first;
      dout(20) << __func__ << " unsharing " << *sb << dendl;
      unshared_blobs.push_back(sb);
      txc->unshare_blob(sb);
      uint64_t sbid = c->make_blob_unshared(sb);
      get_shared_blob_key(sbid, &key);
      txc->t->rmkey(PREFIX_SHARED_BLOB, key);
    }
  }

  if (unshared_blobs.empty()) {
    return 0;
  }

  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
        std::find(unshared_blobs.begin(), unshared_blobs.end(),
                  sb) != unshared_blobs.end()) {
      dout(20) << __func__ << " unsharing " << e << dendl;
      bluestore_blob_t& blob = e.blob->dirty_blob();
      blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
      h->extent_map.dirty_range(e.logical_offset, 1);
    }
  }
  txc->write_onode(h);

  return 0;
}
int BlueStore::_remove(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = _do_remove(txc, c, o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_setattr(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        const string& name,
                        bufferptr& val)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << dendl;
  int r = 0;
  if (val.is_partial()) {
    auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
                                                       val.length());
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  } else {
    auto& b = o->onode.attrs[name.c_str()] = val;
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << " = " << r << dendl;
  return r;
}
int BlueStore::_setattrs(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         const map<string,bufferptr>& aset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << dendl;
  int r = 0;
  for (map<string,bufferptr>::const_iterator p = aset.begin();
       p != aset.end(); ++p) {
    if (p->second.is_partial()) {
      auto& b = o->onode.attrs[p->first.c_str()] =
        bufferptr(p->second.c_str(), p->second.length());
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    } else {
      auto& b = o->onode.attrs[p->first.c_str()] = p->second;
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattr(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o,
                       const string& name)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << dendl;
  int r = 0;
  auto it = o->onode.attrs.find(name.c_str());
  if (it == o->onode.attrs.end())
    goto out;

  o->onode.attrs.erase(it);
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattrs(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;

  if (o->onode.attrs.empty())
    goto out;

  o->onode.attrs.clear();
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
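// Omap keys are "<u64 nid, big-endian>.<user key>", so all entries for
// one object share a 9-byte prefix (8-byte nid plus the '.' separator);
// the setkeys/rmkeys loops below rely on this via final_key.resize(9).
// get_omap_header() and get_omap_tail() build keys whose separators
// sort before and after '.' respectively, bounding iteration over one
// object's entries.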
void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
{
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
  string prefix, tail;
  get_omap_header(id, &prefix);
  get_omap_tail(id, &tail);
  it->lower_bound(prefix);
  while (it->valid()) {
    if (it->key() >= tail) {
      dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
               << dendl;
      break;
    }
    txc->t->rmkey(PREFIX_OMAP, it->key());
    dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
    it->next();
  }
}
int BlueStore::_omap_clear(TransContext *txc,
                           CollectionRef& c,
                           OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o->onode.nid);
    o->onode.clear_omap_flag();
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
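// The bl payload below mirrors what ObjectStore::Transaction encoded
// for omap ops: roughly a u32 count followed by that many encoded
// (key, value) pairs, consumed in order by the ::decode calls.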
int BlueStore::_omap_setkeys(TransContext *txc,
                             CollectionRef& c,
                             OnodeRef& o,
                             bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  bufferlist::iterator p = bl.begin();
  __u32 num;
  if (!o->onode.has_omap()) {
    o->onode.set_omap_flag();
    txc->write_onode(o);
  } else {
    txc->note_modified_object(o);
  }
  string final_key;
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  ::decode(num, p);
  while (num--) {
    string key;
    bufferlist value;
    ::decode(key, p);
    ::decode(value, p);
    final_key.resize(9); // keep prefix
    final_key += key;
    dout(30) << __func__ << " " << pretty_binary_string(final_key)
             << " <- " << key << dendl;
    txc->t->set(PREFIX_OMAP, final_key, value);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_setheader(TransContext *txc,
                               CollectionRef& c,
                               OnodeRef& o,
                               bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  string key;
  if (!o->onode.has_omap()) {
    o->onode.set_omap_flag();
    txc->write_onode(o);
  } else {
    txc->note_modified_object(o);
  }
  get_omap_header(o->onode.nid, &key);
  txc->t->set(PREFIX_OMAP, key, bl);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkeys(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& o,
                            bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  bufferlist::iterator p = bl.begin();
  __u32 num;
  string final_key;

  if (!o->onode.has_omap()) {
    goto out;
  }
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  ::decode(num, p);
  while (num--) {
    string key;
    ::decode(key, p);
    final_key.resize(9); // keep prefix
    final_key += key;
    dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
             << " <- " << key << dendl;
    txc->t->rmkey(PREFIX_OMAP, final_key);
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkey_range(TransContext *txc,
                                 CollectionRef& c,
                                 OnodeRef& o,
                                 const string& first, const string& last)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  KeyValueDB::Iterator it;
  string key_first, key_last;
  int r = 0;
  if (!o->onode.has_omap()) {
    goto out;
  }
  it = db->get_iterator(PREFIX_OMAP);
  get_omap_key(o->onode.nid, first, &key_first);
  get_omap_key(o->onode.nid, last, &key_last);
  it->lower_bound(key_first);
  while (it->valid()) {
    if (it->key() >= key_last) {
      dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
               << dendl;
      break;
    }
    txc->t->rmkey(PREFIX_OMAP, it->key());
    dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
    it->next();
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_set_alloc_hint(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& o,
  uint64_t expected_object_size,
  uint64_t expected_write_size,
  uint32_t flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << dendl;
  int r = 0;
  o->onode.expected_object_size = expected_object_size;
  o->onode.expected_write_size = expected_write_size;
  o->onode.alloc_hint_flags = flags;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << " = " << r << dendl;
  return r;
}
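// Clone requires both oids to hash identically (i.e. live in the same
// PG).  Data is duplicated either via the COW extent-sharing path
// (_do_clone_range) when bluestore_clone_cow is set, or by a full
// read/write cycle; attrs are copied wholesale and omap entries are
// re-keyed onto the new object's nid one by one.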
int BlueStore::_clone(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& oldo,
                      OnodeRef& newo)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << dendl;
  int r = 0;
  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
    derr << __func__ << " mismatched hash on " << oldo->oid
         << " and " << newo->oid << dendl;
    return -EINVAL;
  }

  _assign_nid(txc, newo);

  // clone data
  oldo->flush();
  _do_truncate(txc, c, newo, 0);
  if (cct->_conf->bluestore_clone_cow) {
    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
  }

  // clone attrs
  newo->onode.attrs = oldo->onode.attrs;

  // clone omap
  if (newo->onode.has_omap()) {
    dout(20) << __func__ << " clearing old omap data" << dendl;
    newo->flush();
    _do_omap_clear(txc, newo->onode.nid);
  }
  if (oldo->onode.has_omap()) {
    dout(20) << __func__ << " copying omap data" << dendl;
    if (!newo->onode.has_omap()) {
      newo->onode.set_omap_flag();
    }
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_header(oldo->onode.nid, &head);
    get_omap_tail(oldo->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        dout(30) << __func__ << " got header/data "
                 << pretty_binary_string(it->key()) << dendl;
        string key;
        rewrite_omap_key(newo->onode.nid, it->key(), &key);
        txc->t->set(PREFIX_OMAP, key, it->value());
      }
      it->next();
    }
  } else {
    newo->onode.clear_omap_flag();
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " = " << r << dendl;
  return r;
}
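// COW clone of a byte range.  Each source blob touched by the range is
// first made shared (persisting its SharedBlob), then duplicated at
// most once per call; last_encoded_id serves as a per-pass memoization
// index into id_to_blob.  skip_front/skip_back trim each copied extent
// so only the overlap with [srcoff, srcoff+length) lands in the
// destination, shifted by dstoff - srcoff.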
int BlueStore::_do_clone_range(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& oldo,
  OnodeRef& newo,
  uint64_t srcoff,
  uint64_t length,
  uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid
           << " 0x" << std::hex << srcoff << "~" << length << " -> "
           << " 0x" << dstoff << "~" << length << std::dec << dendl;
  oldo->extent_map.fault_range(db, srcoff, length);
  newo->extent_map.fault_range(db, dstoff, length);
  _dump_onode(oldo);
  _dump_onode(newo);

  // hmm, this could go into an ExtentMap::dup() method.
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto &e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }
  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << " src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      // blob is already duped
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(_assign_blobid(txc), e.blob);
        if (!src_dirty) {
          src_dirty = true;
          dirty_range_begin = e.logical_offset;
        }
        assert(e.logical_end() > 0);
        // -1 to exclude next potential shard
        dirty_range_end = e.logical_end() - 1;
      } else {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << " new " << *cb << dendl;
    }
    // dup extent
    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }
    Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
                            e.blob_offset + skip_front,
                            e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << " dst " << *ne << dendl;
    ++n;
  }
  if (src_dirty) {
    oldo->extent_map.dirty_range(dirty_range_begin,
                                 dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
  _dump_onode(oldo);
  _dump_onode(newo);
  return 0;
}
int BlueStore::_clone_range(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& oldo,
                            OnodeRef& newo,
                            uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec << dendl;
  int r = 0;

  if (srcoff + length >= OBJECT_MAX_SIZE ||
      dstoff + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
    goto out;
  }
  if (srcoff + length > oldo->onode.size) {
    r = -EINVAL;
    goto out;
  }

  _assign_nid(txc, newo);

  if (length > 0) {
    if (cct->_conf->bluestore_clone_cow) {
      _do_zero(txc, c, newo, dstoff, length);
      _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
    } else {
      bufferlist bl;
      r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
      if (r < 0)
        goto out;
      r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
      if (r < 0)
        goto out;
    }
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec
           << " = " << r << dendl;
  return r;
}
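// Rename re-keys the onode: the old onode key and its extent shard keys
// are deleted, the shards are marked dirty so they get rewritten under
// the new key, and onode_map.rename() moves the cache entry.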
int BlueStore::_rename(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& oldo,
                       OnodeRef& newo,
                       const ghobject_t& new_oid)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << new_oid << dendl;
  int r;
  ghobject_t old_oid = oldo->oid;
  mempool::bluestore_cache_other::string new_okey;

  if (newo) {
    if (newo->exists) {
      r = -EEXIST;
      goto out;
    }
    assert(txc->onodes.count(newo) == 0);
  }

  txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());

  // rewrite shards
  {
    oldo->extent_map.fault_range(db, 0, oldo->onode.size);
    get_object_key(cct, new_oid, &new_okey);
    string key;
    for (auto &s : oldo->extent_map.shards) {
      generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
        [&](const string& final_key) {
          txc->t->rmkey(PREFIX_OBJ, final_key);
        }
      );
      s.dirty = true;
    }
  }

  newo = oldo;
  txc->write_onode(newo);

  // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
  // Onode in the old slot
  c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
  r = 0;

  // hold a ref to new Onode in old name position, to ensure we don't drop
  // it from the cache before this txc commits (or else someone may come along
  // and read newo's metadata via the old name).
  txc->note_modified_object(oldo);

 out:
  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
           << new_oid << " = " << r << dendl;
  return r;
}
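// Collection create/remove serialize on coll_lock and persist the cnode
// under PREFIX_COLL, keyed by the stringified coll_t.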
int BlueStore::_create_collection(
  TransContext *txc,
  const coll_t &cid,
  unsigned bits,
  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
  int r;
  bufferlist bl;

  {
    RWLock::WLocker l(coll_lock);
    if (*c) {
      r = -EEXIST;
      goto out;
    }
    c->reset(
      new Collection(
        this,
        cache_shards[cid.hash_to_shard(cache_shards.size())],
        cid));
    (*c)->cnode.bits = bits;
    coll_map[cid] = *c;
  }
  ::encode((*c)->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(cid), bl);
  r = 0;

 out:
  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
  return r;
}
int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
                                  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << dendl;
  int r;

  {
    RWLock::WLocker l(coll_lock);
    if (!*c) {
      r = -ENOENT;
      goto out;
    }
    size_t nonexistent_count = 0;
    assert((*c)->exists);
    if ((*c)->onode_map.map_any([&](OnodeRef o) {
          if (o->exists) {
            dout(10) << __func__ << " " << o->oid << " " << o
                     << " exists in onode_map" << dendl;
            return true;
          }
          ++nonexistent_count;
          return false;
        })) {
      r = -ENOTEMPTY;
      goto out;
    }

    vector<ghobject_t> ls;
    ghobject_t next;
    // Enumerate onodes in db, up to nonexistent_count + 1
    // then check if all of them are marked as non-existent.
    // Bypass the check if returned number is greater than nonexistent_count
    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
                         nonexistent_count + 1, &ls, &next);
    if (r >= 0) {
      bool exists = false; //ls.size() > nonexistent_count;
      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
        dout(10) << __func__ << " oid " << *it << dendl;
        auto onode = (*c)->onode_map.lookup(*it);
        exists = !onode || onode->exists;
        if (exists) {
          dout(10) << __func__ << " " << *it
                   << " exists in db" << dendl;
        }
      }
      if (!exists) {
        coll_map.erase(cid);
        txc->removed_collections.push_back(*c);
        (*c)->exists = false;
        c->reset();
        txc->t->rmkey(PREFIX_COLL, stringify(cid));
        r = 0;
      } else {
        dout(10) << __func__ << " " << cid
                 << " is non-empty" << dendl;
        r = -ENOTEMPTY;
      }
    }
  }

 out:
  dout(10) << __func__ << " " << cid << " = " << r << dendl;
  return r;
}
int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  RWLock::WLocker l(c->lock);
  RWLock::WLocker l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer.  this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split.  leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  assert(is_pg);

  // the destination should initially be empty.
  assert(d->onode_map.empty());
  assert(d->shared_blob_set.empty());
  assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits.  note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  ::encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}
// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";

int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}
void BlueStore::DBHistogram::update_hist_entry(
  map<string, map<int, struct key_dist> > &key_hist,
  const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    MAX(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
}
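// dump() emits roughly the following shape:
//   "rocksdb_value_distribution": { "[0,64)": <count>, ... },
//   "rocksdb_key_value_histogram": {
//     "prefix": "<prefix>",
//     "key_hist": { "[0,32)": <count>, "max_len": ...,
//                   "value_hist": { "[0,64)": <count>, "max_len": ... } } }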
void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}
// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  utime_t start = ceph_clock_now();

  KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = MAX(max_key_size, key_size);
    max_value_size = MAX(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == "b") {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  utime_t duration = ceph_clock_now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}
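// Note: raw keys with prefix "b" (presumably the bitmap freelist's
// per-block keys) are counted together with PREFIX_ALLOC ("B") in the
// allocation bucket above.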
void BlueStore::_flush_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : cache_shards) {
    i->trim_all();
    assert(i->empty());
  }
  for (auto& p : coll_map) {
    if (!p.second->onode_map.empty()) {
      derr << __func__ << " stray onodes on " << p.first << dendl;
      p.second->onode_map.dump(cct, 0);
    }
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump(cct, 0);
    }
    assert(p.second->onode_map.empty());
    assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
}
// For external caller.
// We use a best-effort policy instead, e.g.,
// we don't care if there are still some pinned onodes/data in the cache
// after this command is completed.
void BlueStore::flush_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : cache_shards) {
    i->trim_all();
  }
}
void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}

// ===========================================