// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include <sys/types.h>

#include "include/cpp-btree/btree_set.h"

#include "BlueStore.h"
#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"
#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

using bid_t = decltype(BlueStore::Blob::id);
// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_cache_onode);

// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_cache_other);
// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
                              bluestore_txc);
// kv store prefixes
const string PREFIX_SUPER = "S";       // field -> value
const string PREFIX_STAT = "T";        // field -> value(int64 array)
const string PREFIX_COLL = "C";        // collection name -> cnode_t
const string PREFIX_OBJ = "O";         // object name -> onode_t
const string PREFIX_OMAP = "M";        // u64 + keyname -> value
const string PREFIX_DEFERRED = "L";    // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";       // u64 offset -> u64 length (freelist)
const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192
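// Illustrative layout sketch (derived from the two constants above, not
// part of the original source):
//
//   [0x0000, 0x1000)  bdev label (BDEV_LABEL_BLOCK_SIZE)
//   [0x1000, 0x2000)  bluefs superblock (second block of the device)
//   [0x2000, ...   )  usable space; allocation starts at SUPER_RESERVED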
#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4
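// Minimal decode sketch (illustration only, not part of the original
// source): how the low flag bits of an encoded blobid might be unpacked,
// assuming the id proper lives in the bits above BLOBID_SHIFT_BITS:
//
//   uint64_t blobid = ...;  // value produced by ExtentMap::encode_some()
//   bool contiguous  = blobid & BLOBID_FLAG_CONTIGUOUS;
//   bool zero_offset = blobid & BLOBID_FLAG_ZEROOFFSET;
//   bool same_length = blobid & BLOBID_FLAG_SAMELENGTH;
//   bool spanning    = blobid & BLOBID_FLAG_SPANNING;
//   uint64_t id      = blobid >> BLOBID_SHIFT_BITS;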
/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 */
#define ONODE_KEY_SUFFIX 'o'
#define EXTENT_SHARD_KEY_SUFFIX 'x'
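// Worked layout sketch (illustrative, not part of the original source):
// an onode key is roughly
//
//   [shard u8][pool u64][hash u32][ns]![key]!{<,=,>}[name]![snap u64][gen u64]'o'
//
// and each of its extent shard keys is that same byte string plus a u32
// offset and a trailing 'x', so is_extent_shard_key() below can classify a
// key by inspecting only its final byte.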
/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 */
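// Worked example (illustrative, not part of the original source): appending
// "a!b" yields "a#21b!"; '!' (0x21) is <= '#', so it is escaped as "#21",
// and the trailing '!' is the terminator added by append_escaped() below.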
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[in.length() * 3 + 1];
  char* ptr = &hexbyte[0];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {
      *ptr++ = '#';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else if (*i >= '~') {
      *ptr++ = '~';
      *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
      *ptr++ = "0123456789abcdef"[*i & 0x0f];
    } else {
      *ptr++ = *i;
    }
  }
  *ptr++ = '!';
  out->append(hexbyte, ptr - &hexbyte[0]);
}
inline unsigned h2i(char c)
{
  if ((c >= '0') && (c <= '9')) {
    return c - '0';
  } else if ((c >= 'a') && (c <= 'f')) {
    return c - 'a' + 10;
  } else if ((c >= 'A') && (c <= 'F')) {
    return c - 'A' + 10;
  } else {
    return 256; // make it always larger than 255
  }
}
static int decode_escaped(const char *p, string *out)
{
  char buff[256];
  char* ptr = &buff[0];
  char* max = &buff[252];
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex;
      p++;
      hex = h2i(*p++) << 4;
      if (hex > 255) {
        return -EINVAL;
      }
      hex |= h2i(*p++);
      if (hex > 255) {
        return -EINVAL;
      }
      *ptr++ = hex;
    } else {
      *ptr++ = *p++;
    }
    if (ptr > max) {
      out->append(buff, ptr - buff);
      ptr = &buff[0];
    }
  }
  if (ptr != buff) {
    out->append(buff, ptr - buff);
  }
  return p - orig_p;
}
// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i = 0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}
template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}
static void get_coll_key_range(const coll_t& cid, int bits,
                               string *temp_start, string *temp_end,
                               string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    _key_encode_shard(pgid.shard, start);
    *temp_start = *start;

    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    _key_encode_u32(reverse_hash, start);
    _key_encode_u32(reverse_hash, temp_start);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    _key_encode_u32(end_hash, end);
    _key_encode_u32(end_hash, temp_end);
  } else {
    _key_encode_shard(shard_id_t::NO_SHARD, start);
    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
    *end = *start;
    _key_encode_u32(0, start);
    _key_encode_u32(0xffffffff, end);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }
}
static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  _key_decode_u64(p, sbid);
  return 0;
}
template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < 1 + 8 + 4)
    return -1;
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}
template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = 1 + 8 + 4 +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      assert(r == 0 && t == oid);
    }
  }
}
// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}
static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  assert(key->size() > sizeof(uint32_t) + 1);
  assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}
template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}
int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  assert(key.size() > sizeof(uint32_t) + 1);
  assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  _key_decode_u32(p, offset);
  return 0;
}
static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}
static void get_omap_header(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('-');
}

// hmm, I don't think there's any need to escape the user key since we
// have a clean prefix.
static void get_omap_key(uint64_t id, const string& key, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('.');
  out->append(key);
}

static void rewrite_omap_key(uint64_t id, string old, string *out)
{
  _key_encode_u64(id, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

static void decode_omap_key(const string& key, string *user_key)
{
  *user_key = key.substr(sizeof(uint64_t) + 1);
}

static void get_omap_tail(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('~');
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}
// merge operators

struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    assert(llen == rlen);
    assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const __le64* lv = (const __le64*)ldata;
    const __le64* rv = (const __le64*)rdata;
    __le64* nv = &(__le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  string name() const override {
    return "int64_array";
  }
};
& operator<<(ostream
& out
, const BlueStore::Buffer
& b
)
568 out
<< "buffer(" << &b
<< " space " << b
.space
<< " 0x" << std::hex
569 << b
.offset
<< "~" << b
.length
<< std::dec
570 << " " << BlueStore::Buffer::get_state_name(b
.state
);
572 out
<< " " << BlueStore::Buffer::get_flag_name(b
.flags
);
// GarbageCollector

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {

      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.emplace_back(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}
int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  gc_start_offset = start_offset;
  gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {

      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}
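// Worked example (illustrative numbers only): a fully dereferenced
// compressed blob occupying 4 min_alloc_size units on disk has
// blob_expected_for_release = 4.  If collecting it is expected to cost one
// new allocation (expected_allocations = 1), the estimated benefit is 3
// allocation units, and the blob is collected only if that benefit meets
// bluestore_gc_enable_blob_threshold.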
// Cache

BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
                                           PerfCounters *logger)
{
  Cache *c = nullptr;

  if (type == "lru")
    c = new LRUCache(cct);
  else if (type == "2q")
    c = new TwoQCache(cct);
  else
    assert(0 == "unrecognized cache type");

  c->logger = logger;
  return c;
}
void BlueStore::Cache::trim_all()
{
  std::lock_guard<std::recursive_mutex> l(lock);
  _trim(0, 0);
}
void BlueStore::Cache::trim(
  uint64_t target_bytes,
  float target_meta_ratio,
  float target_data_ratio,
  float bytes_per_onode)
{
  std::lock_guard<std::recursive_mutex> l(lock);
  uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
  uint64_t current_buffer = _get_buffer_bytes();
  uint64_t current = current_meta + current_buffer;

  uint64_t target_meta = target_bytes * target_meta_ratio;
  uint64_t target_buffer = target_bytes * target_data_ratio;

  // correct for overflow or float imprecision
  target_meta = min(target_bytes, target_meta);
  target_buffer = min(target_bytes - target_meta, target_buffer);

  if (current <= target_bytes) {
    dout(10) << __func__
             << " shard target " << pretty_si_t(target_bytes)
             << " meta/data ratios " << target_meta_ratio
             << " + " << target_data_ratio << " ("
             << pretty_si_t(target_meta) << " + "
             << pretty_si_t(target_buffer) << "), "
             << " current " << pretty_si_t(current) << " ("
             << pretty_si_t(current_meta) << " + "
             << pretty_si_t(current_buffer) << ")"
             << dendl;
    return;
  }

  uint64_t need_to_free = current - target_bytes;
  uint64_t free_buffer = 0;
  uint64_t free_meta = 0;
  if (current_buffer > target_buffer) {
    free_buffer = current_buffer - target_buffer;
    if (free_buffer > need_to_free) {
      free_buffer = need_to_free;
    }
  }
  free_meta = need_to_free - free_buffer;

  // start bounds at what we have now
  uint64_t max_buffer = current_buffer - free_buffer;
  uint64_t max_meta = current_meta - free_meta;
  uint64_t max_onodes = max_meta / bytes_per_onode;

  dout(10) << __func__
           << " shard target " << pretty_si_t(target_bytes)
           << " ratio " << target_meta_ratio << " ("
           << pretty_si_t(target_meta) << " + "
           << pretty_si_t(target_buffer) << "), "
           << " current " << pretty_si_t(current) << " ("
           << pretty_si_t(current_meta) << " + "
           << pretty_si_t(current_buffer) << "),"
           << " need_to_free " << pretty_si_t(need_to_free) << " ("
           << pretty_si_t(free_meta) << " + "
           << pretty_si_t(free_buffer) << ")"
           << " -> max " << max_onodes << " onodes + "
           << max_buffer << " buffer"
           << dendl;
  _trim(max_onodes, max_buffer);
}
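// Worked example (illustrative numbers only): with target_bytes = 100,
// meta/data ratios 0.5/0.5, current_meta = 80 and current_buffer = 40,
// current = 120 so need_to_free = 20.  current_buffer (40) is already
// within target_buffer (50), so free_buffer = 0 and the whole 20 comes out
// of metadata: max_meta = 60, and max_onodes = 60 / bytes_per_onode.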
// LRUCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "

void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}
void BlueStore::LRUCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_size << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  while (buffer_size > buffer_max) {
    auto i = buffer_lru.rbegin();
    if (i == buffer_lru.rend()) {
      // stop if buffer_lru is now empty
      break;
    }

    Buffer *b = &*i;
    assert(b->is_clean());
    dout(20) << __func__ << " rm " << *b << dendl;
    b->space->_rm_buffer(this, b);
  }

  // onodes
  int num = onode_lru.size() - onode_max;
  if (num <= 0)
    return; // don't even try

  auto p = onode_lru.end();
  assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << "  " << o->oid << " has " << refs
               << " refs, skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << "  rm " << o->oid << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      assert(num == 1);
    }
    o->get(); // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}
#ifdef DEBUG_CACHE
void BlueStore::LRUCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
    s += i->length;
  }
  if (s != buffer_size) {
    derr << __func__ << " buffer_size " << buffer_size << " actual " << s
         << dendl;
    for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
      derr << __func__ << " " << *i << dendl;
    }
    assert(s == buffer_size);
  }
  dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
           << " ok" << dendl;
}
#endif
// TwoQCache
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "

void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}
*b
, int level
, Buffer
*near
)
964 dout(20) << __func__
<< " level " << level
<< " near " << near
966 << " which has cache_private " << b
->cache_private
<< dendl
;
968 b
->cache_private
= near
->cache_private
;
969 switch (b
->cache_private
) {
971 buffer_warm_in
.insert(buffer_warm_in
.iterator_to(*near
), *b
);
973 case BUFFER_WARM_OUT
:
974 assert(b
->is_empty());
975 buffer_warm_out
.insert(buffer_warm_out
.iterator_to(*near
), *b
);
978 buffer_hot
.insert(buffer_hot
.iterator_to(*near
), *b
);
981 assert(0 == "bad cache_private");
983 } else if (b
->cache_private
== BUFFER_NEW
) {
984 b
->cache_private
= BUFFER_WARM_IN
;
986 buffer_warm_in
.push_front(*b
);
988 // take caller hint to start at the back of the warm queue
989 buffer_warm_in
.push_back(*b
);
992 // we got a hint from discard
993 switch (b
->cache_private
) {
995 // stay in warm_in. move to front, even though 2Q doesn't actually
997 dout(20) << __func__
<< " move to front of warm " << *b
<< dendl
;
998 buffer_warm_in
.push_front(*b
);
1000 case BUFFER_WARM_OUT
:
1001 b
->cache_private
= BUFFER_HOT
;
1002 // move to hot. fall-thru
1004 dout(20) << __func__
<< " move to front of hot " << *b
<< dendl
;
1005 buffer_hot
.push_front(*b
);
1008 assert(0 == "bad cache_private");
1011 if (!b
->is_empty()) {
1012 buffer_bytes
+= b
->length
;
1013 buffer_list_bytes
[b
->cache_private
] += b
->length
;
1017 void BlueStore::TwoQCache::_rm_buffer(Buffer
*b
)
1019 dout(20) << __func__
<< " " << *b
<< dendl
;
1020 if (!b
->is_empty()) {
1021 assert(buffer_bytes
>= b
->length
);
1022 buffer_bytes
-= b
->length
;
1023 assert(buffer_list_bytes
[b
->cache_private
] >= b
->length
);
1024 buffer_list_bytes
[b
->cache_private
] -= b
->length
;
1026 switch (b
->cache_private
) {
1027 case BUFFER_WARM_IN
:
1028 buffer_warm_in
.erase(buffer_warm_in
.iterator_to(*b
));
1030 case BUFFER_WARM_OUT
:
1031 buffer_warm_out
.erase(buffer_warm_out
.iterator_to(*b
));
1034 buffer_hot
.erase(buffer_hot
.iterator_to(*b
));
1037 assert(0 == "bad cache_private");
1041 void BlueStore::TwoQCache::_move_buffer(Cache
*srcc
, Buffer
*b
)
1043 TwoQCache
*src
= static_cast<TwoQCache
*>(srcc
);
1046 // preserve which list we're on (even if we can't preserve the order!)
1047 switch (b
->cache_private
) {
1048 case BUFFER_WARM_IN
:
1049 assert(!b
->is_empty());
1050 buffer_warm_in
.push_back(*b
);
1052 case BUFFER_WARM_OUT
:
1053 assert(b
->is_empty());
1054 buffer_warm_out
.push_back(*b
);
1057 assert(!b
->is_empty());
1058 buffer_hot
.push_back(*b
);
1061 assert(0 == "bad cache_private");
1063 if (!b
->is_empty()) {
1064 buffer_bytes
+= b
->length
;
1065 buffer_list_bytes
[b
->cache_private
] += b
->length
;
void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
{
  dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
  if (!b->is_empty()) {
    assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
    buffer_list_bytes[b->cache_private] += delta;
  }
}
void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_bytes << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  if (buffer_bytes > buffer_max) {
    uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
    uint64_t khot = buffer_max - kin;

    // pre-calculate kout based on the average buffer size too,
    // which is typical (the warm_in and hot lists may change later)
    uint64_t kout = 0;
    uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
    if (buffer_num) {
      uint64_t buffer_avg_size = buffer_bytes / buffer_num;
      assert(buffer_avg_size);
      uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
      kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
    }

    if (buffer_list_bytes[BUFFER_HOT] < khot) {
      // hot is small, give slack to warm_in
      kin += khot - buffer_list_bytes[BUFFER_HOT];
    } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
      // warm_in is small, give slack to hot
      khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
    }
    // adjust warm_in list
    int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
    uint64_t evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_warm_in.rbegin();
      if (p == buffer_warm_in.rend()) {
        // stop if warm_in list is now empty
        break;
      }

      Buffer *b = &*p;
      assert(b->is_clean());
      dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
      assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
      buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->state = Buffer::STATE_EMPTY;
      b->data.clear();
      buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
      buffer_warm_out.push_front(*b);
      b->cache_private = BUFFER_WARM_OUT;
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
               << " from warm_in list, done evicting warm_in buffers"
               << dendl;
    }

    // adjust hot list
    to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
    evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_hot.rbegin();
      if (p == buffer_hot.rend()) {
        // stop if hot list is now empty
        break;
      }

      Buffer *b = &*p;
      dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
      assert(b->is_clean());
      // adjust evict size before buffer goes invalid
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->space->_rm_buffer(this, b);
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
               << " from hot list, done evicting hot buffers"
               << dendl;
    }

    // adjust warm out list too, if necessary
    int64_t num = buffer_warm_out.size() - kout;
    while (num-- > 0) {
      Buffer *b = &*buffer_warm_out.rbegin();
      assert(b->is_empty());
      dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
  }

  // onodes
  int num = onode_lru.size() - onode_max;
  if (num <= 0)
    return; // don't even try

  auto p = onode_lru.end();
  assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    dout(20) << __func__ << " considering " << o << dendl;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << "  " << o->oid << " has " << refs
               << " refs; skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " " << o->oid << " num=" << num
             << " lru size=" << onode_lru.size() << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      assert(num == 1);
    }
    o->get(); // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}
#ifdef DEBUG_CACHE
void BlueStore::TwoQCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
    s += i->length;
  }

  uint64_t hot_bytes = s;
  if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
    derr << __func__ << " hot_list_bytes "
         << buffer_list_bytes[BUFFER_HOT]
         << " != actual " << hot_bytes
         << dendl;
    assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
  }

  for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
    s += i->length;
  }

  uint64_t warm_in_bytes = s - hot_bytes;
  if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
    derr << __func__ << " warm_in_list_bytes "
         << buffer_list_bytes[BUFFER_WARM_IN]
         << " != actual " << warm_in_bytes
         << dendl;
    assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
  }

  if (s != buffer_bytes) {
    derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
         << dendl;
    assert(s == buffer_bytes);
  }

  dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
           << " ok" << dendl;
}
#endif
// BufferSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(Cache* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}
int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
          nb->maybe_rebuild();
          _add_buffer(cache, nb, 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail),
                      0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        b->maybe_rebuild();
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
      nb->maybe_rebuild();
      _add_buffer(cache, nb, 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
void BlueStore::BufferSpace::read(
  Cache* cache,
  uint32_t offset,
  uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals)
{
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;

  {
    std::lock_guard<std::recursive_mutex> l(cache->lock);
    for (auto i = _data_lower_bound(offset);
         i != buffer_map.end() && offset < end && i->first < end;
         ++i) {
      Buffer *b = i->second.get();
      assert(b->end() > offset);
      if (b->is_writing() || b->is_clean()) {
        if (b->offset < offset) {
          uint32_t skip = offset - b->offset;
          uint32_t l = MIN(length, b->length - skip);
          res[offset].substr_of(b->data, skip, l);
          res_intervals.insert(offset, l);
          offset += l;
          length -= l;
          if (!b->is_writing()) {
            cache->_touch_buffer(b);
          }
          continue;
        }
        if (b->offset > offset) {
          uint32_t gap = b->offset - offset;
          if (length <= gap) {
            break;
          }
          offset += gap;
          length -= gap;
        }
        if (!b->is_writing()) {
          cache->_touch_buffer(b);
        }
        if (b->length > length) {
          res[offset].substr_of(b->data, 0, length);
          res_intervals.insert(offset, length);
          break;
        } else {
          res[offset].append(b->data);
          res_intervals.insert(offset, b->length);
          if (b->length == length)
            break;
          offset += b->length;
          length -= b->length;
        }
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}
* cache
, uint64_t seq
)
1424 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
1426 auto i
= writing
.begin();
1427 while (i
!= writing
.end()) {
1437 assert(b
->is_writing());
1439 if (b
->flags
& Buffer::FLAG_NOCACHE
) {
1441 ldout(cache
->cct
, 20) << __func__
<< " discard " << *b
<< dendl
;
1442 buffer_map
.erase(b
->offset
);
1444 b
->state
= Buffer::STATE_CLEAN
;
1447 b
->data
.reassign_to_mempool(mempool::mempool_bluestore_cache_data
);
1448 cache
->_add_buffer(b
, 1, nullptr);
1449 ldout(cache
->cct
, 20) << __func__
<< " added " << *b
<< dendl
;
1453 cache
->_audit("finish_write end");
void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard<std::recursive_mutex> lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      cache->_adjust_buffer_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  assert(writing.empty());
}
// OnodeSpace

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add_onode(o, 1);
  return o;
}
BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard<std::recursive_mutex> l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                            << dendl;
      cache->_touch_onode(p->second);
      hit = true;
      o = p->second;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}
void BlueStore::OnodeSpace::clear()
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 10) << __func__ << dendl;
  for (auto &p : onode_map) {
    cache->_rm_onode(p.second);
  }
  onode_map.clear();
}
bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  return onode_map.empty();
}
void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_cache_other::string& new_okey)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  assert(po != pn);

  assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << "  removing target " << pn->second
                          << dendl;
    cache->_rm_onode(pn->second);
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add_onode(po->second, 1);

  // add at new position and fix oid, key
  onode_map.insert(make_pair(new_oid, o));
  cache->_touch_onode(o);
  o->oid = new_oid;
  o->key = new_okey;
}
bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second)) {
      return true;
    }
  }
  return false;
}
void BlueStore::OnodeSpace::dump(CephContext *cct, int lvl)
{
  for (auto& i : onode_map) {
    ldout(cct, lvl) << i.first << " : " << i.second << dendl;
  }
}
// SharedBlob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;

  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}
BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}

BlueStore::SharedBlob::~SharedBlob()
{
  if (get_cache()) {   // the dummy instances have a nullptr
    std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
    bc._clear(get_cache());
    get_cache()->rm_blob();
  }
  if (loaded && persistent) {
    delete persistent;
  }
}
void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    ldout(coll->store->cct, 20) << __func__ << " " << this
                                << " removing self from set " << get_parent()
                                << dendl;
    if (get_parent()) {
      if (get_parent()->try_remove(this)) {
        delete this;
      } else {
        ldout(coll->store->cct, 20)
          << __func__ << " " << this << " lost race to remove myself from set"
          << dendl;
      }
    }
  }
}
, uint32_t length
)
1686 persistent
->ref_map
.get(offset
, length
);
1689 void BlueStore::SharedBlob::put_ref(uint64_t offset
, uint32_t length
,
1691 set
<SharedBlob
*> *maybe_unshared
)
1695 persistent
->ref_map
.put(offset
, length
, r
, maybe_unshared
? &maybe
: nullptr);
1696 if (maybe_unshared
&& maybe
) {
1697 maybe_unshared
->insert(this);
// SharedBlobSet

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "

void BlueStore::SharedBlobSet::dump(CephContext *cct, int lvl)
{
  std::lock_guard<std::mutex> l(lock);
  for (auto& i : sb_map) {
    ldout(cct, lvl) << i.first << " : " << *i.second << dendl;
  }
}
// Blob

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
  if (b.shared_blob) {
    out << " " << *b.shared_blob;
  } else {
    out << " (shared_blob=NULL)";
  }
  out << ")";
  return out;
}
*coll
)
1737 if (get_blob().is_shared()) {
1740 if (get_blob().is_compressed()) {
1741 bool discard
= false;
1742 bool all_invalid
= true;
1743 for (auto e
: get_blob().get_extents()) {
1744 if (!e
.is_valid()) {
1747 all_invalid
= false;
1750 assert(discard
== all_invalid
); // in case of compressed blob all
1751 // or none pextents are invalid.
1753 shared_blob
->bc
.discard(shared_blob
->get_cache(), 0,
1754 get_blob().get_logical_length());
1758 for (auto e
: get_blob().get_extents()) {
1759 if (!e
.is_valid()) {
1760 ldout(coll
->store
->cct
, 20) << __func__
<< " 0x" << std::hex
<< pos
1762 << std::dec
<< dendl
;
1763 shared_blob
->bc
.discard(shared_blob
->get_cache(), pos
, e
.length
);
1767 if (get_blob().can_prune_tail()) {
1768 dirty_blob().prune_tail();
1769 used_in_blob
.prune_tail(get_blob().get_ondisk_length());
1770 auto cct
= coll
->store
->cct
; //used by dout
1771 dout(20) << __func__
<< " pruned tail, now " << get_blob() << dendl
;
void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // Caller has to initialize the Blob's logical length prior to incrementing
  // references.  Otherwise one can neither determine the required number of
  // counters for per-au tracking nor obtain min_release_size for
  // single-counter mode.
  assert(get_blob().get_logical_length() != 0);
  auto cct = coll->store->cct;
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      get_blob().get_release_size(coll->store->min_alloc_size);
    uint64_t l = get_blob().get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}
1809 PExtentVector logical
;
1811 auto cct
= coll
->store
->cct
;
1812 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
1813 << std::dec
<< " " << *this << dendl
;
1815 bool empty
= used_in_blob
.put(
1820 // nothing to release
1821 if (!empty
&& logical
.empty()) {
1825 bluestore_blob_t
& b
= dirty_blob();
1826 return b
.release_extents(empty
, logical
, r
);
bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  assert(min_alloc_size);
  assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently for the sake of simplicity we omit blob reuse if data is
  // unaligned with csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than current blob len
  target_blob_size = MAX(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data totally stands out of the existing blob
    new_blen = end;
  } else {
    // new data overlaps with the existing blob
    new_blen = MAX(blen, end);

    uint32_t overlap = 0;
    if (new_blen > blen) {
      overlap = blen - b_offset;
    } else {
      overlap = length;
    }

    if (!get_blob().is_unallocated(b_offset, overlap)) {
      // abort if any piece of the overlap has already been allocated
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }
    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            get_blob().get_release_size(min_alloc_size));
    }
  }
  return true;
}
*coll
, uint32_t blob_offset
, Blob
*r
)
1905 auto cct
= coll
->store
->cct
; //used by dout
1906 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
1907 << " start " << *this << dendl
;
1908 assert(blob
.can_split());
1909 assert(used_in_blob
.can_split());
1910 bluestore_blob_t
&lb
= dirty_blob();
1911 bluestore_blob_t
&rb
= r
->dirty_blob();
1915 &(r
->used_in_blob
));
1917 lb
.split(blob_offset
, rb
);
1918 shared_blob
->bc
.split(shared_blob
->get_cache(), blob_offset
, r
->shared_blob
->bc
);
1920 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
1921 << " finish " << *this << dendl
;
1922 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
1923 << " and " << *r
<< dendl
;
#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif
& operator<<(ostream
& out
, const BlueStore::Extent
& e
)
1960 return out
<< std::hex
<< "0x" << e
.logical_offset
<< "~" << e
.length
1961 << ": 0x" << e
.blob_offset
<< "~" << e
.length
<< std::dec
1966 BlueStore::OldExtent
// OldExtent
BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = b->get_referenced_bytes() == 0;
  return oe;
}
// ExtentMap

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}
void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
      assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << " inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      assert(p->shard_info->offset >= prev_p->shard_info->offset);
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << "  encode_some needs reshard" << dendl;
            assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << "  shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf->bluestore_extent_map_shard_min_size) {
            assert(endoff != OBJECT_MAX_SIZE);
            if (p == shards.begin()) {
              // we are the first shard, combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else {
              // combine either with the previous shard or the next,
              // whichever is smaller
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        }
      );
    }
  }
}
BlueStore::ExtentMap::allocate_spanning_blob_id()
2100 if (spanning_blob_map
.empty())
2102 bid_t bid
= spanning_blob_map
.rbegin()->first
+ 1;
2103 // bid is valid and available.
2106 // Find next unused bid;
2107 bid
= rand() % (numeric_limits
<bid_t
>::max() + 1);
2108 const auto begin_bid
= bid
;
2110 if (!spanning_blob_map
.count(bid
))
2114 if (bid
< 0) bid
= 0;
2116 } while (bid
!= begin_bid
);
2117 assert(0 == "no available blob id");
void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << "   spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << "   shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));

  // we may need to fault in a larger interval later; we must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      }
      );
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / MAX(1, extents);
  dout(20) << __func__ << "  extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = needs_reshard_begin;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == needs_reshard_begin) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << "  new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << "  new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bs = e->blob_start();
    if (bs < spanning_scan_begin) {
      spanning_scan_begin = bs;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << "  new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << "  new " << new_shard_info << dendl;
  dout(20) << __func__ << "  old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    si_end = si_begin + new_shard_info.size();

    assert(sv.size() == shards.size());

    // note that we need to update every shard_info of shards here,
    // as sv might have been totally re-allocated above
    for (unsigned i = 0; i < shards.size(); i++) {
      shards[i].shard_info = &sv[i];
    }

    // mark newly added shards as dirty
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
  }
  dout(20) << __func__ << "  fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_end);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    Extent dummy(needs_reshard_begin);
    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << "  shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }
      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning.  We prefer to cut the blob if we can.  Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << "    splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            auto bid = allocate_spanning_blob_id();
            b->id = bid;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << "    adding spanning " << *b << dendl;
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          dout(30) << __func__ << "    un-spanning " << *e->blob << dendl;
->blob
<< dendl
;
2383 clear_needs_reshard();
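// Illustrative note (not part of the original source): a worked example of
// the sizing math in reshard().  Suppose the shard target size is 500 bytes
// with a slop factor of 0.2, so target = 500 and slop = 100.  If the range
// being resharded holds 2000 encoded bytes across 40 extents, then
// extent_avg = 2000 / 40 = 50, and a new shard boundary is cut roughly every
// 10 extents -- or every 12 when the candidate boundary would land inside a
// blob, since would_span raises the threshold to target + slop = 600.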
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  auto cct = onode->c->store->cct; //used by dout
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }
    if (!must_reshard) {
      denc_varint(0, bound); // blobid
      denc_varint(0, bound); // logical_offset
      denc_varint(0, bound); // len
      denc_varint(0, bound); // blob_offset

      p->blob->bound_encode(
        bound,
        struct_v,
        p->blob->shared_blob->get_sbid(),
        false);
    }
  }
  if (must_reshard) {
    return true;
  }

  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1;  // so it is always non-zero
        include_blob = true;
        blobid = 0;  // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}
unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  auto cct = onode->c->store->cct; //used by dout
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << "  getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  assert(n == num);
  return num;
}
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}
void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;

  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}
void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}
void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  assert(last >= start);
  string key;
  while (start <= last) {
    assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            assert(r >= 0);
          }
        });
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      assert(p->dirty == false);
      assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}
void BlueStore::ExtentMap::dirty_range(
  uint32_t offset,
  uint32_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);
  if (start < 0)
    return;

  assert(last >= start);
  while (start <= last) {
    assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
               << std::dec << " is not loaded, can't mark dirty" << dendl;
      assert(0 == "can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}

BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}
bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}
int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed && onode) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}
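// Illustrative note (not part of the original source): two logical extents
// merge above only when they are contiguous in both the logical and blob
// address spaces and reference the same blob.  For example,
// [0x0~0x1000, blob A @ 0x0] followed by [0x1000~0x1000, blob A @ 0x1000]
// collapses to [0x0~0x2000, blob A @ 0x0]; the same pair straddling a shard
// boundary is left alone so each shard can still be encoded independently.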
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent *oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent *oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent *oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent *oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}
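// Illustrative note (not part of the original source): punch_hole handles
// three overlap shapes.  Given an extent [0~100] on some blob:
//   - hole 30~40 splits it: keep [0~30], deref the middle 40 bytes, and
//     re-add [70~30] pointing past the hole (split-and-deref);
//   - hole 80~40 only dereferences the tail past offset 80;
//   - a hole covering the whole extent removes and dereferences it outright.
// Every dereferenced piece lands in old_extents so the caller can drop blob
// references (and possibly free space) once the transaction commits.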
BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need to have completely initialized Blob to increment its ref counters.
  assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent from putting reused blob into
  // old_extents list if we overwrite the blob totally.
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  auto cct = onode->c->store->cct; //used by dout

  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << "  split " << *ep << dendl;
      dout(30) << __func__ << "     to " << *ne << dendl;
    } else {
      // switch blob
      assert(ep->blob_offset >= blob_offset);
      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << "  adjusted " << *ep << dendl;
    }
  }
  return rb;
}
// Onode

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    std::unique_lock<std::mutex> l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}
// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  assert((loffs % min_alloc_size) == 0);
  assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
      auto loffs2_end = P2ROUNDUP(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
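// Illustrative sketch (not part of the original source): the rounding
// helpers used above assume power-of-two alignment.  With a min_alloc_size
// of 0x1000:
#if 0
static_assert(P2ALIGN(0x1234, 0x1000) == 0x1000, "round down");
static_assert(P2ROUNDUP(0x1234, 0x1000) == 0x2000, "round up");
// so a queued write at logical 0x1234 occupies the aligned window
// [0x1000, 0x2000) for conflict-detection purposes.
#endif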
// =======================================================
// DeferredBatch

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  i.first->second.bl.reassign_to_mempool(
    mempool::mempool_bluestore_writing_deferred);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}
void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << "  keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      assert(i != seq_bytes.end());
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << "  keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      assert(i->second >= 0);
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    assert(i != seq_bytes.end());
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << "  truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << "  drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    assert(i->second >= 0);
    p = iomap.erase(p);
  }
}
void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0; // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  assert(sb == seq_bytes);
}
// =======================================================
// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
  : store(ns),
    cache(c),
    cid(cid),
    lock("BlueStore::Collection::lock", true, false),
    exists(true),
    onode_map(c)
{
}
void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}
)
3147 if (!sb
->is_loaded()) {
3151 auto sbid
= sb
->get_sbid();
3152 get_shared_blob_key(sbid
, &key
);
3153 int r
= store
->db
->get(PREFIX_SHARED_BLOB
, key
, &v
);
3155 lderr(store
->cct
) << __func__
<< " sbid 0x" << std::hex
<< sbid
3156 << std::dec
<< " not found at key "
3157 << pretty_binary_string(key
) << dendl
;
3158 assert(0 == "uh oh, missing shared_blob");
3162 sb
->persistent
= new bluestore_shared_blob_t(sbid
);
3163 bufferlist::iterator p
= v
.begin();
3164 ::decode(*(sb
->persistent
), p
);
3165 ldout(store
->cct
, 10) << __func__
<< " sbid 0x" << std::hex
<< sbid
3166 << std::dec
<< " loaded shared_blob " << *sb
<< dendl
;
void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  assert(!b->shared_blob->is_loaded());

  // update blob
  bluestore_blob_t& blob = b->dirty_blob();
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}
uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
{
  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
  assert(sb->is_loaded());

  uint64_t sbid = sb->get_sbid();
  shared_blob_set.remove(sb);
  sb->loaded = false;
  delete sb->persistent;
  sb->sbid_unloaded = 0;
  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
  return sbid;
}
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create)
{
  assert(create ? lock.is_wlocked() : lock.is_locked());

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  mempool::bluestore_cache_other::string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
  ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  Onode *on;
  if (v.length() == 0) {
    assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
        !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    assert(r >= 0);
    on = new Onode(this, oid, key);
    on->exists = true;
    bufferptr::iterator p = v.front().begin_deep();
    on->onode.decode(p);
    for (auto& i : on->onode.attrs) {
      i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }

    // initialize extent_map
    on->extent_map.decode_spanning_blobs(p);
    if (on->onode.extent_map_shards.empty()) {
      denc(on->extent_map.inline_bl, p);
      on->extent_map.decode_some(on->extent_map.inline_bl);
      on->extent_map.inline_bl.reassign_to_mempool(
        mempool::mempool_bluestore_cache_other);
    } else {
      on->extent_map.init_shards(false, false);
    }
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  // lock (one or both) cache shards
  std::lock(cache->lock, dest->cache->lock);
  std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
  std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ++p;
    } else {
      OnodeRef o = p->second;
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      cache->_rm_onode(p->second);
      p = onode_map.onode_map.erase(p);

      // move onode to the destination collection
      dest->cache->_add_onode(o, 1);
      dest->onode_map.onode_map[o->oid] = o;
      dest->onode_map.cache = dest->cache;

      // move over shared blobs and buffers.  cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        if (sb->get_sbid()) {
          ldout(store->cct, 20) << __func__
                                << " moving registration " << *sb << dendl;
          shared_blob_set.remove(sb);
          dest->shared_blob_set.add(dest, sb);
        }
        sb->coll = dest;
        if (dest->cache != cache) {
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << "   moving " << *i.second
                                    << dendl;
              dest->cache->_move_buffer(cache, i.second.get());
            }
          }
        }
      }
    }
  }
}

// =======================================================
void *BlueStore::MempoolThread::entry()
{
  Mutex::Locker l(lock);
  while (!stop) {
    uint64_t meta_bytes =
      mempool::bluestore_cache_other::allocated_bytes() +
      mempool::bluestore_cache_onode::allocated_bytes();
    uint64_t onode_num =
      mempool::bluestore_cache_onode::allocated_items();

    if (onode_num < 2) {
      onode_num = 2;
    }

    float bytes_per_onode = (float)meta_bytes / (float)onode_num;
    size_t num_shards = store->cache_shards.size();
    float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
    // A little sloppy but should be close enough
    uint64_t shard_target = target_ratio * (store->cache_size / num_shards);

    for (auto i : store->cache_shards) {
      i->trim(shard_target,
              store->cache_meta_ratio,
              store->cache_data_ratio,
              bytes_per_onode);
    }

    store->_update_cache_logger();

    utime_t wait;
    wait += store->cct->_conf->bluestore_cache_trim_interval;
    cond.WaitInterval(lock, wait);
  }
  stop = false;
  return NULL;
}
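// Illustrative note (not part of the original source): suppose cache_size is
// 3 GB split over 4 cache shards, with cache_meta_ratio = 0.01 and
// cache_data_ratio ~= 0.5.  Then shard_target = 0.51 * (3 GB / 4) ~= 390 MB,
// and each shard's trim() divides that budget between onode metadata and
// data buffers using the measured bytes_per_onode.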
// =======================================================
// OmapIteratorImpl

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "

BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    get_omap_key(o->onode.nid, string(), &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
  }
}
int BlueStore::OmapIteratorImpl::seek_to_first()
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}
int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    string key;
    get_omap_key(o->onode.nid, after, &key);
    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
                            << pretty_binary_string(key) << dendl;
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}
int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    string key;
    get_omap_key(o->onode.nid, to, &key);
    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
                            << pretty_binary_string(key) << dendl;
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}
bool BlueStore::OmapIteratorImpl::valid()
{
  RWLock::RLocker l(c->lock);
  bool r = o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second <= tail;
  if (it && it->valid()) {
    ldout(c->store->cct,20) << __func__ << " is at "
                            << pretty_binary_string(it->raw_key().second)
                            << dendl;
  }
  return r;
}
int BlueStore::OmapIteratorImpl::next(bool validate)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    it->next();
    return 0;
  } else {
    return -1;
  }
}
string BlueStore::OmapIteratorImpl::key()
{
  RWLock::RLocker l(c->lock);
  assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  decode_omap_key(db_key, &user_key);
  return user_key;
}
bufferlist BlueStore::OmapIteratorImpl::value()
{
  RWLock::RLocker l(c->lock);
  assert(it->valid());
  return it->value();
}


// =====================================
// BlueStore

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "

static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}
BlueStore::BlueStore(CephContext *cct, const string& path)
  : ObjectStore(cct, path),
    throttle_bytes(cct, "bluestore_throttle_bytes",
                   cct->_conf->bluestore_throttle_bytes),
    throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                            cct->_conf->bluestore_throttle_bytes +
                            cct->_conf->bluestore_throttle_deferred_bytes),
    deferred_finisher(cct, "defered_finisher", "dfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf->add_observer(this);
  set_cache_shards(1);
}

BlueStore::BlueStore(CephContext *cct,
  const string& path,
  uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle_bytes(cct, "bluestore_throttle_bytes",
                   cct->_conf->bluestore_throttle_bytes),
    throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                            cct->_conf->bluestore_throttle_bytes +
                            cct->_conf->bluestore_throttle_deferred_bytes),
    deferred_finisher(cct, "defered_finisher", "dfin"),
    kv_sync_thread(this),
    kv_finalize_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf->add_observer(this);
  set_cache_shards(1);
}
BlueStore::~BlueStore()
{
  for (auto f : finishers) {
    delete f;
  }
  finishers.clear();

  cct->_conf->remove_observer(this);
  _shutdown_logger();
  assert(!mounted);
  assert(db == NULL);
  assert(bluefs == NULL);
  assert(fsid_fd < 0);
  assert(path_fd < 0);
  for (auto i : cache_shards) {
    delete i;
  }
  cache_shards.clear();
}
const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_compression_required_ratio",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_prefer_deferred_size_hdd",
    "bluestore_prefer_deferred_size_ssd",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    NULL
  };
  return KEYS;
}
void BlueStore::handle_conf_change(const struct md_config_t *conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_prefer_deferred_size_hdd") ||
      changed.count("bluestore_prefer_deferred_size_ssd") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes")) {
    throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
    throttle_deferred_bytes.reset_max(
      conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
  }
  if (changed.count("bluestore_throttle_deferred_bytes")) {
    throttle_deferred_bytes.reset_max(
      conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
  }
}
void BlueStore::_set_compression()
{
  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
  }

  compressor = nullptr;

  if (comp_mode == Compressor::COMP_NONE) {
    dout(10) << __func__ << " compression mode set to 'none', "
             << "ignore other compression settings" << dendl;
    return;
  }

  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str()
           << " compressor" << dendl;
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << dendl;
}
void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}
void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}
void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}
int BlueStore::_set_cache_sizes()
{
  assert(bdev);
  if (cct->_conf->bluestore_cache_size) {
    cache_size = cct->_conf->bluestore_cache_size;
  } else {
    // choose global cache size based on backend type
    if (bdev->is_rotational()) {
      cache_size = cct->_conf->bluestore_cache_size_hdd;
    } else {
      cache_size = cct->_conf->bluestore_cache_size_ssd;
    }
  }
  cache_meta_ratio = cct->_conf->bluestore_cache_meta_ratio;
  cache_kv_ratio = cct->_conf->bluestore_cache_kv_ratio;

  double cache_kv_max = cct->_conf->bluestore_cache_kv_max;
  double cache_kv_max_ratio = 0;

  // if cache_kv_max is negative, disable it
  if (cache_size > 0 && cache_kv_max >= 0) {
    cache_kv_max_ratio = (double) cache_kv_max / (double) cache_size;
    if (cache_kv_max_ratio < 1.0 && cache_kv_max_ratio < cache_kv_ratio) {
      dout(1) << __func__ << " max " << cache_kv_max_ratio
              << " < ratio " << cache_kv_ratio
              << dendl;
      cache_meta_ratio = cache_meta_ratio + cache_kv_ratio - cache_kv_max_ratio;
      cache_kv_ratio = cache_kv_max_ratio;
    }
  }

  cache_data_ratio =
    (double)1.0 - (double)cache_meta_ratio - (double)cache_kv_ratio;

  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }
  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") must be in range [0,1.0]" << dendl;
    return -EINVAL;
  }
  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
         << dendl;
    return -EINVAL;
  }
  if (cache_data_ratio < 0) {
    // deal with floating point imprecision
    cache_data_ratio = 0;
  }
  dout(1) << __func__ << " cache_size " << cache_size
          << " meta " << cache_meta_ratio
          << " kv " << cache_kv_ratio
          << " data " << cache_data_ratio
          << dendl;
  return 0;
}
int BlueStore::write_meta(const std::string& key, const std::string& value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  label.meta[key] = value;
  r = _write_bdev_label(cct, p, label);
  if (r < 0) {
    return ObjectStore::write_meta(key, value);
  }
  return ObjectStore::write_meta(key, value);
}

int BlueStore::read_meta(const std::string& key, std::string *value)
{
  bluestore_bdev_label_t label;
  string p = path + "/block";
  int r = _read_bdev_label(cct, p, &label);
  if (r < 0) {
    return ObjectStore::read_meta(key, value);
  }
  auto i = label.meta.find(key);
  if (i == label.meta.end()) {
    return ObjectStore::read_meta(key, value);
  }
  *value = i->second;
  return 0;
}
void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
                 "Average kv_thread flush latency",
                 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
                 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
                 "Average kv_thread sync latency",
                 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
                 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
                 "Average aio_wait state latency",
                 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
                 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
                 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
                 "Average kv_commiting state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
                 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
                 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
                 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
                 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
                 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
                 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
                 "Average submit throttle latency",
                 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
                 "Average submit latency",
                 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
                 "Average commit latency",
                 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
                 "Average read latency",
                 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
                 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
                 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
                 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
                 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
                 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
                    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
                    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
                    "Sum for write-op padded bytes");
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
                    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
                    "Sum for deferred write bytes", "def");
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
                    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
            "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
            "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
            "Sum for stored compressed bytes");
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
            "Sum for bytes allocated for compressed data");
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
            "Sum for original bytes that were compressed");

  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
            "Number of onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
                    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
                    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
                    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
                    "bluestore_onode_shard_misses",
                    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
            "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
            "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
            "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
            "Number of buffer bytes in cache");
  b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
            "Sum for bytes of read hit in the cache");
  b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
            "Sum for bytes of read missed in the cache");

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
                    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
                    "Large aligned writes into fresh blobs (bytes)");
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
                    "Small writes into existing or sparse small blobs (bytes)");
  b.add_u64_counter(l_bluestore_write_small_unused,
                    "bluestore_write_small_unused",
                    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_deferred,
                    "bluestore_write_small_deferred",
                    "Small overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
                    "bluestore_write_small_pre_read",
                    "Small writes that required we read some data (possibly "
                    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
                    "Small write into new (sparse) blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
                    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
                    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
                    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
                    "Sum for extents that have been merged due to garbage "
                    "collection");
  b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
                    "Read EIO errors propagated to high level callers");
  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;

  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.stored);
    logger->set(l_bluestore_compressed, store_statfs.compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
  }
  return r;
}

void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}
int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}
int BlueStore::_open_path()
{
  // sanity check(s)
  if (cct->_conf->get_val<uint64_t>("osd_max_object_size") >=
      4*1024*1024*1024ull) {
    derr << __func__ << " osd_max_object_size >= 4GB; BlueStore has hard limit of 4GB." << dendl;
    return -EINVAL;
  }
  assert(path_fd < 0);
  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY));
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}
int BlueStore::_write_bdev_label(CephContext *cct,
                                 string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  ::encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  ::encode(crc, bl);
  assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
  }
  r = ::fsync(fd);
  if (r < 0) {
    derr << __func__ << " failed to fsync " << path
         << ": " << cpp_strerror(r) << dendl;
  }
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}
int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY));
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  bufferlist::iterator p = bl.begin();
  try {
    ::decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    ::decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
            << dendl;
    return -ENOENT;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
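// Illustrative sketch (not part of the original source): on-disk layout of
// the first block as read back above.  The label is encoded at offset 0,
// immediately followed by a crc32c over exactly those encoded bytes, and the
// remainder of the BDEV_LABEL_BLOCK_SIZE (4 KB) block is zero padding:
//
//   [ encoded bluestore_bdev_label_t | crc32c(label bytes) | zeros ... ]
//
// which is why the reader computes the crc over bl[0, p.get_off()) before
// decoding the stored checksum.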
int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(cct, path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
               << " and fsid " << fsid << " check bypassed" << dendl;
    }
    else if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
           << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}
void BlueStore::_set_alloc_sizes(void)
{
  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
           << std::dec << " order " << min_alloc_size_order
           << " max_alloc_size 0x" << std::hex << max_alloc_size
           << " prefer_deferred_size 0x" << prefer_deferred_size
           << std::dec
           << " deferred_batch_ops " << deferred_batch_ops
           << dendl;
}
int BlueStore::_open_bdev(bool create)
{
  assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  assert(block_size == 1u << block_size_order);
  // and set cache_size based on device type
  r = _set_cache_sizes();
  if (r < 0) {
    goto fail_close;
  }
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}

void BlueStore::_close_bdev()
{
  assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}
4217 int BlueStore::_open_fm(bool create
)
4220 fm
= FreelistManager::create(cct
, freelist_type
, db
, PREFIX_ALLOC
);
4223 // initialize freespace
4224 dout(20) << __func__
<< " initializing freespace" << dendl
;
4225 KeyValueDB::Transaction t
= db
->get_transaction();
4228 bl
.append(freelist_type
);
4229 t
->set(PREFIX_SUPER
, "freelist_type", bl
);
4231 // being able to allocate in units less than bdev block size
4232 // seems to be a bad idea.
4233 assert( cct
->_conf
->bdev_block_size
<= (int64_t)min_alloc_size
);
4234 fm
->create(bdev
->get_size(), (int64_t)min_alloc_size
, t
);
4236 // allocate superblock reserved space. note that we do not mark
4237 // bluefs space as allocated in the freelist; we instead rely on
4239 uint64_t reserved
= ROUND_UP_TO(MAX(SUPER_RESERVED
, min_alloc_size
),
4241 fm
->allocate(0, reserved
, t
);
4243 if (cct
->_conf
->bluestore_bluefs
) {
4244 assert(bluefs_extents
.num_intervals() == 1);
4245 interval_set
<uint64_t>::iterator p
= bluefs_extents
.begin();
4246 reserved
= ROUND_UP_TO(p
.get_start() + p
.get_len(), min_alloc_size
);
4247 dout(20) << __func__
<< " reserved 0x" << std::hex
<< reserved
<< std::dec
4248 << " for bluefs" << dendl
;
4250 ::encode(bluefs_extents
, bl
);
4251 t
->set(PREFIX_SUPER
, "bluefs_extents", bl
);
4252 dout(20) << __func__
<< " bluefs_extents 0x" << std::hex
<< bluefs_extents
4253 << std::dec
<< dendl
;
    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
              << cct->_conf->bluestore_debug_prefill << " with max free extent "
              << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
        uint64_t l = (rand() % max_b + 1) * min_alloc_size;
        if (start + l > end) {
          l = end - start;
          l = P2ALIGN(l, min_alloc_size);
        }
        assert(start + l <= end);

        uint64_t u = 1 + (uint64_t)(r * (double)l);
        u = P2ROUNDUP(u, min_alloc_size);
        if (start + l + u > end) {
          u = end - (start + l);
          // trim to align so we don't overflow again
          u = P2ALIGN(u, min_alloc_size);
          stop = true;
        }
        assert(start + l + u <= end);

        dout(20) << "  free 0x" << std::hex << start << "~" << l
                 << " use 0x" << u << std::dec << dendl;

        if (u == 0) {
          // break if u has been trimmed to nothing
          break;
        }

        fm->allocate(start + l, u, t);
        start += l + u;
      }
    }
    db->submit_transaction_sync(t);
  }
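  // Worked example for the debug pre-fragmentation above (illustrative): with
  // min_alloc_size = 0x10000 and bluestore_debug_prefill = 0.1, each loop
  // iteration leaves a random free extent l of up to
  // bluestore_debug_prefragment_max bytes, then allocates roughly
  // u = (0.1/0.9) * l bytes (rounded up to min_alloc_size), so about 10% of
  // the span ends up allocated and the free space is deliberately fragmented
  // for testing.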
  int r = fm->init(bdev->get_size());
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  return 0;
}

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}
int BlueStore::_open_alloc()
{
  assert(alloc == NULL);
  assert(bdev->get_size());
  alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
                            bdev->get_size(), min_alloc_size);
  if (!alloc) {
    lderr(cct) << __func__ << " Allocator::unknown alloc type "
               << cct->_conf->bluestore_allocator
               << dendl;
    return -EINVAL;
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(&offset, &length)) {
    alloc->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  fm->enumerate_reset();
  dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
          << " in " << num << " extents"
          << dendl;

  // also mark bluefs space as allocated
  for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
    alloc->init_rm_free(e.get_start(), e.get_len());
  }
  dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
           << bluefs_extents << std::dec << " as allocated" << dendl;

  return 0;
}
void BlueStore::_close_alloc()
{
  assert(alloc);
  alloc->shutdown();
  delete alloc;
  alloc = NULL;
}
int BlueStore::_open_fsid(bool create)
{
  assert(fsid_fd < 0);
  int flags = O_RDWR;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}
int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}
int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}
void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}
int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
         << " (is another ceph-osd still running?)"
         << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
bool BlueStore::is_rotational()
{
  if (bdev) {
    return bdev->is_rotational();
  }

  bool rotational = true;
  int r = _open_path();
  if (r < 0)
    goto out;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;
  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;
  rotational = bdev->is_rotational();
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
 out:
  return rotational;
}

bool BlueStore::is_journal_rotational()
{
  if (!bluefs) {
    dout(5) << __func__ << " bluefs disabled, default to store media type"
            << dendl;
    return is_rotational();
  }
  dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
  return bluefs->wal_is_rotational();
}
bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist).  only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}
int BlueStore::_open_db(bool create)
{
  int r;
  assert(!db);
  string fn = path + "/db";
  string options;
  stringstream err;
  ceph::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);

  string kv_backend;
  if (create) {
    kv_backend = cct->_conf->bluestore_kvbackend;
  } else {
    r = read_meta("kv_backend", &kv_backend);
    if (r < 0) {
      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;

  bool do_bluefs;
  if (create) {
    do_bluefs = cct->_conf->bluestore_bluefs;
  } else {
    string s;
    r = read_meta("bluefs", &s);
    if (r < 0) {
      derr << __func__ << " unable to read 'bluefs' meta" << dendl;
      return -EIO;
    }
    if (s == "1") {
      do_bluefs = true;
    } else if (s == "0") {
      do_bluefs = false;
    } else {
      derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
           << dendl;
      return -EIO;
    }
  }
  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    dout(10) << __func__ << " initializing bluefs" << dendl;
    if (kv_backend != "rocksdb") {
      derr << " backend must be rocksdb to use bluefs" << dendl;
      return -EINVAL;
    }
    bluefs = new BlueFS(cct);

    string bfn;
    struct stat st;

    if (read_meta("path_block.db", &bfn) < 0) {
      bfn = path + "/block.db";
    }
    if (::stat(bfn.c_str(), &st) == 0) {
      r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
      if (r < 0) {
        derr << __func__ << " add block device(" << bfn << ") returned: "
             << cpp_strerror(r) << dendl;
        goto free_bluefs;
      }

      if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
        r = _check_or_set_bdev_label(
          bfn,
          bluefs->get_block_device_size(BlueFS::BDEV_DB),
          "bluefs db", create);
        if (r < 0) {
          derr << __func__
               << " check block device(" << bfn << ") label returned: "
               << cpp_strerror(r) << dendl;
          goto free_bluefs;
        }
      }
      if (create) {
        bluefs->add_block_extent(
          BlueFS::BDEV_DB,
          SUPER_RESERVED,
          bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
      }
      bluefs_shared_bdev = BlueFS::BDEV_SLOW;
      bluefs_single_shared_device = false;
    } else if (::lstat(bfn.c_str(), &st) == -1) {
      bluefs_shared_bdev = BlueFS::BDEV_DB;
    } else {
      // a symlink whose target doesn't exist is a bug
      derr << __func__ << " " << bfn << " link target doesn't exist" << dendl;
      r = -errno;
      goto free_bluefs;
    }

    // shared device
    if (read_meta("path_block", &bfn) < 0) {
      bfn = path + "/block";
    }
    r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
    if (r < 0) {
      derr << __func__ << " add block device(" << bfn << ") returned: "
           << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
    if (create) {
      // note: we always leave the first SUPER_RESERVED (8k) of the device unused
      uint64_t initial =
        bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
                            cct->_conf->bluestore_bluefs_gift_ratio);
      initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
      if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
        derr << __func__ << " bluefs_alloc_size 0x" << std::hex
             << cct->_conf->bluefs_alloc_size << " is not a multiple of "
             << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
        r = -EINVAL;
        goto free_bluefs;
      }
      // align to bluefs's alloc_size
      initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
      // put bluefs in the middle of the device in case it is an HDD
      uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
                               cct->_conf->bluefs_alloc_size);
      bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
      bluefs_extents.insert(start, initial);
    }
4639 if (read_meta("path_block.wal", &bfn
) < 0) {
4640 bfn
= path
+ "/block.wal";
4642 if (::stat(bfn
.c_str(), &st
) == 0) {
4643 r
= bluefs
->add_block_device(BlueFS::BDEV_WAL
, bfn
);
4645 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
4646 << cpp_strerror(r
) << dendl
;
4650 if (bluefs
->bdev_support_label(BlueFS::BDEV_WAL
)) {
4651 r
= _check_or_set_bdev_label(
4653 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
),
4654 "bluefs wal", create
);
4656 derr
<< __func__
<< " check block device(" << bfn
4657 << ") label returned: " << cpp_strerror(r
) << dendl
;
4663 bluefs
->add_block_extent(
4664 BlueFS::BDEV_WAL
, BDEV_LABEL_BLOCK_SIZE
,
4665 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
) -
4666 BDEV_LABEL_BLOCK_SIZE
);
4668 cct
->_conf
->set_val("rocksdb_separate_wal_dir", "true");
4669 bluefs_single_shared_device
= false;
4670 } else if (::lstat(bfn
.c_str(), &st
) == -1) {
4671 cct
->_conf
->set_val("rocksdb_separate_wal_dir", "false");
4673 //symlink exist is bug
4674 derr
<< __func__
<< " " << bfn
<< " link target doesn't exist" << dendl
;
    if (create) {
      bluefs->mkfs(fsid);
    }
    r = bluefs->mount();
    if (r < 0) {
      derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
      goto free_bluefs;
    }
    if (cct->_conf->bluestore_bluefs_env_mirror) {
      rocksdb::Env *a = new BlueRocksEnv(bluefs);
      rocksdb::Env *b = rocksdb::Env::Default();
      if (create) {
        string cmd = "rm -rf " + path + "/db " +
          path + "/db.slow " +
          path + "/db.wal";
        int r = system(cmd.c_str());
        (void)r;
      }
      env = new rocksdb::EnvMirror(b, a, false, true);
    } else {
      env = new BlueRocksEnv(bluefs);

      // simplify the dir names, too, as "seen" by rocksdb
      fn = "db";
    }

    if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
      // we have both block.db and block; tell rocksdb!
      // note: the second (last) size value doesn't really matter
      ostringstream db_paths;
      uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB);
      uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW);
      db_paths << fn << ","
               << (uint64_t)(db_size * 95 / 100) << " "
               << fn + ".slow" << ","
               << (uint64_t)(slow_size * 95 / 100);
      cct->_conf->set_val("rocksdb_db_paths", db_paths.str(), false);
      dout(10) << __func__ << " set rocksdb_db_paths to "
               << cct->_conf->get_val<std::string>("rocksdb_db_paths") << dendl;
    }
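    // Example (illustrative): with a 10 GiB block.db and a 100 GiB main
    // device, the string built above would look like
    //   "db,10200547328 db.slow,102005473280"
    // i.e. rocksdb is told to budget ~95% of each device's capacity for the
    // corresponding path.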
    if (create) {
      env->CreateDir(fn);
      if (cct->_conf->rocksdb_separate_wal_dir)
        env->CreateDir(fn + ".wal");
      if (cct->_conf->get_val<std::string>("rocksdb_db_paths").length())
        env->CreateDir(fn + ".slow");
    }
  } else if (create) {
    int r = ::mkdir(fn.c_str(), 0755);
    if (r < 0)
      r = -errno;
    if (r < 0 && r != -EEXIST) {
      derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
           << dendl;
      return r;
    }

    // wal_dir, too!
    if (cct->_conf->rocksdb_separate_wal_dir) {
      string walfn = path + "/db.wal";
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
        r = -errno;
      if (r < 0 && r != -EEXIST) {
        derr << __func__ << " failed to create " << walfn
             << ": " << cpp_strerror(r)
             << dendl;
        return r;
      }
    }
  }

  db = KeyValueDB::create(cct,
                          kv_backend,
                          fn,
                          static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      bluefs->umount();
      delete bluefs;
      bluefs = NULL;
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db);
  db->set_merge_operator(PREFIX_STAT, merge_op);

  db->set_cache_size(cache_size * cache_kv_ratio);

  if (kv_backend == "rocksdb")
    options = cct->_conf->bluestore_rocksdb_options;
  db->init(options);
  if (create)
    r = db->create_and_open(err);
  else
    r = db->open(err);
  if (r) {
    derr << __func__ << " error opening db: " << err.str() << dendl;
    if (bluefs) {
      bluefs->umount();
      delete bluefs;
      bluefs = NULL;
    }
    delete db;
    db = NULL;
    return -EIO;
  }
  dout(1) << __func__ << " opened " << kv_backend
          << " path " << fn << " options " << options << dendl;
  return 0;

 free_bluefs:
  assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}
void BlueStore::_close_db()
{
  assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    bluefs->umount();
    delete bluefs;
    bluefs = NULL;
  }
}
int BlueStore::_reconcile_bluefs_freespace()
{
  dout(10) << __func__ << dendl;
  interval_set<uint64_t> bset;
  int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
  assert(r == 0);
  if (bset == bluefs_extents) {
    dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
             << std::dec << dendl;
    return 0;
  }
  dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
           << dendl;
  dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
           << std::dec << dendl;

  interval_set<uint64_t> overlap;
  overlap.intersection_of(bset, bluefs_extents);

  bset.subtract(overlap);
  if (!bset.empty()) {
    derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
         << dendl;
    return -EIO;
  }

  interval_set<uint64_t> super_extra;
  super_extra = bluefs_extents;
  super_extra.subtract(overlap);
  if (!super_extra.empty()) {
    // This is normal: it can happen if we commit to give extents to
    // bluefs and we crash before bluefs commits that it owns them.
    dout(10) << __func__ << " super extra " << super_extra << dendl;
    for (interval_set<uint64_t>::iterator p = super_extra.begin();
         p != super_extra.end();
         ++p) {
      bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
    }
  }

  return 0;
}
int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
{
  int ret = 0;
  assert(bluefs);

  vector<pair<uint64_t,uint64_t>> bluefs_usage;  // <free, total> ...
  bluefs->get_usage(&bluefs_usage);
  assert(bluefs_usage.size() > bluefs_shared_bdev);

  // fixme: look at primary bdev only for now
  uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
  uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
  float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;

  uint64_t my_free = alloc->get_free();
  uint64_t total = bdev->get_size();
  float my_free_ratio = (float)my_free / (float)total;

  uint64_t total_free = bluefs_free + my_free;

  float bluefs_ratio = (float)bluefs_free / (float)total_free;

  dout(10) << __func__
           << " bluefs " << pretty_si_t(bluefs_free)
           << " free (" << bluefs_free_ratio
           << ") bluestore " << pretty_si_t(my_free)
           << " free (" << my_free_ratio
           << "), bluefs_ratio " << bluefs_ratio
           << dendl;

  uint64_t gift = 0;
  uint64_t reclaim = 0;
  if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
    gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
             << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
             << ", should gift " << pretty_si_t(gift) << dendl;
  } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
    reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
    if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
      reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
             << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
             << ", should reclaim " << pretty_si_t(reclaim) << dendl;
  }
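  // Worked example (illustrative, with hypothetical config values): if
  // bluefs_free = 1 GiB and my_free = 99 GiB, then total_free = 100 GiB and
  // bluefs_ratio = 0.01.  With bluestore_bluefs_min_ratio = 0.02 that is too
  // low, so with bluestore_bluefs_gift_ratio = 0.02 we would gift
  // 0.02 * 100 GiB = 2 GiB to bluefs; the symmetric reclaim path fires when
  // bluefs_ratio exceeds bluestore_bluefs_max_ratio.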
  // don't take over too much of the freespace
  uint64_t free_cap = cct->_conf->bluestore_bluefs_max_ratio * total_free;
  if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
      cct->_conf->bluestore_bluefs_min < free_cap) {
    uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
    dout(10) << __func__ << " bluefs_total " << bluefs_total
             << " < min " << cct->_conf->bluestore_bluefs_min
             << ", should gift " << pretty_si_t(g) << dendl;
    if (g > gift)
      gift = g;
    reclaim = 0;
  }
  uint64_t min_free = cct->_conf->get_val<uint64_t>("bluestore_bluefs_min_free");
  if (bluefs_free < min_free &&
      min_free < free_cap) {
    uint64_t g = min_free - bluefs_free;
    dout(10) << __func__ << " bluefs_free " << bluefs_free
             << " < min " << min_free
             << ", should gift " << pretty_si_t(g) << dendl;
    if (g > gift)
      gift = g;
    reclaim = 0;
  }
  if (gift) {
    // round up to alloc size
    gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);

    // hard cap to fit into 32 bits
    gift = MIN(gift, 1ull << 31);
    dout(10) << __func__ << " gifting " << gift
             << " (" << pretty_si_t(gift) << ")" << dendl;

    // fixme: just do one allocation to start...
    int r = alloc->reserve(gift);
    assert(r == 0);

    AllocExtentVector exts;
    int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
                                        0, 0, &exts);

    if (alloc_len <= 0) {
      dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
              << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
      alloc->unreserve(gift);
      return 0;
    } else if (alloc_len < (int64_t)gift) {
      dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
              << " min_alloc_size 0x" << min_alloc_size
              << " allocated 0x" << alloc_len
              << std::dec << dendl;
      alloc->unreserve(gift - alloc_len);
    }
    for (auto& p : exts) {
      bluestore_pextent_t e = bluestore_pextent_t(p);
      dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
      extents->push_back(e);
    }
    gift = 0;

    ret = 1;
  }
  // reclaim from bluefs?
  if (reclaim) {
    // round up to alloc size
    reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);

    // hard cap to fit into 32 bits
    reclaim = MIN(reclaim, 1ull << 31);
    dout(10) << __func__ << " reclaiming " << reclaim
             << " (" << pretty_si_t(reclaim) << ")" << dendl;

    while (reclaim > 0) {
      // NOTE: this will block and do IO.
      AllocExtentVector extents;
      int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
                                     &extents);
      if (r < 0) {
        derr << __func__ << " failed to reclaim space from bluefs"
             << dendl;
        break;
      }
      for (auto e : extents) {
        bluefs_extents.erase(e.offset, e.length);
        bluefs_extents_reclaiming.insert(e.offset, e.length);
        reclaim -= e.length;
      }
    }

    ret = 1;
  }

  return ret;
}
void BlueStore::_commit_bluefs_freespace(
  const PExtentVector& bluefs_gift_extents)
{
  dout(10) << __func__ << dendl;
  for (auto& p : bluefs_gift_extents) {
    bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
  }
}
int BlueStore::_open_collections(int *errors)
{
  assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      CollectionRef c(
        new Collection(
          this,
          cache_shards[cid.hash_to_shard(cache_shards.size())],
          cid));
      bufferlist bl = it->value();
      bufferlist::iterator p = bl.begin();
      try {
        ::decode(c->cnode, p);
      } catch (buffer::error& e) {
        derr << __func__ << " failed to decode cnode, key:"
             << pretty_binary_string(it->key()) << dendl;
        return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c << dendl;
      coll_map[cid] = c;
    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      if (errors)
        (*errors)++;
    }
  }
  return 0;
}
void BlueStore::_open_statfs()
{
  vstatfs.reset();
  bufferlist bl;
  int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
  if (r >= 0) {
    if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
      auto it = bl.begin();
      vstatfs.decode(it);
    } else {
      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
    }
  } else {
    dout(10) << __func__ << " store_statfs missed, using empty" << dendl;
  }
}
int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
           << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
           << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
        r = -errno;
        derr << __func__ << " failed to open " << epath << " file: "
             << cpp_strerror(r) << dendl;
        return r;
      }
      string serial_number = epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, serial_number.c_str(), serial_number.size());
      assert(r == (int)serial_number.size());
      dout(1) << __func__ << " created " << name << " symlink to "
              << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (size) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
          S_ISREG(st.st_mode) &&   // if it is a regular file
          st.st_size == 0) {       // and is 0 bytes
        r = ::ftruncate(fd, size);
        if (r < 0) {
          derr << __func__ << " failed to resize " << name << " file to "
               << size << ": " << cpp_strerror(r) << dendl;
          VOID_TEMP_FAILURE_RETRY(::close(fd));
          return r;
        }

        if (cct->_conf->bluestore_block_preallocate_file) {
#ifdef HAVE_POSIX_FALLOCATE
          r = ::posix_fallocate(fd, 0, size);
          if (r > 0) {
            derr << __func__ << " failed to prefallocate " << name << " file to "
                 << size << ": " << cpp_strerror(r) << dendl;
            VOID_TEMP_FAILURE_RETRY(::close(fd));
            return -r;
          }
#else
          char data[1024*128];
          for (uint64_t off = 0; off < size; off += sizeof(data)) {
            if (off + sizeof(data) > size)
              r = ::write(fd, data, size - off);
            else
              r = ::write(fd, data, sizeof(data));
            if (r < 0) {
              r = -errno;
              derr << __func__ << " failed to prefallocate w/ write " << name
                   << " file to " << size << ": " << cpp_strerror(r) << dendl;
              VOID_TEMP_FAILURE_RETRY(::close(fd));
              return r;
            }
          }
#endif
        }
        dout(1) << __func__ << " resized " << name << " file to "
                << pretty_si_t(size) << "B" << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
        derr << __func__ << " failed to open " << name << " file: "
             << cpp_strerror(r) << dendl;
        return r;
      }
    }
  }
  return 0;
}
int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
        r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
        if (r < 0) {
          derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
               << dendl;
          return r;
        }
        if (r > 0) {
          derr << __func__ << " fsck found " << r << " errors" << dendl;
          r = -EIO;
        }
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
        derr << __func__ << " expected bluestore, but type is " << type << dendl;
        return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
        return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
           << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
                                   cct->_conf->bluestore_block_size,
                                   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal",
                                     cct->_conf->bluestore_block_wal_path,
                                     cct->_conf->bluestore_block_wal_size,
                                     cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db",
                                     cct->_conf->bluestore_block_db_path,
                                     cct->_conf->bluestore_block_db_size,
                                     cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  {
    string wal_path = cct->_conf->get_val<string>("bluestore_block_wal_path");
    if (wal_path.size()) {
      write_meta("path_block.wal", wal_path);
    }
    string db_path = cct->_conf->get_val<string>("bluestore_block_db_path");
    if (db_path.size()) {
      write_meta("path_block.db", db_path);
    }
  }

  // choose min_alloc_size
  if (cct->_conf->bluestore_min_alloc_size) {
    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
    } else {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
    }
  }

  // make sure min_alloc_size is power of 2 aligned.
  if (!ISP2(min_alloc_size)) {
    derr << __func__ << " min_alloc_size 0x"
         << std::hex << min_alloc_size << std::dec
         << " is not power of 2 aligned!"
         << dendl;
    r = -EINVAL;
    goto out_close_bdev;
  }
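  // Example (illustrative): values such as 0x1000 (4 KiB) or 0x10000 (64 KiB)
  // pass the ISP2() check above, while something like 0x3000 is rejected;
  // the extent and bitmap arithmetic throughout BlueStore assumes
  // power-of-2 allocation units.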
  r = _open_db(true);
  if (r < 0)
    goto out_close_bdev;

  r = _open_fm(true);
  if (r < 0)
    goto out_close_db;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    {
      bufferlist bl;
      ::encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }
    {
      bufferlist bl;
      ::encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }
    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_fm;

  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
  if (r < 0)
    goto out_close_fm;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_fm;
    }
  }

 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db();
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }

  if (r == 0) {
    // indicate success by writing the 'mkfs_done' file
    r = write_meta("mkfs_done", "yes");
  }

  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  } else {
    dout(0) << __func__ << " success" << dendl;
  }
  return r;
}
void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t old = cache_shards.size();
  assert(num >= old);
  cache_shards.resize(num);
  for (unsigned i = old; i < num; ++i) {
    cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
                                    logger);
  }
}
int BlueStore::_mount(bool kv_only)
{
  dout(1) << __func__ << " path " << path << dendl;

  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
           << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  r = _open_db(false);
  if (r < 0)
    goto out_bdev;

  if (kv_only)
    return 0;

  r = _open_super_meta();
  if (r < 0)
    goto out_db;

  r = _open_fm(false);
  if (r < 0)
    goto out_db;

  r = _open_alloc();
  if (r < 0)
    goto out_fm;

  r = _open_collections();
  if (r < 0)
    goto out_alloc;

  r = _reload_logger();
  if (r < 0)
    goto out_coll;

  if (bluefs) {
    r = _reconcile_bluefs_freespace();
    if (r < 0)
      goto out_coll;
  }

  _kv_start();

  r = _deferred_replay();
  if (r < 0)
    goto out_stop;

  mempool_thread.init();

  mounted = true;
  return 0;

 out_stop:
  _kv_stop();
 out_coll:
  _flush_cache();
 out_alloc:
  _close_alloc();
 out_fm:
  _close_fm();
 out_db:
  _close_db();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
int BlueStore::umount()
{
  assert(_kv_only || mounted);
  dout(1) << __func__ << dendl;

  _osr_drain_all();
  _osr_unregister_all();

  mounted = false;
  if (!_kv_only) {
    mempool_thread.shutdown();
    dout(20) << __func__ << " stopping kv thread" << dendl;
    _kv_stop();
    _flush_cache();
    dout(20) << __func__ << " closing" << dendl;
    _close_alloc();
    _close_fm();
  }
  _close_db();
  _close_bdev();
  _close_fsid();
  _close_path();

  if (cct->_conf->bluestore_fsck_on_umount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }
  return 0;
}
static void apply(uint64_t off,
                  uint64_t len,
                  uint64_t granularity,
                  BlueStore::mempool_dynamic_bitset &bitset,
                  std::function<void(uint64_t,
                                     BlueStore::mempool_dynamic_bitset &)> f) {
  auto end = ROUND_UP_TO(off + len, granularity);
  while (off < end) {
    uint64_t pos = off / granularity;
    f(pos, bitset);
    off += granularity;
  }
}
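// Example (illustrative): with granularity = 0x10000 (64 KiB), the call
// apply(0x18000, 0x10000, 0x10000, bs, f) rounds the end up to 0x30000 and
// invokes f() for bit positions 1 and 2, i.e. once per allocation unit
// touched by the unaligned range.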
int BlueStore::_fsck_check_extents(
  const ghobject_t& oid,
  const PExtentVector& extents,
  bool compressed,
  mempool_dynamic_bitset &used_blocks,
  uint64_t granularity,
  store_statfs_t& expected_statfs)
{
  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
  int errors = 0;
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.compressed_allocated += e.length;
    }
    bool already = false;
    apply(
      e.offset, e.length, granularity, used_blocks,
      [&](uint64_t pos, mempool_dynamic_bitset &bs) {
        assert(pos < bs.size());
        if (bs.test(pos))
          already = true;
        else
          bs.set(pos);
      });
    if (already) {
      derr << " " << oid << " extent " << e
           << " or a subset is already allocated" << dendl;
      ++errors;
    }
    if (e.end() > bdev->get_size()) {
      derr << " " << oid << " extent " << e
           << " past end of block device" << dendl;
      ++errors;
    }
  }
  return errors;
}
int BlueStore::_fsck(bool deep, bool repair)
{
  dout(1) << __func__
          << (repair ? " repair" : " check")
          << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
  int errors = 0;
  int repaired = 0;

  typedef btree::btree_set<
    uint64_t,std::less<uint64_t>,
    mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
  uint64_t_btree_t used_nids;
  uint64_t_btree_t used_omap_head;
  uint64_t_btree_t used_sbids;

  mempool_dynamic_bitset used_blocks;
  KeyValueDB::Iterator it;
  store_statfs_t expected_statfs, actual_statfs;
  list<ghobject_t> oids;
  struct sb_info_t {
    list<ghobject_t> oids;
    SharedBlobRef sb;
    bluestore_extent_ref_map_t ref_map;
    bool compressed = false;
  };
  mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;

  uint64_t num_objects = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_spanning_blobs = 0;
  uint64_t num_shared_blobs = 0;
  uint64_t num_sharded_objects = 0;
  uint64_t num_object_shards = 0;

  utime_t start = ceph_clock_now();

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  r = _open_db(false);
  if (r < 0)
    goto out_bdev;

  r = _open_super_meta();
  if (r < 0)
    goto out_db;

  r = _open_fm(false);
  if (r < 0)
    goto out_db;

  r = _open_alloc();
  if (r < 0)
    goto out_fm;

  r = _open_collections(&errors);
  if (r < 0)
    goto out_alloc;

  mempool_thread.init();

  // we need finishers and kv_{sync,finalize}_thread *just* for replay
  _kv_start();
  r = _deferred_replay();
  _kv_stop();
  if (r < 0)
    goto out_scan;
  used_blocks.resize(fm->get_alloc_units());
  apply(
    0, MAX(min_alloc_size, SUPER_RESERVED), fm->get_alloc_size(), used_blocks,
    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
      assert(pos < bs.size());
      bs.set(pos);
    });

  if (bluefs) {
    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply(
        e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          assert(pos < bs.size());
          bs.set(pos);
        });
    }
    r = bluefs->fsck();
    if (r < 0)
      goto out_scan;
    if (r > 0)
      errors += r;
  }

  // get expected statfs; fill unaffected fields to be able to compare
  // deltas
  statfs(&actual_statfs);
  expected_statfs.total = actual_statfs.total;
  expected_statfs.available = actual_statfs.available;
  // walk PREFIX_OBJ
  dout(1) << __func__ << " walking object keyspace" << dendl;
  it = db->get_iterator(PREFIX_OBJ);
  if (it) {
    CollectionRef c;
    spg_t pgid;
    mempool::bluestore_fsck::list<string> expecting_shards;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      if (g_conf->bluestore_debug_fsck_abort) {
        goto out_scan;
      }
      dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
      if (is_extent_shard_key(it->key())) {
        while (!expecting_shards.empty() &&
               expecting_shards.front() < it->key()) {
          derr << "fsck error: missing shard key "
               << pretty_binary_string(expecting_shards.front())
               << dendl;
          ++errors;
          expecting_shards.pop_front();
        }
        if (!expecting_shards.empty() &&
            expecting_shards.front() == it->key()) {
          // all good
          expecting_shards.pop_front();
          continue;
        }

        uint32_t offset;
        string okey;
        get_key_extent_shard(it->key(), &okey, &offset);
        derr << "fsck error: stray shard 0x" << std::hex << offset
             << std::dec << dendl;
        if (expecting_shards.empty()) {
          derr << "fsck error: " << pretty_binary_string(it->key())
               << " is unexpected" << dendl;
          ++errors;
          continue;
        }
        while (expecting_shards.front() > it->key()) {
          derr << "fsck error:   saw " << pretty_binary_string(it->key())
               << dendl;
          derr << "fsck error:   exp "
               << pretty_binary_string(expecting_shards.front()) << dendl;
          ++errors;
          expecting_shards.pop_front();
          if (expecting_shards.empty()) {
            break;
          }
        }
        continue;
      }
      ghobject_t oid;
      int r = get_key_object(it->key(), &oid);
      if (r < 0) {
        derr << "fsck error: bad object key "
             << pretty_binary_string(it->key()) << dendl;
        ++errors;
        continue;
      }
      if (!c ||
          oid.shard_id != pgid.shard ||
          oid.hobj.pool != (int64_t)pgid.pool() ||
          !c->contains(oid)) {
        c = nullptr;
        for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
               coll_map.begin();
             p != coll_map.end();
             ++p) {
          if (p->second->contains(oid)) {
            c = p->second;
            break;
          }
        }
        if (!c) {
          derr << "fsck error: stray object " << oid
               << " not owned by any collection" << dendl;
          ++errors;
          continue;
        }
        c->cid.is_pg(&pgid);
        dout(20) << __func__ << " collection " << c->cid << dendl;
      }

      if (!expecting_shards.empty()) {
        for (auto &k : expecting_shards) {
          derr << "fsck error: missing shard key "
               << pretty_binary_string(k) << dendl;
          ++errors;
        }
        expecting_shards.clear();
      }

      dout(10) << __func__ << "  " << oid << dendl;
      ++num_objects;
      RWLock::RLocker l(c->lock);
      OnodeRef o = c->get_onode(oid, false);
      if (o->onode.nid) {
        if (o->onode.nid > nid_max) {
          derr << "fsck error: " << oid << " nid " << o->onode.nid
               << " > nid_max " << nid_max << dendl;
          ++errors;
        }
        if (used_nids.count(o->onode.nid)) {
          derr << "fsck error: " << oid << " nid " << o->onode.nid
               << " already in use" << dendl;
          ++errors;
          continue; // go for next object
        }
        used_nids.insert(o->onode.nid);
      }
      num_spanning_blobs += o->extent_map.spanning_blob_map.size();
      o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
      if (!o->extent_map.shards.empty()) {
        ++num_sharded_objects;
        num_object_shards += o->extent_map.shards.size();
      }
      for (auto& s : o->extent_map.shards) {
        dout(20) << __func__ << "    shard " << *s.shard_info << dendl;
        expecting_shards.push_back(string());
        get_extent_shard_key(o->key, s.shard_info->offset,
                             &expecting_shards.back());
        if (s.shard_info->offset >= o->onode.size) {
          derr << "fsck error: " << oid << " shard 0x" << std::hex
               << s.shard_info->offset << " past EOF at 0x" << o->onode.size
               << std::dec << dendl;
          ++errors;
        }
      }
      // lextents to blobs mapping
      uint64_t pos = 0;
      map<BlobRef,bluestore_blob_t::unused_t> referenced;
      mempool::bluestore_fsck::map<BlobRef,
                                   bluestore_blob_use_tracker_t> ref_map;
      for (auto& l : o->extent_map.extent_map) {
        dout(20) << __func__ << "    " << l << dendl;
        if (l.logical_offset < pos) {
          derr << "fsck error: " << oid << " lextent at 0x"
               << std::hex << l.logical_offset
               << " overlaps with the previous, which ends at 0x" << pos
               << std::dec << dendl;
          ++errors;
        }
        if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
          derr << "fsck error: " << oid << " lextent at 0x"
               << std::hex << l.logical_offset << "~" << l.length
               << " spans a shard boundary"
               << std::dec << dendl;
          ++errors;
        }
        pos = l.logical_offset + l.length;
        expected_statfs.stored += l.length;
        assert(l.blob);
        const bluestore_blob_t& blob = l.blob->get_blob();

        auto& ref = ref_map[l.blob];
        if (ref.is_empty()) {
          uint32_t min_release_size = blob.get_release_size(min_alloc_size);
          uint32_t l = blob.get_logical_length();
          ref.init(l, min_release_size);
        }
        ref.get(
          l.blob_offset,
          l.length);
        ++num_extents;
        if (blob.has_unused()) {
          auto p = referenced.find(l.blob);
          bluestore_blob_t::unused_t *pu;
          if (p == referenced.end()) {
            pu = &referenced[l.blob];
          } else {
            pu = &p->second;
          }
          uint64_t blob_len = blob.get_logical_length();
          assert((blob_len % (sizeof(*pu)*8)) == 0);
          assert(l.blob_offset + l.length <= blob_len);
          uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
          uint64_t start = l.blob_offset / chunk_size;
          uint64_t end =
            ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
          for (auto i = start; i < end; ++i) {
            (*pu) |= (1u << i);
          }
        }
      }
: referenced
) {
5892 dout(20) << __func__
<< " referenced 0x" << std::hex
<< i
.second
5893 << std::dec
<< " for " << *i
.first
<< dendl
;
5894 const bluestore_blob_t
& blob
= i
.first
->get_blob();
5895 if (i
.second
& blob
.unused
) {
5896 derr
<< "fsck error: " << oid
<< " blob claims unused 0x"
5897 << std::hex
<< blob
.unused
5898 << " but extents reference 0x" << i
.second
5899 << " on blob " << *i
.first
<< dendl
;
5902 if (blob
.has_csum()) {
5903 uint64_t blob_len
= blob
.get_logical_length();
5904 uint64_t unused_chunk_size
= blob_len
/ (sizeof(blob
.unused
)*8);
5905 unsigned csum_count
= blob
.get_csum_count();
5906 unsigned csum_chunk_size
= blob
.get_csum_chunk_size();
5907 for (unsigned p
= 0; p
< csum_count
; ++p
) {
5908 unsigned pos
= p
* csum_chunk_size
;
5909 unsigned firstbit
= pos
/ unused_chunk_size
; // [firstbit,lastbit]
5910 unsigned lastbit
= (pos
+ csum_chunk_size
- 1) / unused_chunk_size
;
5911 unsigned mask
= 1u << firstbit
;
5912 for (unsigned b
= firstbit
+ 1; b
<= lastbit
; ++b
) {
5915 if ((blob
.unused
& mask
) == mask
) {
5916 // this csum chunk region is marked unused
5917 if (blob
.get_csum_item(p
) != 0) {
5918 derr
<< "fsck error: " << oid
5919 << " blob claims csum chunk 0x" << std::hex
<< pos
5920 << "~" << csum_chunk_size
5921 << " is unused (mask 0x" << mask
<< " of unused 0x"
5922 << blob
.unused
<< ") but csum is non-zero 0x"
5923 << blob
.get_csum_item(p
) << std::dec
<< " on blob "
5924 << *i
.first
<< dendl
;
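      // Worked example (illustrative, assuming a 16-bit unused bitmap): for
      // a 64 KiB blob, unused_chunk_size = 0x10000/16 = 0x1000.  With
      // csum_chunk_size = 0x1000 each csum chunk p overlaps exactly bit p,
      // so mask = 1u << p; a csum chunk is treated as unused only when every
      // unused-bitmap bit it overlaps is set, and a non-zero checksum in such
      // a chunk is flagged as an inconsistency.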
5931 for (auto &i
: ref_map
) {
5933 const bluestore_blob_t
& blob
= i
.first
->get_blob();
5934 bool equal
= i
.first
->get_blob_use_tracker().equal(i
.second
);
5936 derr
<< "fsck error: " << oid
<< " blob " << *i
.first
5937 << " doesn't match expected ref_map " << i
.second
<< dendl
;
5940 if (blob
.is_compressed()) {
5941 expected_statfs
.compressed
+= blob
.get_compressed_payload_length();
5942 expected_statfs
.compressed_original
+=
5943 i
.first
->get_referenced_bytes();
5945 if (blob
.is_shared()) {
5946 if (i
.first
->shared_blob
->get_sbid() > blobid_max
) {
5947 derr
<< "fsck error: " << oid
<< " blob " << blob
5948 << " sbid " << i
.first
->shared_blob
->get_sbid() << " > blobid_max "
5949 << blobid_max
<< dendl
;
5951 } else if (i
.first
->shared_blob
->get_sbid() == 0) {
5952 derr
<< "fsck error: " << oid
<< " blob " << blob
5953 << " marked as shared but has uninitialized sbid"
5957 sb_info_t
& sbi
= sb_info
[i
.first
->shared_blob
->get_sbid()];
5958 sbi
.sb
= i
.first
->shared_blob
;
5959 sbi
.oids
.push_back(oid
);
5960 sbi
.compressed
= blob
.is_compressed();
5961 for (auto e
: blob
.get_extents()) {
5963 sbi
.ref_map
.get(e
.offset
, e
.length
);
5967 errors
+= _fsck_check_extents(oid
, blob
.get_extents(),
5968 blob
.is_compressed(),
5970 fm
->get_alloc_size(),
5976 int r
= _do_read(c
.get(), o
, 0, o
->onode
.size
, bl
, 0);
5979 derr
<< "fsck error: " << oid
<< " error during read: "
5980 << cpp_strerror(r
) << dendl
;
5984 if (o
->onode
.has_omap()) {
5985 if (used_omap_head
.count(o
->onode
.nid
)) {
5986 derr
<< "fsck error: " << oid
<< " omap_head " << o
->onode
.nid
5987 << " already in use" << dendl
;
5990 used_omap_head
.insert(o
->onode
.nid
);
  dout(1) << __func__ << " checking shared_blobs" << dendl;
  it = db->get_iterator(PREFIX_SHARED_BLOB);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      uint64_t sbid;
      if (get_key_shared_blob(key, &sbid)) {
        derr << "fsck error: bad key '" << key
             << "' in shared blob namespace" << dendl;
        ++errors;
        continue;
      }
      auto p = sb_info.find(sbid);
      if (p == sb_info.end()) {
        derr << "fsck error: found stray shared blob data for sbid 0x"
             << std::hex << sbid << std::dec << dendl;
        ++errors;
      } else {
        ++num_shared_blobs;
        sb_info_t& sbi = p->second;
        bluestore_shared_blob_t shared_blob(sbid);
        bufferlist bl = it->value();
        bufferlist::iterator blp = bl.begin();
        ::decode(shared_blob, blp);
        dout(20) << __func__ << "  " << *sbi.sb << " " << shared_blob << dendl;
        if (shared_blob.ref_map != sbi.ref_map) {
          derr << "fsck error: shared blob 0x" << std::hex << sbid
               << std::dec << " ref_map " << shared_blob.ref_map
               << " != expected " << sbi.ref_map << dendl;
          ++errors;
        }
        PExtentVector extents;
        for (auto &r : shared_blob.ref_map.ref_map) {
          extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
        }
        errors += _fsck_check_extents(p->second.oids.front(),
                                      extents,
                                      p->second.compressed,
                                      used_blocks,
                                      fm->get_alloc_size(),
                                      expected_statfs);
        sb_info.erase(p);
      }
    }
  }
  for (auto &p : sb_info) {
    derr << "fsck error: shared_blob 0x" << p.first
         << " key is missing (" << *p.second.sb << ")" << dendl;
    ++errors;
  }
  if (!(actual_statfs == expected_statfs)) {
    derr << "fsck error: actual " << actual_statfs
         << " != expected " << expected_statfs << dendl;
    ++errors;
  }
  dout(1) << __func__ << " checking for stray omap data" << dendl;
  it = db->get_iterator(PREFIX_OMAP);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      uint64_t omap_head;
      _key_decode_u64(it->key().c_str(), &omap_head);
      if (used_omap_head.count(omap_head) == 0) {
        derr << "fsck error: found stray omap data on omap_head "
             << omap_head << dendl;
        ++errors;
      }
    }
  }

  dout(1) << __func__ << " checking deferred events" << dendl;
  it = db->get_iterator(PREFIX_DEFERRED);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      bufferlist bl = it->value();
      bufferlist::iterator p = bl.begin();
      bluestore_deferred_transaction_t wt;
      try {
        ::decode(wt, p);
      } catch (buffer::error& e) {
        derr << "fsck error: failed to decode deferred txn "
             << pretty_binary_string(it->key()) << dendl;
        r = -EIO;
        goto out_scan;
      }
      dout(20) << __func__ << "  deferred " << wt.seq
               << " ops " << wt.ops.size()
               << " released 0x" << std::hex << wt.released << std::dec << dendl;
      for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
        apply(
          e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
          [&](uint64_t pos, mempool_dynamic_bitset &bs) {
            assert(pos < bs.size());
            bs.set(pos);
          });
      }
    }
  }
  dout(1) << __func__ << " checking freelist vs allocated" << dendl;
  {
    // remove bluefs_extents from used set since the freelist doesn't
    // know they are allocated.
    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply(
        e.get_start(), e.get_len(), fm->get_alloc_size(), used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          assert(pos < bs.size());
          bs.reset(pos);
        });
    }
    fm->enumerate_reset();
    uint64_t offset, length;
    while (fm->enumerate_next(&offset, &length)) {
      bool intersects = false;
      apply(
        offset, length, fm->get_alloc_size(), used_blocks,
        [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          assert(pos < bs.size());
          if (bs.test(pos)) {
            intersects = true;
          } else {
            bs.set(pos);
          }
        });
      if (intersects) {
        if (offset == SUPER_RESERVED &&
            length == min_alloc_size - SUPER_RESERVED) {
          // this is due to the change just after luminous to min_alloc_size
          // granularity allocations, and our baked in assumption at the top
          // of _fsck that 0~ROUND_UP_TO(SUPER_RESERVED,min_alloc_size) is used
          // (vs luminous's ROUND_UP_TO(SUPER_RESERVED,block_size)).  harmless,
          // since we will never allocate this region below min_alloc_size.
          dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
                   << " and min_alloc_size, 0x" << std::hex << offset << "~"
                   << length << std::dec << dendl;
        } else {
          derr << "fsck error: free extent 0x" << std::hex << offset
               << "~" << length << std::dec
               << " intersects allocated blocks" << dendl;
          ++errors;
        }
      }
    }
    fm->enumerate_reset();
    size_t count = used_blocks.count();
    if (used_blocks.size() != count) {
      assert(used_blocks.size() > count);
      used_blocks.flip();
      size_t start = used_blocks.find_first();
      while (start != decltype(used_blocks)::npos) {
        size_t cur = start;
        while (true) {
          size_t next = used_blocks.find_next(cur);
          if (next != cur + 1) {
            ++errors;
            derr << "fsck error: leaked extent 0x" << std::hex
                 << ((uint64_t)start * fm->get_alloc_size()) << "~"
                 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
                 << dendl;
            start = next;
            break;
          }
          cur = next;
        }
      }
      used_blocks.flip();
    }
  }
 out_scan:
  mempool_thread.shutdown();
  _flush_cache();
 out_alloc:
  _close_alloc();
 out_fm:
  _close_fm();
 out_db:
  it.reset();  // before db is closed
  _close_db();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();

  // fatal errors take precedence
  if (r < 0)
    return r;

  dout(2) << __func__ << " " << num_objects << " objects, "
          << num_sharded_objects << " of them sharded."
          << dendl;
  dout(2) << __func__ << " " << num_extents << " extents to "
          << num_blobs << " blobs, "
          << num_spanning_blobs << " spanning, "
          << num_shared_blobs << " shared."
          << dendl;

  utime_t duration = ceph_clock_now() - start;
  dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
          << " repaired, " << (errors - repaired) << " remaining in "
          << duration << " seconds" << dendl;
  return errors - repaired;
}
void BlueStore::collect_metadata(map<string,string> *pm)
{
  dout(10) << __func__ << dendl;
  bdev->collect_metadata("bluestore_bdev_", pm);
  if (bluefs) {
    (*pm)["bluefs"] = "1";
    (*pm)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device);
    bluefs->collect_metadata(pm);
  } else {
    (*pm)["bluefs"] = "0";
  }
}
int BlueStore::statfs(struct store_statfs_t *buf)
{
  buf->reset();
  buf->total = bdev->get_size();
  buf->available = alloc->get_free();

  if (bluefs) {
    // part of our shared device is "free" according to BlueFS, but we
    // can't touch bluestore_bluefs_min of it.
    int64_t shared_available = std::min(
      bluefs->get_free(bluefs_shared_bdev),
      bluefs->get_total(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min);
    if (shared_available > 0) {
      buf->available += shared_available;
    }
  }

  {
    std::lock_guard<std::mutex> l(vstatfs_lock);
    buf->allocated = vstatfs.allocated();
    buf->stored = vstatfs.stored();
    buf->compressed = vstatfs.compressed();
    buf->compressed_original = vstatfs.compressed_original();
    buf->compressed_allocated = vstatfs.compressed_allocated();
  }

  dout(20) << __func__ << *buf << dendl;
  return 0;
}
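// Worked example (illustrative): on a 100 GiB shared device where BlueFS
// holds 10 GiB total with 4 GiB free, and bluestore_bluefs_min = 1 GiB,
// shared_available = min(4 GiB, 10 GiB - 1 GiB) = 4 GiB, so 4 GiB of the
// BlueFS share is reported back to the caller as available.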
BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
{
  RWLock::RLocker l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}
void BlueStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
  // _reap_collections and this run in the same thread,
  // so we don't need a lock.
  removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
  list<CollectionRef> removed_colls;
  {
    // _queue_reap_collection and this run in the same thread,
    // so we don't need a lock.
    if (!removed_collections.empty())
      removed_colls.swap(removed_collections);
    else
      return;
  }

  list<CollectionRef>::iterator p = removed_colls.begin();
  while (p != removed_colls.end()) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
    if (c->onode_map.map_any([&](OnodeRef o) {
          assert(!o->exists);
          if (o->flushing_count.load()) {
            dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
                     << " flush_txns " << o->flushing_count << dendl;
            return true;
          }
          return false;
        })) {
      ++p;
      continue;
    }
    c->onode_map.clear();
    p = removed_colls.erase(p);
    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
  }
  if (removed_colls.empty()) {
    dout(10) << __func__ << " all reaped" << dendl;
  } else {
    removed_collections.splice(removed_collections.begin(), removed_colls);
  }
}
void BlueStore::_update_cache_logger()
{
  uint64_t num_onodes = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_buffers = 0;
  uint64_t num_buffer_bytes = 0;
  for (auto c : cache_shards) {
    c->add_stats(&num_onodes, &num_extents, &num_blobs,
                 &num_buffers, &num_buffer_bytes);
  }
  logger->set(l_bluestore_onodes, num_onodes);
  logger->set(l_bluestore_extents, num_extents);
  logger->set(l_bluestore_blobs, num_blobs);
  logger->set(l_bluestore_buffers, num_buffers);
  logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
}
ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
{
  return _get_collection(cid);
}

bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return false;
  return exists(c, oid);
}
bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return false;

  bool r = true;

  {
    RWLock::RLocker l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      r = false;
  }

  return r;
}
int BlueStore::stat(
  const coll_t& cid,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return stat(c, oid, st, allow_eio);
}

int BlueStore::stat(
  CollectionHandle &c_,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;

  {
    RWLock::RLocker l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      return -ENOENT;
    st->st_size = o->onode.size;
    st->st_blksize = 4096;
    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
    st->st_nlink = 1;
  }

  int r = 0;
  if (_debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  return r;
}
int BlueStore::set_collection_opts(
  const coll_t& cid,
  const pool_opts_t& opts)
{
  CollectionHandle ch = _get_collection(cid);
  if (!ch)
    return -ENOENT;
  Collection *c = static_cast<Collection *>(ch.get());
  dout(15) << __func__ << " " << cid << " options " << opts << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::WLocker l(c->lock);
  c->pool_opts = opts;
  return 0;
}
int BlueStore::read(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return read(c, oid, offset, length, bl, op_flags);
}

int BlueStore::read(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  utime_t start = ceph_clock_now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    RWLock::RLocker l(c->lock);
    utime_t start1 = ceph_clock_now();
    OnodeRef o = c->get_onode(oid, false);
    logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (offset == length && offset == 0)
      length = o->onode.size;
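    // An offset/length pair of (0, 0) is the caller's shorthand for
    // "read the whole object"; it is expanded to the onode size here.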
    r = _do_read(c, o, offset, length, bl, op_flags);
    if (r == -EIO) {
      logger->inc(l_bluestore_read_eio);
    }
  }

 out:
  if (r == 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  } else if (cct->_conf->bluestore_debug_random_read_err &&
             (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
                             100.0)) == 0) {
    dout(0) << __func__ << ": inject random EIO" << dendl;
    r = -EIO;
  }
  dout(10) << __func__ << " " << cid << " " << oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
  return r;
}
// --------------------------------------------------------
// intermediate data structures used while reading

struct region_t {
  uint64_t logical_offset;
  uint64_t blob_xoffset;   //region offset within the blob
  uint64_t length;
  bufferlist bl;

  // used later in read process
  uint64_t front = 0;
  uint64_t r_off = 0;

  region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
    : logical_offset(offset),
      blob_xoffset(b_offs),
      length(len) {}
  region_t(const region_t& from)
    : logical_offset(from.logical_offset),
      blob_xoffset(from.blob_xoffset),
      length(from.length) {}

  friend ostream& operator<<(ostream& out, const region_t& r) {
    return out << "0x" << std::hex << r.logical_offset << ":"
               << r.blob_xoffset << "~" << r.length << std::dec;
  }
};

typedef list<region_t> regions2read_t;
typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
int BlueStore::_do_read(
  Collection *c,
  OnodeRef o,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  int r = 0;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size 0x" << o->onode.size << " (" << std::dec
           << o->onode.size << ")" << dendl;
  bl.clear();

  if (offset >= o->onode.size) {
    return r;
  }

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  utime_t start = ceph_clock_now();
  o->extent_map.fault_range(db, offset, length);
  logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);
  ready_regions_t ready_regions;

  // build a blob-wise list of the stuff to read (that isn't cached)
  blobs2read_t blobs2read;
  unsigned left = length;
  uint64_t pos = offset;
  unsigned num_regions = 0;
  auto lp = o->extent_map.seek_lextent(offset);
  while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
        break;
      }
      dout(30) << __func__ << "  hole 0x" << std::hex << pos << "~" << hole
               << std::dec << dendl;
      pos += hole;
      left -= hole;
    }
    BlobRef& bptr = lp->blob;
    unsigned l_off = pos - lp->logical_offset;
    unsigned b_off = l_off + lp->blob_offset;
    unsigned b_len = std::min(left, lp->length - l_off);

    ready_regions_t cache_res;
    interval_set<uint32_t> cache_interval;
    bptr->shared_blob->bc.read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
    dout(20) << __func__ << "  blob " << *bptr << std::hex
             << " need 0x" << b_off << "~" << b_len
             << " cache has 0x" << cache_interval
             << std::dec << dendl;

    auto pc = cache_res.begin();
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
          pc->first == b_off) {
        l = pc->second.length();
        ready_regions[pos].claim(pc->second);
        dout(30) << __func__ << "    use cache 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        ++pc;
      } else {
        l = b_len;
        if (pc != cache_res.end()) {
          assert(pc->first > b_off);
          l = pc->first - b_off;
        }
        dout(30) << __func__ << "    will read 0x" << std::hex << pos << ": 0x"
                 << b_off << "~" << l << std::dec << dendl;
        blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
        ++num_regions;
      }
      pos += l;
      b_off += l;
      left -= l;
      b_len -= l;
    }
    ++lp;
  }
  // read raw blob data.  use aio if we have >1 blobs to read.
  start = ceph_clock_now(); // for the sake of simplicity
                            // measure the whole block below.
                            // The error isn't that much...
  vector<bufferlist> compressed_blob_bls;
  IOContext ioc(cct, NULL, true); // allow EIO
  for (auto& p : blobs2read) {
    const BlobRef& bptr = p.first;
    dout(20) << __func__ << "  blob " << *bptr << std::hex
             << " need " << p.second << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
      if (compressed_blob_bls.empty()) {
        // ensure we avoid any reallocation on subsequent blobs
        compressed_blob_bls.reserve(blobs2read.size());
      }
      compressed_blob_bls.push_back(bufferlist());
      bufferlist& bl = compressed_blob_bls.back();
      r = bptr->get_blob().map(
        0, bptr->get_blob().get_ondisk_length(),
        [&](uint64_t offset, uint64_t length) {
          // use aio if there are more regions to read than those in this blob
          if (num_regions > p.second.size()) {
            r = bdev->aio_read(offset, length, &bl, &ioc);
          } else {
            r = bdev->read(offset, length, &bl, &ioc, false);
          }
          return r;
        });
      if (r < 0) {
        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
        if (r == -EIO) {
          // propagate EIO to caller
          return r;
        }
        assert(r == 0);
      }
    } else {
6663 for (auto& reg
: p
.second
) {
6664 // determine how much of the blob to read
6665 uint64_t chunk_size
= bptr
->get_blob().get_chunk_size(block_size
);
6666 reg
.r_off
= reg
.blob_xoffset
;
6667 uint64_t r_len
= reg
.length
;
6668 reg
.front
= reg
.r_off
% chunk_size
;
6670 reg
.r_off
-= reg
.front
;
6673 unsigned tail
= r_len
% chunk_size
;
6675 r_len
+= chunk_size
- tail
;
6677 dout(20) << __func__
<< " region 0x" << std::hex
6678 << reg
.logical_offset
6679 << ": 0x" << reg
.blob_xoffset
<< "~" << reg
.length
6680 << " reading 0x" << reg
.r_off
<< "~" << r_len
<< std::dec
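        // Worked example (illustrative): with chunk_size = 0x1000 and a
        // region at blob_xoffset 0x1800 of length 0x1000, front = 0x800 and
        // the read is widened to r_off = 0x1000, r_len = 0x1800; the tail
        // (0x1800 % 0x1000 = 0x800) then pads r_len to 0x2000, i.e. the two
        // whole 4 KiB chunks covering the region are read, and the extra
        // bytes are pruned off after checksum verification.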
        r = bptr->get_blob().map(
          reg.r_off, r_len,
          [&](uint64_t offset, uint64_t length) {
            // read it
            // use aio if there is more than one region to read
            if (num_regions > 1) {
              r = bdev->aio_read(offset, length, &reg.bl, &ioc);
            } else {
              r = bdev->read(offset, length, &reg.bl, &ioc, false);
            }
            return r;
          });
        if (r < 0) {
          derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
               << dendl;
          if (r == -EIO) {
            // propagate EIO to caller
            return r;
          }
          assert(r == 0);
        }
        assert(reg.bl.length() == r_len);
      }
    }
  }
  if (ioc.has_pending_aios()) {
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
    r = ioc.get_return_value();
    if (r < 0) {
      assert(r == -EIO); // no other errors allowed
      return -EIO;
    }
  }
  logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);
  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    const BlobRef& bptr = b2r_it->first;
    dout(20) << __func__ << "  blob " << *bptr << std::hex
             << " need 0x" << b2r_it->second << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
                       b2r_it->second.front().logical_offset) < 0) {
        return -EIO;
      }
      bufferlist raw_bl;
      r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
        return r;
      if (buffered) {
        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
                                       raw_bl);
      }
      for (auto& i : b2r_it->second) {
        ready_regions[i.logical_offset].substr_of(
          raw_bl, i.blob_xoffset, i.length);
      }
    } else {
      for (auto& reg : b2r_it->second) {
        if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
                         reg.logical_offset) < 0) {
          return -EIO;
        }
        if (buffered) {
          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
                                         reg.r_off, reg.bl);
        }

        // prune and keep result
        ready_regions[reg.logical_offset].substr_of(
          reg.bl, reg.front, reg.length);
      }
    }
    ++b2r_it;
  }
  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
  pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": data from 0x" << pr->first << "~" << pr->second.length()
               << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
    } else {
      uint64_t l = length - pos;
      if (pr != pr_end) {
        assert(pr->first > pos + offset);
        l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
               << ": zeros for 0x" << (pos + offset) << "~" << l
               << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
  }
  assert(bl.length() == length);
  assert(pos == length);
  assert(pr == pr_end);
  r = length;
  return r;
}
int BlueStore::_verify_csum(OnodeRef& o,
                            const bluestore_blob_t* blob, uint64_t blob_xoffset,
                            const bufferlist& bl,
                            uint64_t logical_offset) const
{
  int bad;
  uint64_t bad_csum;
  utime_t start = ceph_clock_now();
  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
  if (r < 0) {
    if (r == -1) {
      PExtentVector pex;
      blob->map(
        bad,
        blob->get_csum_chunk_size(),
        [&](uint64_t offset, uint64_t length) {
          pex.emplace_back(bluestore_pextent_t(offset, length));
          return 0;
        });
      derr << __func__ << " bad "
           << Checksummer::get_csum_type_string(blob->csum_type)
           << "/0x" << std::hex << blob->get_csum_chunk_size()
           << " checksum at blob offset 0x" << bad
           << ", got 0x" << bad_csum << ", expected 0x"
           << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
           << ", device location " << pex
           << ", logical extent 0x" << std::hex
           << (logical_offset + bad - blob_xoffset) << "~"
           << blob->get_csum_chunk_size() << std::dec
           << ", object " << o->oid
           << dendl;
    } else {
      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
    }
  }
  logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
  return r;
}
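
// Editorial note (illustrative, not from the original source): a short
// arithmetic example for the "logical extent" printed above.  If a blob
// was read at blob_xoffset 0x2000 for a region whose logical_offset is
// 0x10000, and verify_csum reports bad = 0x3000, the damaged logical
// range starts at
//
//   logical_offset + bad - blob_xoffset = 0x10000 + 0x3000 - 0x2000 = 0x11000
//
// and spans one csum chunk.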
int BlueStore::_decompress(bufferlist& source, bufferlist* result)
{
  int r = 0;
  utime_t start = ceph_clock_now();
  bufferlist::iterator i = source.begin();
  bluestore_compression_header_t chdr;
  ::decode(chdr, i);
  int alg = int(chdr.type);
  CompressorRef cp = compressor;
  if (!cp || (int)cp->get_type() != alg) {
    cp = Compressor::create(cct, alg);
  }

  if (!cp.get()) {
    // if compressor isn't available - error, because cannot return
    // decompressed data?
    derr << __func__ << " can't load decompressor " << alg << dendl;
    r = -EIO;
  } else {
    r = cp->decompress(i, chdr.length, *result);
    if (r < 0) {
      derr << __func__ << " decompression failed with exit code " << r << dendl;
      r = -EIO;
    }
  }
  logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
  return r;
}
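
// Editorial note (illustrative, not from the original source): a hedged
// sketch of the layout _decompress() expects in `source`, implied by the
// ::decode(chdr, i) above:
//
//   [ bluestore_compression_header_t: algorithm type + decompressed length ]
//   [ compressed payload; expands to chdr.length bytes ]
//
// The header is consumed first, and the remaining iterator is handed to
// the compressor plugin.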
// this stores fiemap into interval_set, other variations
// use it internally
int BlueStore::_fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  interval_set<uint64_t>& destset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  {
    RWLock::RLocker l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      return -ENOENT;
    }

    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
             << " size 0x" << o->onode.size << std::dec << dendl;

    boost::intrusive::set<Extent>::iterator ep, eend;
    if (offset >= o->onode.size)
      goto out;

    if (offset + length > o->onode.size) {
      length = o->onode.size - offset;
    }

    o->extent_map.fault_range(db, offset, length);
    eend = o->extent_map.extent_map.end();
    ep = o->extent_map.seek_lextent(offset);
    while (length > 0) {
      dout(20) << __func__ << " offset " << offset << dendl;
      if (ep != eend && ep->logical_offset + ep->length <= offset) {
        ++ep;
        continue;
      }

      uint64_t x_len = length;
      if (ep != eend && ep->logical_offset <= offset) {
        uint64_t x_off = offset - ep->logical_offset;
        x_len = MIN(x_len, ep->length - x_off);
        dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
                 << x_len << std::dec << " blob " << ep->blob << dendl;
        destset.insert(offset, x_len);
        length -= x_len;
        offset += x_len;
        if (x_off + x_len == ep->length)
          ++ep;
        continue;
      }
      if (ep != eend &&
          ep->logical_offset > offset &&
          ep->logical_offset - offset < x_len) {
        x_len = ep->logical_offset - offset;
      }
      offset += x_len;
      length -= x_len;
    }
  }

 out:
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " size = 0x(" << destset << ")" << std::dec << dendl;
  return 0;
}
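
// Editorial note (illustrative, not from the original source): a minimal
// caller sketch for the interval_set flavor above (hypothetical handle
// and values; error handling elided).  Holes in the object simply do not
// appear in the result set:
//
//   interval_set<uint64_t> m;
//   int r = store->_fiemap(ch, oid, 0, 0x10000, m);
//   if (r >= 0) {
//     for (auto p = m.begin(); p != m.end(); ++p) {
//       // p.get_start()~p.get_len() is a data-backed logical extent
//     }
//   }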
int BlueStore::fiemap(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t len,
  bufferlist& bl)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return fiemap(c, oid, offset, len, bl);
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    ::encode(m, bl);
  }
  return r;
}

int BlueStore::fiemap(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t len,
  map<uint64_t, uint64_t>& destmap)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return fiemap(c, oid, offset, len, destmap);
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  map<uint64_t, uint64_t>& destmap)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    m.move_into(destmap);
  }
  return r;
}
int BlueStore::getattr(
  const coll_t& cid,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return getattr(c, oid, name, value);
}

int BlueStore::getattr(
  CollectionHandle &c_,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    RWLock::RLocker l(c->lock);
    mempool::bluestore_cache_other::string k(name);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (!o->onode.attrs.count(k)) {
      r = -ENODATA;
      goto out;
    }
    value = o->onode.attrs[k];
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
           << " = " << r << dendl;
  return r;
}

int BlueStore::getattrs(
  const coll_t& cid,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return getattrs(c, oid, aset);
}

int BlueStore::getattrs(
  CollectionHandle &c_,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    RWLock::RLocker l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    for (auto& i : o->onode.attrs) {
      aset.emplace(i.first.c_str(), i.second);
    }
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid
           << " = " << r << dendl;
  return r;
}
int BlueStore::list_collections(vector<coll_t>& ls)
{
  RWLock::RLocker l(coll_lock);
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}

bool BlueStore::collection_exists(const coll_t& c)
{
  RWLock::RLocker l(coll_lock);
  return coll_map.count(c);
}

int BlueStore::collection_empty(const coll_t& cid, bool *empty)
{
  dout(15) << __func__ << " " << cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
                          &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
         << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
  return 0;
}

int BlueStore::collection_bits(const coll_t& cid)
{
  dout(15) << __func__ << " " << cid << dendl;
  CollectionRef c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}
int BlueStore::collection_list(
  const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return collection_list(c, start, end, max, ls, pnext);
}

int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    RWLock::RLocker l(c->lock);
    r = _collection_list(c, start, end, max, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
           << " start " << start << " end " << end << " max " << max
           << " = " << r << ", ls.size() = " << ls->size()
           << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}

int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  if (!c->exists)
    return -ENOENT;

  int r = 0;
  ghobject_t static_next;
  KeyValueDB::Iterator it;
  string temp_start_key, temp_end_key;
  string start_key, end_key;
  bool set_next = false;
  string pend;
  bool temp;

  if (!pnext)
    pnext = &static_next;

  if (start == ghobject_t::get_max() ||
      start.hobj.is_max()) {
    goto out;
  }
  get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
                     &start_key, &end_key);
  dout(20) << __func__
           << " range " << pretty_binary_string(temp_start_key)
           << " to " << pretty_binary_string(temp_end_key)
           << " and " << pretty_binary_string(start_key)
           << " to " << pretty_binary_string(end_key)
           << " start " << start << dendl;
  it = db->get_iterator(PREFIX_OBJ);
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(temp_start_key);
    temp = true;
  } else {
    string k;
    get_object_key(cct, start, &k);
    if (start.hobj.is_temp()) {
      temp = true;
      assert(k >= temp_start_key && k < temp_end_key);
    } else {
      temp = false;
      assert(k >= start_key && k < end_key);
    }
    dout(20) << " start from " << pretty_binary_string(k)
             << " temp=" << (int)temp << dendl;
    it->lower_bound(k);
  }
  if (end.hobj.is_max()) {
    pend = temp ? temp_end_key : end_key;
  } else {
    get_object_key(cct, end, &end_key);
    if (end.hobj.is_temp()) {
      if (temp)
        pend = end_key;
      else
        goto out;
    } else {
      pend = temp ? temp_end_key : end_key;
    }
  }
  dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
  while (true) {
    if (!it->valid() || it->key() >= pend) {
      if (!it->valid())
        dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
        dout(20) << __func__ << " key " << pretty_binary_string(it->key())
                 << " >= " << end << dendl;
      if (temp) {
        if (end.hobj.is_temp()) {
          break;
        }
        dout(30) << __func__ << " switch to non-temp namespace" << dendl;
        temp = false;
        it->upper_bound(start_key);
        pend = end_key;
        dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
        continue;
      }
      break;
    }
    dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
    if (is_extent_shard_key(it->key())) {
      it->next();
      continue;
    }
    ghobject_t oid;
    int r = get_key_object(it->key(), &oid);
    assert(r == 0);
    dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = oid;
      set_next = true;
      break;
    }
    ls->push_back(oid);
    it->next();
  }
 out:
  if (!set_next) {
    *pnext = ghobject_t::get_max();
  }
  return r;
}
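
// Editorial note (illustrative, not from the original source):
// _collection_list() walks two disjoint key ranges because temp objects
// sort into their own namespace.  For a collection whose hash bits map to
// [start_key, end_key), the scan order is
//
//   [temp_start_key, temp_end_key)   then   [start_key, end_key)
//
// which is why the iterator is re-seeded with upper_bound(start_key) once
// the temp range is exhausted above.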
int BlueStore::omap_get(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  bufferlist *header,               ///< [out] omap header
  map<string, bufferlist> *out      ///< [out] Key to value map
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get(c, oid, header, out);
}

int BlueStore::omap_get(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  bufferlist *header,               ///< [out] omap header
  map<string, bufferlist> *out      ///< [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_header(o->onode.nid, &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
        dout(30) << __func__ << " got header" << dendl;
        *header = it->value();
      } else if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        string user_key;
        decode_omap_key(it->key(), &user_key);
        dout(30) << __func__ << " got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        (*out)[user_key] = it->value();
      }
      it->next();
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_header(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  bufferlist *header,               ///< [out] omap header
  bool allow_eio                    ///< [in] don't assert on eio
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_header(c, oid, header, allow_eio);
}

int BlueStore::omap_get_header(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  bufferlist *header,               ///< [out] omap header
  bool allow_eio                    ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    string head;
    get_omap_header(o->onode.nid, &head);
    if (db->get(PREFIX_OMAP, head, header) >= 0) {
      dout(30) << __func__ << " got header" << dendl;
    } else {
      dout(30) << __func__ << " no header" << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_keys(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  set<string> *keys                 ///< [out] Keys defined on oid
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_keys(c, oid, keys);
}

int BlueStore::omap_get_keys(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  set<string> *keys                 ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_key(o->onode.nid, string(), &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      }
      string user_key;
      decode_omap_key(it->key(), &user_key);
      dout(30) << __func__ << " got " << pretty_binary_string(it->key())
               << " -> " << user_key << dendl;
      keys->insert(user_key);
      it->next();
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_values(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  const set<string> &keys,          ///< [in] Keys to get
  map<string, bufferlist> *out      ///< [out] Returned keys and values
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_values(c, oid, keys, out);
}

int BlueStore::omap_get_values(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  const set<string> &keys,          ///< [in] Keys to get
  map<string, bufferlist> *out      ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
    final_key.resize(9); // keep prefix
    final_key += *p;
    bufferlist val;
    if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
      dout(30) << __func__ << " got " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
      out->insert(make_pair(*p, val));
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_check_keys(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  const set<string> &keys,          ///< [in] Keys to check
  set<string> *out                  ///< [out] Subset of keys defined on oid
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_check_keys(c, oid, keys, out);
}

int BlueStore::omap_check_keys(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  const set<string> &keys,          ///< [in] Keys to check
  set<string> *out                  ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
    final_key.resize(9); // keep prefix
    final_key += *p;
    bufferlist val;
    if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
      dout(30) << __func__ << " have " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
      out->insert(*p);
    } else {
      dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  const coll_t& cid,                ///< [in] collection
  const ghobject_t &oid             ///< [in] object
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c) {
    dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  return get_omap_iterator(c, oid);
}

ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,             ///< [in] collection
  const ghobject_t &oid             ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  RWLock::RLocker l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  o->flush();
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}
// -----------------
// write helpers

void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    ::encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    ::encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}
int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      uint64_t v;
      ::decode(v, p);
      nid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      uint64_t v;
      ::decode(v, p);
      blobid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
      dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
    } else {
      assert("Not Support extent freelist manager" == 0);
    }
  }

  // bluefs alloc
  if (cct->_conf->bluestore_bluefs) {
    bluefs_extents.clear();
    bufferlist bl;
    db->get(PREFIX_SUPER, "bluefs_extents", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      ::decode(bluefs_extents, p);
    }
    catch (buffer::error& e) {
      derr << __func__ << " unable to read bluefs_extents" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
             << std::dec << dendl;
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.begin();
      try {
        ::decode(ondisk_format, p);
      } catch (buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        auto p = bl.begin();
        try {
          ::decode(compat_ondisk_format, p);
        } catch (buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(10) << __func__ << " ondisk_format " << ondisk_format
             << " compat_ondisk_format " << compat_ondisk_format
             << dendl;
  }

  if (latest_ondisk_format < compat_ondisk_format) {
    derr << __func__ << " compat_ondisk_format is "
         << compat_ondisk_format << " but we only understand version "
         << latest_ondisk_format << dendl;
    return -EPERM;
  }
  if (ondisk_format < latest_ondisk_format) {
    int r = _upgrade_super();
    if (r < 0) {
      return r;
    }
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.begin();
    try {
      uint64_t val;
      ::decode(val, p);
      min_alloc_size = val;
      min_alloc_size_order = ctz(val);
      assert(min_alloc_size == 1u << min_alloc_size_order);
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
             << std::dec << dendl;
  }

  _set_throttle_params();
  return 0;
}
int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  assert(ondisk_format > 0);
  assert(ondisk_format < latest_ondisk_format);

  if (ondisk_format == 1) {
    // changes:
    // - super: added ondisk_format
    // - super: added min_readable_ondisk_format
    // - super: added min_compat_ondisk_format
    // - super: added min_alloc_size
    // - super: removed min_min_alloc_size
    KeyValueDB::Transaction t = db->get_transaction();
    {
      bufferlist bl;
      db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
      auto p = bl.begin();
      try {
        uint64_t val;
        ::decode(val, p);
        min_alloc_size = val;
      } catch (buffer::error& e) {
        derr << __func__ << " failed to read min_min_alloc_size" << dendl;
        return -EIO;
      }
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
      t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
    }
    ondisk_format = 2;
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    assert(r == 0);
  }

  // done
  dout(1) << __func__ << " done" << dendl;
  return 0;
}
void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid)
    return;
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
}

uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}
void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}

BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
{
  TransContext *txc = new TransContext(cct, osr);
  txc->t = db->get_transaction();
  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}
void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // this is about the simplest model for transaction cost you can
  // imagine.  there is some fixed overhead cost by saying there is a
  // minimum of one "io".  and then we have some cost per "io" that is
  // a configurable (with different hdd and ssd defaults), and add
  // that to the bytes value.
  int ios = 1;  // one "io" for the kv commit
  for (auto& p : txc->ioc.pending_aios) {
    ios += p.iov.size();
  }
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}
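
// Editorial note (illustrative, not from the original source): a quick
// worked example of the cost model above.  The per-io figure is
// hypothetical and in practice comes from configuration via
// throttle_cost_per_io:
//
//   pending_aios with iov sizes 3 and 5, txc->bytes = 4096, cost/io = 670000
//   ios  = 1 + 3 + 5 = 9
//   cost = 9 * 670000 + 4096 = 6034096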
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated,
              txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original,
              txc->statfs_delta.compressed_original());

  {
    std::lock_guard<std::mutex> l(vstatfs_lock);
    vstatfs += txc->statfs_delta;
  }

  bufferlist bl;
  txc->statfs_delta.encode(bl);

  txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
  txc->statfs_delta.reset();
}
void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
             << " " << txc->get_state_name() << dendl;
    switch (txc->state) {
    case TransContext::STATE_PREPARE:
      txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
        txc->state = TransContext::STATE_AIO_WAIT;
        txc->had_ios = true;
        _txc_aio_submit(txc);
        return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      //assert(txc->osr->qlock.is_locked());  // see _txc_finish_io
      if (txc->had_ios) {
        ++txc->osr->txc_with_unstable_io;
      }
      txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
      txc->state = TransContext::STATE_KV_QUEUED;
      if (cct->_conf->bluestore_sync_submit_transaction) {
        if (txc->last_nid >= nid_max ||
            txc->last_blobid >= blobid_max) {
          dout(20) << __func__
                   << " last_{nid,blobid} exceeds max, submit via kv thread"
                   << dendl;
        } else if (txc->osr->kv_committing_serially) {
          dout(20) << __func__ << " prior txc submitted via kv thread, us too"
                   << dendl;
          // note: this is starvation-prone.  once we have a txc in a busy
          // sequencer that is committing serially it is possible to keep
          // submitting new transactions fast enough that we get stuck doing
          // so.  the alternative is to block here... fixme?
        } else if (txc->osr->txc_with_unstable_io) {
          dout(20) << __func__ << " prior txc(s) with unstable ios "
                   << txc->osr->txc_with_unstable_io.load() << dendl;
        } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
                   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
                   == 0) {
          dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
                   << dendl;
        } else {
          txc->state = TransContext::STATE_KV_SUBMITTED;
          int r = cct->_conf->bluestore_debug_omit_kv_commit ?
            0 : db->submit_transaction(txc->t);
          assert(r == 0);
          _txc_applied_kv(txc);
        }
      }
      {
        std::lock_guard<std::mutex> l(kv_lock);
        kv_queue.push_back(txc);
        kv_cond.notify_one();
        if (txc->state != TransContext::STATE_KV_SUBMITTED) {
          kv_queue_unsubmitted.push_back(txc);
          ++txc->osr->kv_committing_serially;
        }
        if (txc->had_ios) {
          kv_ios++;
        }
        kv_throttle_costs += txc->cost;
      }
      return;

    case TransContext::STATE_KV_SUBMITTED:
      txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
      txc->state = TransContext::STATE_KV_DONE;
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
        txc->state = TransContext::STATE_DEFERRED_QUEUED;
        _deferred_queue(txc);
        return;
      }
      txc->state = TransContext::STATE_FINISHING;
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
      txc->state = TransContext::STATE_FINISHING;
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
           << " state " << txc->get_state_name() << dendl;
      assert(0 == "unexpected txc state");
      return;
    }
  }
}
void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard<std::mutex> l(osr->qlock);
  txc->state = TransContext::STATE_IO_DONE;

  // release aio contexts (including pinned buffers).
  txc->ioc.running_aios.clear();

  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->state < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
               << p->get_state_name() << dendl;
      return;
    }
    if (p->state > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
           p->state == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters &&
      osr->_is_all_kv_submitted()) {
    osr->qcond.notify_all();
  }
}
void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
           << " onodes " << txc->onodes
           << " shared_blobs " << txc->shared_blobs
           << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    // finalize extent_map shards
    o->extent_map.update(t, false);
    if (o->extent_map.needs_reshard()) {
      o->extent_map.reshard(db, t);
      o->extent_map.update(t, true);
      if (o->extent_map.needs_reshard()) {
        dout(20) << __func__ << " warning: still wants reshard, check options?"
                 << dendl;
        o->extent_map.clear_needs_reshard();
      }
      logger->inc(l_bluestore_onode_reshard);
    }

    // bound encode
    size_t bound = 0;
    denc(o->onode, bound);
    o->extent_map.bound_encode_spanning_blobs(bound);
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, bound);
    }

    // encode
    bufferlist bl;
    unsigned onode_part, blob_part, extent_part;
    {
      auto p = bl.get_contiguous_appender(bound, true);
      denc(o->onode, p);
      onode_part = p.get_logical_offset();
      o->extent_map.encode_spanning_blobs(p);
      blob_part = p.get_logical_offset() - onode_part;
      if (o->onode.extent_map_shards.empty()) {
        denc(o->extent_map.inline_bl, p);
      }
      extent_part = p.get_logical_offset() - onode_part - blob_part;
    }

    dout(20) << "  onode " << o->oid << " is " << bl.length()
             << " (" << onode_part << " bytes onode + "
             << blob_part << " bytes spanning blobs + "
             << extent_part << " bytes inline extents)"
             << dendl;
    t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << "  shared_blob 0x" << std::hex << sbid << std::dec
               << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      ::encode(*(sb->persistent), bl);
      dout(20) << "  shared_blob 0x" << std::hex << sbid << std::dec
               << " is " << bl.length() << " " << *sb << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}
void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency.consume_next(
    logger.get_tavg_ms(
      l_bluestore_commit_lat));
  os_apply_latency.consume_next(
    logger.get_tavg_ms(
      l_bluestore_commit_lat));
}
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
           << " allocated 0x" << txc->allocated
           << " released 0x" << txc->released
           << std::dec << dendl;

  // We have to handle the case where we allocate *and* deallocate the
  // same region in this transaction.  The freelist doesn't like that.
  // (Actually, the only thing that cares is the BitmapFreelistManager
  // debug check.  But that's important.)
  interval_set<uint64_t> tmp_allocated, tmp_released;
  interval_set<uint64_t> *pallocated = &txc->allocated;
  interval_set<uint64_t> *preleased = &txc->released;
  if (!txc->allocated.empty() && !txc->released.empty()) {
    interval_set<uint64_t> overlap;
    overlap.intersection_of(txc->allocated, txc->released);
    if (!overlap.empty()) {
      tmp_allocated = txc->allocated;
      tmp_allocated.subtract(overlap);
      tmp_released = txc->released;
      tmp_released.subtract(overlap);
      dout(20) << __func__ << "  overlap 0x" << std::hex << overlap
               << ", new allocated 0x" << tmp_allocated
               << " released 0x" << tmp_released << std::dec
               << dendl;
      pallocated = &tmp_allocated;
      preleased = &tmp_released;
    }
  }

  // update freelist with non-overlap sets
  for (interval_set<uint64_t>::iterator p = pallocated->begin();
       p != pallocated->end();
       ++p) {
    fm->allocate(p.get_start(), p.get_len(), t);
  }
  for (interval_set<uint64_t>::iterator p = preleased->begin();
       p != preleased->end();
       ++p) {
    dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
             << "~" << p.get_len() << std::dec << dendl;
    fm->release(p.get_start(), p.get_len(), t);
  }

  _txc_update_store_statfs(txc);
}
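
// Editorial note (illustrative, not from the original source): a minimal
// example of the overlap subtraction above, with made-up extents.  If
// this txc allocated {0x1000~0x2000} and released {0x2000~0x2000}, then
//
//   overlap       = {0x2000~0x1000}
//   tmp_allocated = {0x1000~0x1000}
//   tmp_released  = {0x3000~0x1000}
//
// so the freelist never sees the same region both allocated and released
// within one transaction.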
void BlueStore::_txc_applied_kv(TransContext *txc)
{
  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
               << dendl;
      if (--o->flushing_count == 0) {
        std::lock_guard<std::mutex> l(o->flush_lock);
        o->flush_cond.notify_all();
      }
    }
  }
}
void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;

  // warning: we're calling onreadable_sync inside the sequencer lock
  if (txc->onreadable_sync) {
    txc->onreadable_sync->complete(0);
    txc->onreadable_sync = NULL;
  }
  unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
  if (txc->oncommit) {
    logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
    finishers[n]->queue(txc->oncommit);
    txc->oncommit = NULL;
  }
  if (txc->onreadable) {
    finishers[n]->queue(txc->onreadable);
    txc->onreadable = NULL;
  }

  if (!txc->oncommits.empty()) {
    finishers[n]->queue(txc->oncommits);
  }
}
void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  assert(txc->state == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->bc.finish_write(sb->get_cache(), txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  bool empty = false;
  bool submit_deferred = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard<std::mutex> l(osr->qlock);
    txc->state = TransContext::STATE_DONE;
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << "  txc " << txc << " " << txc->get_state_name()
               << dendl;
      if (txc->state != TransContext::STATE_DONE) {
        if (txc->state == TransContext::STATE_PREPARE &&
            deferred_aggressive) {
          // for _osr_drain_preceding()
          notify = true;
        }
        if (txc->state == TransContext::STATE_DEFERRED_QUEUED &&
            osr->q.size() > g_conf->bluestore_max_deferred_txc) {
          submit_deferred = true;
        }
        break;
      }

      osr->q.pop_front();
      releasing_txc.push_back(*txc);
      notify = true;
    }
    if (notify) {
      osr->qcond.notify_all();
    }
    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }
  }
  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    txc->log_state_latency(logger, l_bluestore_state_done_lat);
    delete txc;
  }

  if (submit_deferred) {
    // we're pinning memory; flush!  we could be more fine-grained here but
    // i'm not sure it's worth the bother.
    deferred_try_submit();
  }

  if (empty && osr->zombie) {
    dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    osr->_unregister();
  }
}
void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // update allocator with full released set
  if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
    dout(10) << __func__ << " " << txc << " " << std::hex
             << txc->released << std::dec << dendl;
    for (interval_set<uint64_t>::iterator p = txc->released.begin();
         p != txc->released.end();
         ++p) {
      alloc->release(p.get_start(), p.get_len());
    }
  }

  txc->allocated.clear();
  txc->released.clear();
}
void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    deferred_lock.lock();
    if (osr->deferred_pending) {
      _deferred_submit_unlock(osr);
    } else {
      deferred_lock.unlock();
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}
void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  {
    std::lock_guard<std::mutex> l(osr_lock);
    s = osr_set;
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
  {
    std::lock_guard<std::mutex> l(kv_finalize_lock);
    kv_finalize_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  dout(10) << __func__ << " done" << dendl;
}
void BlueStore::_osr_unregister_all()
{
  set<OpSequencerRef> s;
  {
    std::lock_guard<std::mutex> l(osr_lock);
    s = osr_set;
  }
  dout(10) << __func__ << " " << s << dendl;
  for (auto osr : s) {
    osr->_unregister();

    if (!osr->zombie) {
      // break link from Sequencer to us so that this OpSequencer
      // instance can die with this mount/umount cycle.  note that
      // we assume umount() will not race against ~Sequencer.
      assert(osr->parent);
      osr->parent->p.reset();
    }
  }
  // nobody should be creating sequencers during umount either.
  {
    std::lock_guard<std::mutex> l(osr_lock);
    assert(osr_set.empty());
  }
}
void BlueStore::_kv_start()
{
  dout(10) << __func__ << dendl;

  if (cct->_conf->bluestore_shard_finishers) {
    if (cct->_conf->osd_op_num_shards) {
      m_finisher_num = cct->_conf->osd_op_num_shards;
    } else {
      assert(bdev);
      if (bdev->is_rotational()) {
        m_finisher_num = cct->_conf->osd_op_num_shards_hdd;
      } else {
        m_finisher_num = cct->_conf->osd_op_num_shards_ssd;
      }
    }
  }

  assert(m_finisher_num != 0);

  for (int i = 0; i < m_finisher_num; ++i) {
    ostringstream oss;
    oss << "finisher-" << i;
    Finisher *f = new Finisher(cct, oss.str(), "finisher");
    finishers.push_back(f);
  }

  deferred_finisher.start();
  for (auto f : finishers) {
    f->start();
  }
  kv_sync_thread.create("bstore_kv_sync");
  kv_finalize_thread.create("bstore_kv_final");
}
void BlueStore::_kv_stop()
{
  dout(10) << __func__ << dendl;
  {
    std::unique_lock<std::mutex> l(kv_lock);
    while (!kv_sync_started) {
      kv_cond.wait(l);
    }
    kv_stop = true;
    kv_cond.notify_all();
  }
  {
    std::unique_lock<std::mutex> l(kv_finalize_lock);
    while (!kv_finalize_started) {
      kv_finalize_cond.wait(l);
    }
    kv_finalize_stop = true;
    kv_finalize_cond.notify_all();
  }
  kv_sync_thread.join();
  kv_finalize_thread.join();
  assert(removed_collections.empty());
  {
    std::lock_guard<std::mutex> l(kv_lock);
    kv_stop = false;
  }
  {
    std::lock_guard<std::mutex> l(kv_finalize_lock);
    kv_finalize_stop = false;
  }
  dout(10) << __func__ << " stopping finishers" << dendl;
  deferred_finisher.wait_for_empty();
  deferred_finisher.stop();
  for (auto f : finishers) {
    f->wait_for_empty();
    f->stop();
  }
  dout(10) << __func__ << " stopped" << dendl;
}
void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock<std::mutex> l(kv_lock);
  assert(!kv_sync_started);
  kv_sync_started = true;
  kv_cond.notify_all();
  while (true) {
    assert(kv_committing.empty());
    if (kv_queue.empty() &&
        ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
         !deferred_aggressive)) {
      if (kv_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      uint64_t aios = 0, costs = 0;

      dout(20) << __func__ << " committing " << kv_queue.size()
               << " submitting " << kv_queue_unsubmitted.size()
               << " deferred done " << deferred_done_queue.size()
               << " stable " << deferred_stable_queue.size()
               << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      aios = kv_ios;
      costs = kv_throttle_costs;
      kv_ios = 0;
      kv_throttle_costs = 0;
      utime_t start = ceph_clock_now();
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable. that means that if we do have done deferred
      // txcs AND we are not on a single device, we need to force a flush.
      if (bluefs_single_shared_device && bluefs) {
        if (aios) {
          force_flush = true;
        } else if (kv_committing.empty() && kv_submitting.empty() &&
                   deferred_stable.empty()) {
          force_flush = true;  // there's nothing else to commit!
        } else if (deferred_aggressive) {
          force_flush = true;
        }
      } else {
        force_flush = true;
      }

      if (force_flush) {
        dout(20) << __func__ << " num_aios=" << aios
                 << " force_flush=" << (int)force_flush
                 << ", flushing, deferred done->stable" << dendl;
        // flush/barrier on block device
        bdev->flush();

        // if we flush then deferred done are now deferred stable
        deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
                               deferred_done.end());
        deferred_done.clear();
      }
      utime_t after_flush = ceph_clock_now();

      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max?  note that this covers both the
      // case where we are approaching the max and the case we passed
      // it.  in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
        bufferlist bl;
        ::encode(new_nid_max, bl);
        t->set(PREFIX_SUPER, "nid_max", bl);
        dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
        bufferlist bl;
        ::encode(new_blobid_max, bl);
        t->set(PREFIX_SUPER, "blobid_max", bl);
        dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }

      for (auto txc : kv_committing) {
        if (txc->state == TransContext::STATE_KV_QUEUED) {
          txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
          int r = cct->_conf->bluestore_debug_omit_kv_commit ?
            0 : db->submit_transaction(txc->t);
          assert(r == 0);
          _txc_applied_kv(txc);
          --txc->osr->kv_committing_serially;
          txc->state = TransContext::STATE_KV_SUBMITTED;
          if (txc->osr->kv_submitted_waiters) {
            std::lock_guard<std::mutex> l(txc->osr->qlock);
            if (txc->osr->_is_all_kv_submitted()) {
              txc->osr->qcond.notify_all();
            }
          }
        } else {
          assert(txc->state == TransContext::STATE_KV_SUBMITTED);
          txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
        }
        if (txc->had_ios) {
          --txc->osr->txc_with_unstable_io;
        }
      }

      // release throttle *before* we commit.  this allows new ops
      // to be prepared and enter pipeline while we are waiting on
      // the kv commit sync/flush.  then hopefully on the next
      // iteration there will already be ops awake.  otherwise, we
      // end up going to sleep, and then wake up when the very first
      // transaction is ready for commit.
      throttle_bytes.put(costs);

      PExtentVector bluefs_gift_extents;
      if (bluefs &&
          after_flush - bluefs_last_balance >
          cct->_conf->bluestore_bluefs_balance_interval) {
        bluefs_last_balance = after_flush;
        int r = _balance_bluefs_freespace(&bluefs_gift_extents);
        assert(r >= 0);
        if (r > 0) {
          for (auto& p : bluefs_gift_extents) {
            bluefs_extents.insert(p.offset, p.length);
          }
          bufferlist bl;
          ::encode(bluefs_extents, bl);
          dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
                   << bluefs_extents << std::dec << dendl;
          synct->set(PREFIX_SUPER, "bluefs_extents", bl);
        }
      }

      // cleanup sync deferred keys
      for (auto b : deferred_stable) {
        for (auto& txc : b->txcs) {
          bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
          if (!wt.released.empty()) {
            // kraken replay compat only
            txc.released = wt.released;
            dout(10) << __func__ << " deferred txn has released "
                     << txc.released
                     << " (we just upgraded from kraken) on " << &txc << dendl;
            _txc_finalize_kv(&txc, synct);
          }
          // cleanup the deferred
          string key;
          get_deferred_key(wt.seq, &key);
          synct->rm_single_key(PREFIX_DEFERRED, key);
        }
      }

      // submit synct synchronously (block and wait for it to commit)
      int r = cct->_conf->bluestore_debug_omit_kv_commit ?
        0 : db->submit_transaction_sync(synct);
      assert(r == 0);

      if (new_nid_max) {
        nid_max = new_nid_max;
        dout(10) << __func__ << " nid_max now " << nid_max << dendl;
      }
      if (new_blobid_max) {
        blobid_max = new_blobid_max;
        dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
      }

      {
        utime_t finish = ceph_clock_now();
        utime_t dur_flush = after_flush - start;
        utime_t dur_kv = finish - after_flush;
        utime_t dur = finish - start;
        dout(20) << __func__ << " committed " << kv_committing.size()
                 << " cleaned " << deferred_stable.size()
                 << " in " << dur
                 << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
                 << dendl;
        logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
        logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
        logger->tinc(l_bluestore_kv_lat, dur);
      }

      if (bluefs) {
        if (!bluefs_gift_extents.empty()) {
          _commit_bluefs_freespace(bluefs_gift_extents);
        }
        for (auto p = bluefs_extents_reclaiming.begin();
             p != bluefs_extents_reclaiming.end();
             ++p) {
          dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
                   << p.get_start() << "~" << p.get_len() << std::dec
                   << dendl;
          alloc->release(p.get_start(), p.get_len());
        }
        bluefs_extents_reclaiming.clear();
      }

      {
        std::unique_lock<std::mutex> m(kv_finalize_lock);
        if (kv_committing_to_finalize.empty()) {
          kv_committing_to_finalize.swap(kv_committing);
        } else {
          kv_committing_to_finalize.insert(
            kv_committing_to_finalize.end(),
            kv_committing.begin(),
            kv_committing.end());
          kv_committing.clear();
        }
        if (deferred_stable_to_finalize.empty()) {
          deferred_stable_to_finalize.swap(deferred_stable);
        } else {
          deferred_stable_to_finalize.insert(
            deferred_stable_to_finalize.end(),
            deferred_stable.begin(),
            deferred_stable.end());
          deferred_stable.clear();
        }
        kv_finalize_cond.notify_one();
      }

      l.lock();
      // previously deferred "done" are now "stable" by virtue of this
      // commit cycle.
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_sync_started = false;
}
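
// Editorial note (illustrative, not from the original source): a worked
// example of the {nid,blobid}_max preallocation above, assuming the
// default bluestore_nid_prealloc = 1024.  With nid_last = 900 and
// nid_max = 1024, 900 + 512 > 1024, so the thread persists
// new_nid_max = 1924 in the earliest transaction of this batch;
// _assign_nid() can then hand out ids lock-free until the next refill.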
void BlueStore::_kv_finalize_thread()
{
  deque<TransContext*> kv_committed;
  deque<DeferredBatch*> deferred_stable;
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock<std::mutex> l(kv_finalize_lock);
  assert(!kv_finalize_started);
  kv_finalize_started = true;
  kv_finalize_cond.notify_all();
  while (true) {
    assert(kv_committed.empty());
    assert(deferred_stable.empty());
    if (kv_committing_to_finalize.empty() &&
        deferred_stable_to_finalize.empty()) {
      if (kv_finalize_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_finalize_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      kv_committed.swap(kv_committing_to_finalize);
      deferred_stable.swap(deferred_stable_to_finalize);
      l.unlock();
      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;

      while (!kv_committed.empty()) {
        TransContext *txc = kv_committed.front();
        assert(txc->state == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committed.pop_front();
      }

      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }
      deferred_stable.clear();

      if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle_deferred_bytes.past_midpoint()) {
          deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      l.lock();
    }
  }
  dout(10) << __func__ << " finish" << dendl;
  kv_finalize_started = false;
}
bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc, OnodeRef o)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
  deferred_lock.lock();
  if (!txc->osr->deferred_pending &&
      !txc->osr->deferred_running) {
    deferred_queue.push_back(*txc->osr);
  }
  if (!txc->osr->deferred_pending) {
    txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
  }
  ++deferred_queue_size;
  txc->osr->deferred_pending->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      txc->osr->deferred_pending->prepare_write(
        cct, wt.seq, e.offset, e.length, p);
    }
  }
  if (deferred_aggressive &&
      !txc->osr->deferred_running) {
    _deferred_submit_unlock(txc->osr.get());
  } else {
    deferred_lock.unlock();
  }
}
void BlueStore::deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  std::lock_guard<std::mutex> l(deferred_lock);
  vector<OpSequencerRef> osrs;
  osrs.reserve(deferred_queue.size());
  for (auto& osr : deferred_queue) {
    osrs.push_back(&osr);
  }
  for (auto& osr : osrs) {
    if (osr->deferred_pending) {
      if (!osr->deferred_running) {
        _deferred_submit_unlock(osr.get());
        deferred_lock.lock();
      } else {
        dout(20) << __func__ << "  osr " << osr << " already has running"
                 << dendl;
      }
    } else {
      dout(20) << __func__ << "  osr " << osr << " has no pending" << dendl;
    }
  }
}
void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  assert(osr->deferred_pending);
  assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_deferred_write_ops);
          logger->inc(l_bluestore_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << "   seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }

  deferred_lock.unlock();
  bdev->aio_submit(&b->ioc);
}
struct C_DeferredTrySubmit : public Context {
  BlueStore *store;
  C_DeferredTrySubmit(BlueStore *s) : store(s) {}
  void finish(int r) {
    store->deferred_try_submit();
  }
};
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    std::lock_guard<std::mutex> l(deferred_lock);
    assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      dout(20) << __func__ << " dequeueing" << dendl;
      auto q = deferred_queue.iterator_to(*osr);
      deferred_queue.erase(q);
    } else if (deferred_aggressive) {
      dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
      deferred_finisher.queue(new C_DeferredTrySubmit(this));
    } else {
      dout(20) << __func__ << " leaving queued, more pending" << dendl;
    }
  }

  {
    uint64_t costs = 0;
    std::lock_guard<std::mutex> l2(osr->qlock);
    for (auto& i : b->txcs) {
      TransContext *txc = &i;
      txc->state = TransContext::STATE_DEFERRED_CLEANUP;
      costs += txc->cost;
    }
    osr->qcond.notify_all();
    throttle_deferred_bytes.put(costs);
    std::lock_guard<std::mutex> l(kv_lock);
    deferred_done_queue.emplace_back(b);
  }

  // in the normal case, do not bother waking up the kv thread; it will
  // catch us on the next commit anyway.
  if (deferred_aggressive) {
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
}
int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  OpSequencerRef osr = new OpSequencer(cct, this);
  int count = 0;
  int r = 0;
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
	     << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    bufferlist::iterator p = bl.begin();
    try {
      ::decode(*deferred_txn, p);
    } catch (buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
	   << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(osr.get());
    txc->deferred_txn = deferred_txn;
    txc->state = TransContext::STATE_KV_DONE;
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_drain_all();
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}
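
// Replay sketch: every surviving deferred transaction lives under the
// PREFIX_DEFERRED keyspace, keyed by its sequence number (get_deferred_key
// encodes the u64 seq so that keys sort in submission order). Replay simply
// walks that keyspace from the start, re-queues each decoded transaction at
// STATE_KV_DONE, and lets the normal state machine re-issue the writes.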
// ---------------------------
// transactions

int BlueStore::queue_transactions(
  Sequencer *posr,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  Context *onreadable;
  Context *ondisk;
  Context *onreadable_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &onreadable, &ondisk, &onreadable_sync);

  if (cct->_conf->objectstore_blackhole) {
    dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
	    << dendl;
    delete ondisk;
    delete onreadable;
    delete onreadable_sync;
    return 0;
  }
  utime_t start = ceph_clock_now();
  // set up the sequencer
  OpSequencer *osr;
  assert(posr);
  if (posr->p) {
    osr = static_cast<OpSequencer *>(posr->p.get());
    dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
  } else {
    osr = new OpSequencer(cct, this);
    osr->parent = posr;
    posr->p = osr;
    dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
  }

  // prepare
  TransContext *txc = _txc_create(osr);
  txc->onreadable = onreadable;
  txc->onreadable_sync = onreadable_sync;
  txc->oncommit = ondisk;

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    ::encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);
  if (handle)
    handle->suspend_tp_timeout();

  utime_t tstart = ceph_clock_now();
  throttle_bytes.get(txc->cost);
  if (txc->deferred_txn) {
    // ensure we do not block here because of deferred writes
    if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
      dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
	       << dendl;
      ++deferred_aggressive;
      deferred_try_submit();
      {
	// wake up any previously finished deferred events
	std::lock_guard<std::mutex> l(kv_lock);
	kv_cond.notify_one();
      }
      throttle_deferred_bytes.get(txc->cost);
      --deferred_aggressive;
    }
  }
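
  // Throttle fallback sketch: a deferred txc must charge both throttle_bytes
  // and throttle_deferred_bytes. If the deferred budget is exhausted we do
  // not simply block; we bump deferred_aggressive so queued batches get
  // flushed immediately, wake the kv thread so finished batches return
  // their budget, and only then block on the deferred throttle.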
  utime_t tend = ceph_clock_now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
  logger->tinc(l_bluestore_throttle_lat, tend - tstart);
  return 0;
}

void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}
void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction(t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);
  }
  vector<OnodeRef> ovec(i.objects.size());

  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];
    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
	const coll_t &cid = i.get_cid(op->cid);
	r = _remove_collection(txc, cid, &c);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
	const coll_t &cid = i.get_cid(op->cid);
	r = _create_collection(txc, cid, op->split_bits, &c);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
	uint32_t bits = op->split_bits;
	uint32_t rem = op->split_rem;
	r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
	if (!r)
	  continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
	uint32_t type = op->hint_type;
	bufferlist hint;
	i.decode_bl(hint);
	bufferlist::iterator hiter = hint.begin();
	if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
	  uint32_t pg_num;
	  uint64_t num_objs;
	  ::decode(pg_num, hiter);
	  ::decode(num_objs, hiter);
	  dout(10) << __func__ << " collection hint objects is a no-op, "
		   << " pg_num " << pg_num << " num_objects " << num_objs
		   << dendl;
	} else {
	  // Ignore the hint
	  dout(10) << __func__ << " unknown collection hint " << type << dendl;
	}
	continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      assert(0 == "not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
	   << " not handled on operation " << op->op
	   << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction(t, 0);
      assert(0 == "unexpected error");
    }

    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
	op->op == Transaction::OP_WRITE ||
	op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    RWLock::WLocker l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
	       << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }

    switch (op->op) {
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
	uint64_t off = op->off;
	uint64_t len = op->len;
	uint32_t fadvise_flags = i.get_fadvise_flags();
	bufferlist bl;
	i.decode_bl(bl);
	r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
	uint64_t off = op->off;
	uint64_t len = op->len;
	r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
	// deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
	uint64_t off = op->off;
	r = _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
	r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
	string name = i.decode_string();
	bufferptr bp;
	i.decode_bp(bp);
	r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
	map<string, bufferptr> aset;
	i.decode_attrset(aset);
	r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
	string name = i.decode_string();
	r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
	r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  const ghobject_t& noid = i.get_oid(op->dest_oid);
	  no = c->get_onode(noid, true);
	}
	r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  const ghobject_t& noid = i.get_oid(op->dest_oid);
	  no = c->get_onode(noid, true);
	}
	uint64_t srcoff = op->off;
	uint64_t len = op->len;
	uint64_t dstoff = op->dest_off;
	r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;

    case Transaction::OP_COLL_ADD:
      assert(0 == "not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      assert(0 == "not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
	assert(op->cid == op->dest_cid);
	const ghobject_t& noid = i.get_oid(op->dest_oid);
	OnodeRef& no = ovec[op->dest_oid];
	if (!no) {
	  no = c->get_onode(noid, false);
	}
	r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
	r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
	bufferlist aset_bl;
	i.decode_attrset_bl(&aset_bl);
	r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
	bufferlist keys_bl;
	i.decode_keyset_bl(&keys_bl);
	r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
	string first, last;
	first = i.decode_string();
	last = i.decode_string();
	r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
	bufferlist bl;
	i.decode_bl(bl);
	r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
	r = _set_alloc_hint(txc, c, o,
			    op->expected_object_size,
			    op->expected_write_size,
			    op->alloc_hint_flags);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }

  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
			    op->op == Transaction::OP_CLONE ||
			    op->op == Transaction::OP_CLONERANGE2 ||
			    op->op == Transaction::OP_COLL_ADD ||
			    op->op == Transaction::OP_SETATTR ||
			    op->op == Transaction::OP_SETATTRS ||
			    op->op == Transaction::OP_RMATTR ||
			    op->op == Transaction::OP_OMAP_SETKEYS ||
			    op->op == Transaction::OP_OMAP_RMKEYS ||
			    op->op == Transaction::OP_OMAP_RMKEYRANGE ||
			    op->op == Transaction::OP_OMAP_SETHEADER))
	// -ENOENT is usually okay
	ok = true;
      if (r == -ENODATA)
	ok = true;

      if (!ok) {
	const char *msg = "unexpected error code";

	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
			     op->op == Transaction::OP_CLONE ||
			     op->op == Transaction::OP_CLONERANGE2))
	  msg = "ENOENT on clone suggests osd bug";

	if (r == -ENOSPC)
	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
	  // by partially applying transactions.
	  msg = "ENOSPC from bluestore, misconfigured cluster";

	if (r == -ENOTEMPTY) {
	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
	}

	derr << __func__ << " error " << cpp_strerror(r)
	     << " not handled on operation " << op->op
	     << " (op " << pos << ", counting from 0)"
	     << dendl;
	derr << msg << dendl;
	_dump_transaction(t, 0);
	assert(0 == "unexpected error");
      }
    }
  }
}
// -----------------
// write operations

int BlueStore::_touch(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_dump_onode(const OnodeRef& o, int log_level)
{
  if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
    return;
  dout(log_level) << __func__ << " " << o << " " << o->oid
		  << " nid " << o->onode.nid
		  << " size 0x" << std::hex << o->onode.size
		  << " (" << std::dec << o->onode.size << ")"
		  << " expected_object_size " << o->onode.expected_object_size
		  << " expected_write_size " << o->onode.expected_write_size
		  << " in " << o->onode.extent_map_shards.size() << " shards"
		  << ", " << o->extent_map.spanning_blob_map.size()
		  << " spanning blobs"
		  << dendl;
  for (auto p = o->onode.attrs.begin();
       p != o->onode.attrs.end();
       ++p) {
    dout(log_level) << __func__ << "  attr " << p->first
		    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map(o->extent_map, log_level);
}
void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
{
  uint64_t pos = 0;
  for (auto& s : em.shards) {
    dout(log_level) << __func__ << "  shard " << *s.shard_info
		    << (s.loaded ? " (loaded)" : "")
		    << (s.dirty ? " (dirty)" : "")
		    << dendl;
  }
  for (auto& e : em.extent_map) {
    dout(log_level) << __func__ << "  " << e << dendl;
    assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
	v.push_back(blob.get_csum_item(i));
      dout(log_level) << __func__ << "  csum: " << std::hex << v << std::dec
		      << dendl;
    }
    std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(log_level) << __func__ << "  0x" << std::hex << i.first
		      << "~" << i.second->length << std::dec
		      << " " << *i.second << dendl;
    }
  }
}
void BlueStore::_dump_transaction(Transaction *t, int log_level)
{
  dout(log_level) << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
	   << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = MIN(chunk_size - front_pad, length);
    bufferptr z = buffer::create_page_aligned(chunk_size);
    z.zero(0, front_pad, false);
    pad_count += front_pad;
    bl->copy(0, front_copy, z.c_str() + front_pad);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      z.zero(front_pad + length, back_pad, false);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += pad_count;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    assert(back_copy <= length);
    bufferptr tail(chunk_size);
    bl->copy(length - back_copy, back_copy, tail.c_str());
    tail.zero(back_copy, back_pad, false);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
	   << back_pad << " on front/back, now 0x" << *offset << "~"
	   << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  logger->inc(l_bluestore_write_pad_bytes, pad_count);
  assert(bl->length() == length);
}
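
// Padding arithmetic, worked example (illustrative): with chunk_size =
// 0x1000, a write at *offset = 0x1234 of length 0x100 has front_pad =
// 0x1234 % 0x1000 = 0x234, so the data is prepended with 0x234 zeros and
// *offset drops to 0x1000; the remainder of the chunk is then filled with
// back_pad = 0x1000 - (0x100 + 0x234) = 0xccc zeros, so the final
// bufferlist covers exactly one whole chunk, 0x1000~0x1000.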
void BlueStore::_do_write_small(
  TransContext *txc,
  CollectionRef &c,
  OnodeRef o,
  uint64_t offset, uint64_t length,
  bufferlist::iterator& blp,
  WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << std::dec << dendl;
  assert(length < min_alloc_size);
  uint64_t end_offs = offset + length;

  logger->inc(l_bluestore_write_small);
  logger->inc(l_bluestore_write_small_bytes, length);

  bufferlist bl;
  blp.copy(length, bl);

  // Look for an existing mutable blob we can use.
  auto begin = o->extent_map.extent_map.begin();
  auto end = o->extent_map.extent_map.end();
  auto ep = o->extent_map.seek_lextent(offset);
  if (ep != begin) {
    --ep;
    if (ep->blob_end() <= offset) {
      ++ep;
    }
  }
  auto prev_ep = ep;
  if (prev_ep != begin) {
    --prev_ep;
  } else {
    prev_ep = end; // to avoid this extent check as it's a duplicate
  }

  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
  uint32_t alloc_len = min_alloc_size;
  auto offset0 = P2ALIGN(offset, alloc_len);

  bool any_change;

  // search suitable extent in both forward and reverse direction in
  // [offset - target_max_blob_size, offset + target_max_blob_size] range
  // then check if blob can be reused via can_reuse_blob func or apply
  // direct/deferred write (the latter for extents including or higher
  // than 'offset' only).
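  //
  // Illustrative walk: for a 4 KiB overwrite at offset 0x11000 with
  // min_alloc_size = 0x10000, offset0 = 0x10000 and the window is
  // [offset - max_bsize, offset + max_bsize]. 'ep' scans forward from the
  // lextent containing 0x11000 while 'prev_ep' scans backward, so a mutable
  // blob just before or just after the write can be caught and reused
  // before we fall through and allocate a fresh min_alloc_size blob.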
  do {
    any_change = false;

    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      auto bstart = ep->blob_start();
      dout(20) << __func__ << " considering " << *b
	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
	dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
	dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
		 ep->blob_offset % min_alloc_size) {
	dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
	uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
	// can we pad our head/tail out with zeros?
	uint64_t head_pad, tail_pad;
	head_pad = P2PHASE(offset, chunk_size);
	tail_pad = P2NPHASE(end_offs, chunk_size);
	if (head_pad || tail_pad) {
	  o->extent_map.fault_range(db, offset - head_pad,
				    end_offs - offset + head_pad + tail_pad);
	}
	if (head_pad &&
	    o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
	  head_pad = 0;
	}
	if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
	  tail_pad = 0;
	}

	uint64_t b_off = offset - head_pad - bstart;
	uint64_t b_len = length + head_pad + tail_pad;

	// direct write into unused blocks of an existing mutable blob?
	if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
	    b->get_blob().get_ondisk_length() >= b_off + b_len &&
	    b->get_blob().is_unused(b_off, b_len) &&
	    b->get_blob().is_allocated(b_off, b_len)) {
	  _apply_padding(head_pad, tail_pad, bl);

	  dout(20) << __func__ << "  write to unused 0x" << std::hex
		   << b_off << "~" << b_len
		   << " pad 0x" << head_pad << " + 0x" << tail_pad
		   << std::dec << " of mutable " << *b << dendl;
	  _buffer_cache_write(txc, b, b_off, bl,
			      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

	  if (!g_conf->bluestore_debug_omit_block_device_write) {
	    if (b_len <= prefer_deferred_size) {
	      dout(20) << __func__ << " deferring small 0x" << std::hex
		       << b_len << std::dec << " unused write via deferred"
		       << dendl;
	      bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
	      op->op = bluestore_deferred_op_t::OP_WRITE;
	      b->get_blob().map(
		b_off, b_len,
		[&](uint64_t offset, uint64_t length) {
		  op->extents.emplace_back(bluestore_pextent_t(offset, length));
		  return 0;
		});
	      op->data = bl;
	    } else {
	      b->get_blob().map_bl(
		b_off, bl,
		[&](uint64_t offset, bufferlist& t) {
		  bdev->aio_write(offset, t,
				  &txc->ioc, wctx->buffered);
		});
	    }
	  }
	  b->dirty_blob().calc_csum(b_off, bl);
	  dout(20) << __func__ << "  lex old " << *ep << dendl;
	  Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad,
						 length, b,
						 &wctx->old_extents);
	  b->dirty_blob().mark_used(le->blob_offset, le->length);
	  txc->statfs_delta.stored() += le->length;
	  dout(20) << __func__ << "  lex " << *le << dendl;
	  logger->inc(l_bluestore_write_small_unused);
	  return;
	}
	// read some data to fill out the chunk?
	uint64_t head_read = P2PHASE(b_off, chunk_size);
	uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
	if ((head_read || tail_read) &&
	    (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
	    head_read + tail_read < min_alloc_size) {
	  b_off -= head_read;
	  b_len += head_read + tail_read;
	} else {
	  head_read = tail_read = 0;
	}

	// chunk-aligned deferred overwrite?
	if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
	    b_off % chunk_size == 0 &&
	    b_len % chunk_size == 0 &&
	    b->get_blob().is_allocated(b_off, b_len)) {

	  _apply_padding(head_pad, tail_pad, bl);

	  dout(20) << __func__ << "  reading head 0x" << std::hex << head_read
		   << " and tail 0x" << tail_read << std::dec << dendl;
	  if (head_read) {
	    bufferlist head_bl;
	    int r = _do_read(c.get(), o, offset - head_pad - head_read,
			     head_read, head_bl, 0);
	    assert(r >= 0 && r <= (int)head_read);
	    size_t zlen = head_read - r;
	    if (zlen) {
	      head_bl.append_zero(zlen);
	      logger->inc(l_bluestore_write_pad_bytes, zlen);
	    }
	    bl.claim_prepend(head_bl);
	    logger->inc(l_bluestore_write_penalty_read_ops);
	  }
	  if (tail_read) {
	    bufferlist tail_bl;
	    int r = _do_read(c.get(), o, offset + length + tail_pad,
			     tail_read, tail_bl, 0);
	    assert(r >= 0 && r <= (int)tail_read);
	    size_t zlen = tail_read - r;
	    if (zlen) {
	      tail_bl.append_zero(zlen);
	      logger->inc(l_bluestore_write_pad_bytes, zlen);
	    }
	    bl.claim_append(tail_bl);
	    logger->inc(l_bluestore_write_penalty_read_ops);
	  }
	  logger->inc(l_bluestore_write_small_pre_read);

	  bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
	  op->op = bluestore_deferred_op_t::OP_WRITE;
	  _buffer_cache_write(txc, b, b_off, bl,
			      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

	  int r = b->get_blob().map(
	    b_off, b_len,
	    [&](uint64_t offset, uint64_t length) {
	      op->extents.emplace_back(bluestore_pextent_t(offset, length));
	      return 0;
	    });
	  assert(r == 0);
	  if (b->get_blob().csum_type) {
	    b->dirty_blob().calc_csum(b_off, bl);
	  }
	  op->data.claim(bl);
	  dout(20) << __func__ << "  deferred write 0x" << std::hex << b_off << "~"
		   << b_len << std::dec << " of mutable " << *b
		   << " at " << op->extents << dendl;
	  Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart,
						 length, b, &wctx->old_extents);
	  b->dirty_blob().mark_used(le->blob_offset, le->length);
	  txc->statfs_delta.stored() += le->length;
	  dout(20) << __func__ << "  lex " << *le << dendl;
	  logger->inc(l_bluestore_write_small_deferred);
	  return;
	}
	// try to reuse blob if we can
	if (b->can_reuse_blob(min_alloc_size,
			      max_bsize,
			      offset0 - bstart,
			      &alloc_len)) {
	  assert(alloc_len == min_alloc_size); // expecting data always
					       // fit into reused blob
	  // Need to check for pending writes desiring to
	  // reuse the same pextent. The rationale is that during GC two chunks
	  // from garbage blobs (compressed?) can share logical space within the
	  // same AU. That in turn might be caused by unaligned len in
	  // clone_range2. Hence the second write will fail in an attempt to
	  // reuse blob at do_alloc_write().
	  if (!wctx->has_conflict(b,
				  offset0,
				  offset0 + alloc_len,
				  min_alloc_size)) {

	    // we can't reuse pad_head/pad_tail since they might be truncated
	    // due to existent extents
	    uint64_t b_off = offset - bstart;
	    uint64_t b_off0 = b_off;
	    _pad_zeros(&bl, &b_off0, chunk_size);

	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off0 << "~" << bl.length() << ")"
		     << " (0x" << b_off << "~" << length << ")"
		     << std::dec << dendl;

	    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
	    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
			false, false);
	    logger->inc(l_bluestore_write_small_unused);
	    return;
	  }
	}
      }
      ++ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)

    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->can_reuse_blob(min_alloc_size,
			    max_bsize,
			    offset0 - bstart,
			    &alloc_len)) {
	assert(alloc_len == min_alloc_size); // expecting data always
					     // fit into reused blob
	// Need to check for pending writes desiring to
	// reuse the same pextent. The rationale is that during GC two chunks
	// from garbage blobs (compressed?) can share logical space within the
	// same AU. That in turn might be caused by unaligned len in
	// clone_range2. Hence the second write will fail in an attempt to
	// reuse blob at do_alloc_write().
	if (!wctx->has_conflict(b,
				offset0,
				offset0 + alloc_len,
				min_alloc_size)) {

	  uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
	  uint64_t b_off = offset - bstart;
	  uint64_t b_off0 = b_off;
	  _pad_zeros(&bl, &b_off0, chunk_size);

	  dout(20) << __func__ << " reuse blob " << *b << std::hex
		   << " (0x" << b_off0 << "~" << bl.length() << ")"
		   << " (0x" << b_off << "~" << length << ")"
		   << std::dec << dendl;

	  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
	  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
		      false, false);
	  logger->inc(l_bluestore_write_small_unused);
	  return;
	}
      }
      if (prev_ep != begin) {
	--prev_ep;
	any_change = true;
      } else {
	prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);

  // new blob.
  BlobRef b = c->new_blob();
  uint64_t b_off = P2PHASE(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
  logger->inc(l_bluestore_write_small_new);

  return;
}
void BlueStore::_do_write_big(
  TransContext *txc,
  CollectionRef &c,
  OnodeRef o,
  uint64_t offset, uint64_t length,
  bufferlist::iterator& blp,
  WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << " target_blob_size 0x" << wctx->target_blob_size << std::dec
	   << " compress " << (int)wctx->compress
	   << dendl;
  logger->inc(l_bluestore_write_big);
  logger->inc(l_bluestore_write_big_bytes, length);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  while (length > 0) {
    bool new_blob = false;
    uint32_t l = MIN(max_bsize, length);
    BlobRef b;
    uint32_t b_off = 0;

    // attempting to reuse existing blob
    if (!wctx->compress) {
      // look for an existing mutable blob we can reuse
      auto begin = o->extent_map.extent_map.begin();
      auto end = o->extent_map.extent_map.end();
      auto ep = o->extent_map.seek_lextent(offset);
      auto prev_ep = ep;
      if (prev_ep != begin) {
	--prev_ep;
      } else {
	prev_ep = end; // to avoid this extent check as it's a duplicate
      }
      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
      // search suitable extent in both forward and reverse direction in
      // [offset - target_max_blob_size, offset + target_max_blob_size] range
      // then check if blob can be reused via can_reuse_blob func.
      bool any_change;
      do {
	any_change = false;
	if (ep != end && ep->logical_offset < offset + max_bsize) {
	  if (offset >= ep->blob_start() &&
	      ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
				       offset - ep->blob_start(),
				       &l)) {
	    b = ep->blob;
	    b_off = offset - ep->blob_start();
	    prev_ep = end; // to avoid check below
	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
	  } else {
	    ++ep;
	    any_change = true;
	  }
	}

	if (prev_ep != end && prev_ep->logical_offset >= min_off) {
	  if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
					    offset - prev_ep->blob_start(),
					    &l)) {
	    b = prev_ep->blob;
	    b_off = offset - prev_ep->blob_start();
	    dout(20) << __func__ << " reuse blob " << *b << std::hex
		     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
	  } else if (prev_ep != begin) {
	    --prev_ep;
	    any_change = true;
	  } else {
	    prev_ep = end; // to avoid useless first extent re-check
	  }
	}
      } while (b == nullptr && any_change);
    }
    if (b == nullptr) {
      b = c->new_blob();
      b_off = 0;
      new_blob = true;
    }

    bufferlist t;
    blp.copy(l, t);
    wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
    offset += l;
    length -= l;
    logger->inc(l_bluestore_write_big_blobs);
  }
}
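
// Chunking sketch (illustrative): a 1 MiB write with target_blob_size =
// 512 KiB is cut into two 512 KiB pieces; each piece first tries
// can_reuse_blob() against the neighboring lextents (compression off only)
// and otherwise gets a fresh blob. Compressed writes always take the
// new-blob path, so the compressor sees whole target-blob-sized chunks.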
int BlueStore::_do_alloc_write(
  TransContext *txc,
  CollectionRef coll,
  OnodeRef o,
  WriteContext *wctx)
{
  dout(20) << __func__ << " txc " << txc
	   << " " << wctx->writes.size() << " blobs"
	   << dendl;
  if (wctx->writes.empty()) {
    return 0;
  }

  CompressorRef c;
  double crr = 0;
  if (wctx->compress) {
    c = select_option(
      "compression_algorithm",
      compressor,
      [&]() {
	string val;
	if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
	  CompressorRef cp = compressor;
	  if (!cp || cp->get_type_name() != val) {
	    cp = Compressor::create(cct, val);
	  }
	  return boost::optional<CompressorRef>(cp);
	}
	return boost::optional<CompressorRef>();
      }
    );

    crr = select_option(
      "compression_required_ratio",
      cct->_conf->bluestore_compression_required_ratio,
      [&]() {
	double val;
	if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
	  return boost::optional<double>(val);
	}
	return boost::optional<double>();
      }
    );
  }

  // checksum
  int csum = csum_type.load();
  csum = select_option(
    "csum_type",
    csum,
    [&]() {
      int val;
      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
	return boost::optional<int>(val);
      }
      return boost::optional<int>();
    }
  );

  // compress (as needed) and calc needed space
  uint64_t need = 0;
  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  for (auto& wi : wctx->writes) {
    if (c && wi.blob_length > min_alloc_size) {
      utime_t start = ceph_clock_now();

      // compress
      assert(wi.b_off == 0);
      assert(wi.blob_length == wi.bl.length());

      // FIXME: memory alignment here is bad
      bufferlist t;
      int r = c->compress(wi.bl, t);
      assert(r == 0);

      bluestore_compression_header_t chdr;
      chdr.type = c->get_type();
      chdr.length = t.length();
      ::encode(chdr, wi.compressed_bl);
      wi.compressed_bl.claim_append(t);

      wi.compressed_len = wi.compressed_bl.length();
      uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size);
      uint64_t want_len_raw = wi.blob_length * crr;
      uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
      if (newlen <= want_len && newlen < wi.blob_length) {
	// Cool. We compressed at least as much as we were hoping to.
	// pad out to min_alloc_size
	wi.compressed_bl.append_zero(newlen - wi.compressed_len);
	logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len);
	dout(20) << __func__ << std::hex << "  compressed 0x" << wi.blob_length
		 << " -> 0x" << wi.compressed_len << " => 0x" << newlen
		 << " with " << c->get_type()
		 << std::dec << dendl;
	txc->statfs_delta.compressed() += wi.compressed_len;
	txc->statfs_delta.compressed_original() += wi.blob_length;
	txc->statfs_delta.compressed_allocated() += newlen;
	logger->inc(l_bluestore_compress_success_count);
	wi.compressed = true;
	need += newlen;
      } else {
	dout(20) << __func__ << std::hex << "  0x" << wi.blob_length
		 << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen
		 << " with " << c->get_type()
		 << ", which is more than required 0x" << want_len_raw
		 << " -> 0x" << want_len
		 << ", leaving uncompressed"
		 << std::dec << dendl;
	logger->inc(l_bluestore_compress_rejected_count);
	need += wi.blob_length;
      }
      logger->tinc(l_bluestore_compress_lat,
		   ceph_clock_now() - start);
    } else {
      need += wi.blob_length;
    }
  }
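
  // Acceptance arithmetic, worked example (illustrative): with
  // min_alloc_size = 0x10000 (64 KiB) and crr = 0.875, a 256 KiB blob that
  // compresses to 150 KiB gives newlen = P2ROUNDUP(150 KiB) = 192 KiB and
  // want_len = P2ROUNDUP(256 KiB * 0.875) = 224 KiB; since 192 <= 224 and
  // 192 < 256, the compressed copy is kept and only 192 KiB is allocated.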
  int r = alloc->reserve(need);
  if (r < 0) {
    derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
	 << dendl;
    return r;
  }
  AllocExtentVector prealloc;
  prealloc.reserve(2 * wctx->writes.size());
  int prealloc_left = 0;
  prealloc_left = alloc->allocate(
    need, min_alloc_size, need,
    0, &prealloc);
  assert(prealloc_left == (int64_t)need);
  dout(20) << __func__ << " prealloc " << prealloc << dendl;
  auto prealloc_pos = prealloc.begin();

  for (auto& wi : wctx->writes) {
    BlobRef b = wi.b;
    bluestore_blob_t& dblob = b->dirty_blob();
    uint64_t b_off = wi.b_off;
    bufferlist *l = &wi.bl;
    uint64_t final_length = wi.blob_length;
    uint64_t csum_length = wi.blob_length;
    unsigned csum_order = block_size_order;
    if (wi.compressed) {
      final_length = wi.compressed_bl.length();
      csum_length = final_length;
      csum_order = ctz(csum_length);
      l = &wi.compressed_bl;
      dblob.set_compressed(wi.blob_length, wi.compressed_len);
    } else if (wi.new_blob) {
      // initialize newly created blob only
      assert(dblob.is_mutable());
      if (l->length() != wi.blob_length) {
	// hrm, maybe we could do better here, but let's not bother.
	dout(20) << __func__ << " forcing csum_order to block_size_order "
		 << block_size_order << dendl;
	csum_order = block_size_order;
      } else {
	csum_order = std::min(wctx->csum_order, ctz(l->length()));
      }
      // try to align blob with max_blob_size to improve
      // its reuse ratio, e.g. in case of reverse write
      uint32_t suggested_boff =
	(wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
      if ((suggested_boff % (1 << csum_order)) == 0 &&
	  suggested_boff + final_length <= max_bsize &&
	  suggested_boff > b_off) {
	dout(20) << __func__ << " forcing blob_offset to 0x"
		 << std::hex << suggested_boff << std::dec << dendl;
	assert(suggested_boff >= b_off);
	csum_length += suggested_boff - b_off;
	b_off = suggested_boff;
      }
      if (csum != Checksummer::CSUM_NONE) {
	dout(20) << __func__ << " initialize csum setting for new blob " << *b
		 << " csum_type " << Checksummer::get_csum_type_string(csum)
		 << " csum_order " << csum_order
		 << " csum_length 0x" << std::hex << csum_length << std::dec
		 << dendl;
	dblob.init_csum(csum, csum_order, csum_length);
      }
    }

    AllocExtentVector extents;
    int64_t left = final_length;
    while (left > 0) {
      assert(prealloc_left > 0);
      if (prealloc_pos->length <= left) {
	prealloc_left -= prealloc_pos->length;
	left -= prealloc_pos->length;
	txc->statfs_delta.allocated() += prealloc_pos->length;
	extents.push_back(*prealloc_pos);
	++prealloc_pos;
      } else {
	extents.emplace_back(prealloc_pos->offset, left);
	prealloc_pos->offset += left;
	prealloc_pos->length -= left;
	prealloc_left -= left;
	txc->statfs_delta.allocated() += left;
	left = 0;
	break;
      }
    }
    for (auto& p : extents) {
      txc->allocated.insert(p.offset, p.length);
    }
    dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);

    dout(20) << __func__ << " blob " << *b << dendl;
    if (dblob.has_csum()) {
      dblob.calc_csum(b_off, *l);
    }

    if (wi.mark_unused) {
      auto b_end = b_off + wi.bl.length();
      if (b_off) {
	dblob.add_unused(0, b_off);
      }
      if (b_end < wi.blob_length) {
	dblob.add_unused(b_end, wi.blob_length - b_end);
      }
    }

    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
					   b_off + (wi.b_off0 - wi.b_off),
					   wi.length0,
					   wi.b,
					   nullptr);
    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
    txc->statfs_delta.stored() += le->length;
    dout(20) << __func__ << "  lex " << *le << dendl;
    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
			wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

    // queue io
    if (!g_conf->bluestore_debug_omit_block_device_write) {
      if (l->length() <= prefer_deferred_size.load()) {
	dout(20) << __func__ << " deferring small 0x" << std::hex
		 << l->length() << std::dec << " write via deferred" << dendl;
	bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
	op->op = bluestore_deferred_op_t::OP_WRITE;
	int r = b->get_blob().map(
	  b_off, l->length(),
	  [&](uint64_t offset, uint64_t length) {
	    op->extents.emplace_back(bluestore_pextent_t(offset, length));
	    return 0;
	  });
	assert(r == 0);
	op->data = *l;
      } else {
	b->get_blob().map_bl(
	  b_off, *l,
	  [&](uint64_t offset, bufferlist& t) {
	    bdev->aio_write(offset, t, &txc->ioc, false);
	  });
      }
    }
  }
  assert(prealloc_pos == prealloc.end());
  assert(prealloc_left == 0);
  return 0;
}
void BlueStore::_wctx_finish(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  WriteContext *wctx,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  auto oep = wctx->old_extents.begin();
  while (oep != wctx->old_extents.end()) {
    auto &lo = *oep;
    oep = wctx->old_extents.erase(oep);
    dout(20) << __func__ << " lex_old " << lo.e << dendl;
    BlobRef b = lo.e.blob;
    const bluestore_blob_t& blob = b->get_blob();
    if (blob.is_compressed()) {
      if (lo.blob_empty) {
	txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
      }
      txc->statfs_delta.compressed_original() -= lo.e.length;
    }
    auto& r = lo.r;
    txc->statfs_delta.stored() -= lo.e.length;
    if (!r.empty()) {
      dout(20) << __func__ << "  blob release " << r << dendl;
      if (blob.is_shared()) {
	PExtentVector final;
	c->load_shared_blob(b->shared_blob);
	for (auto e : r) {
	  b->shared_blob->put_ref(
	    e.offset, e.length, &final,
	    b->is_referenced() ? nullptr : maybe_unshared_blobs);
	}
	dout(20) << __func__ << "  shared_blob release " << final
		 << " from " << *b->shared_blob << dendl;
	txc->write_shared_blob(b->shared_blob);
	r.clear();
	r.swap(final);
      }
    }
    // we can't invalidate our logical extents as we drop them because
    // other lextents (either in our onode or others) may still
    // reference them. but we can throw out anything that is no
    // longer allocated. Note that this will leave behind edge bits
    // that are no longer referenced but not deallocated (until they
    // age out of the cache naturally).
    b->discard_unallocated(c.get());
    for (auto e : r) {
      dout(20) << __func__ << "  release " << e << dendl;
      txc->released.insert(e.offset, e.length);
      txc->statfs_delta.allocated() -= e.length;
      if (blob.is_compressed()) {
	txc->statfs_delta.compressed_allocated() -= e.length;
      }
    }
    if (b->is_spanning() && !b->is_referenced()) {
      dout(20) << __func__ << "  spanning_blob_map removing empty " << *b
	       << dendl;
      o->extent_map.spanning_blob_map.erase(b->id);
    }
  }
}
void BlueStore::_do_write_data(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  WriteContext *wctx)
{
  uint64_t end = offset + length;
  bufferlist::iterator p = bl.begin();

  if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
      (length != min_alloc_size)) {
    // we fall within the same block
    _do_write_small(txc, c, o, offset, length, p, wctx);
  } else {
    uint64_t head_offset, head_length;
    uint64_t middle_offset, middle_length;
    uint64_t tail_offset, tail_length;

    head_offset = offset;
    head_length = P2NPHASE(offset, min_alloc_size);

    tail_offset = P2ALIGN(end, min_alloc_size);
    tail_length = P2PHASE(end, min_alloc_size);

    middle_offset = head_offset + head_length;
    middle_length = length - head_length - tail_length;

    if (head_length) {
      _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
    }

    if (middle_length) {
      _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
    }

    if (tail_length) {
      _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
    }
  }
}
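
// Split arithmetic, worked example (illustrative): with min_alloc_size =
// 0x10000, a write 0x8000~0x20000 ends at 0x28000 and is cut into a small
// head 0x8000~0x8000 (up to the first AU boundary), a big middle
// 0x10000~0x10000 (whole AUs), and a small tail 0x20000~0x8000. A write
// that starts and ends inside one AU skips the split entirely.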
void BlueStore::_choose_write_options(
  CollectionRef& c,
  OnodeRef o,
  uint32_t fadvise_flags,
  WriteContext *wctx)
{
  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered write" << dendl;
    wctx->buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_write &&
	     (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
			       CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered write" << dendl;
    wctx->buffered = true;
  }

  // apply basic csum block size
  wctx->csum_order = block_size_order;

  // compression parameters
  unsigned alloc_hints = o->onode.alloc_hint_flags;
  auto cm = select_option(
    "compression_mode",
    comp_mode.load(),
    [&]() {
      string val;
      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
	return boost::optional<Compressor::CompressionMode>(
	  Compressor::get_comp_mode_type(val));
      }
      return boost::optional<Compressor::CompressionMode>();
    }
  );

  wctx->compress = (cm != Compressor::COMP_NONE) &&
    ((cm == Compressor::COMP_FORCE) ||
     (cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
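
  // Mode semantics sketch: COMP_FORCE compresses everything; COMP_AGGRESSIVE
  // compresses unless the client hinted INCOMPRESSIBLE; COMP_PASSIVE
  // compresses only when the client hinted COMPRESSIBLE; COMP_NONE never
  // does. E.g. with compression_mode=passive and no hint set, wctx->compress
  // stays false and writes go through the plain allocation path.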
  if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
      (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
		      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {

    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;

    if (o->onode.expected_write_size) {
      wctx->csum_order = std::max(min_alloc_size_order,
				  (uint8_t)ctz(o->onode.expected_write_size));
    } else {
      wctx->csum_order = min_alloc_size_order;
    }

    if (wctx->compress) {
      wctx->target_blob_size = select_option(
	"compression_max_blob_size",
	comp_max_blob_size.load(),
	[&]() {
	  int val;
	  if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
	    return boost::optional<uint64_t>((uint64_t)val);
	  }
	  return boost::optional<uint64_t>();
	}
      );
    }
  } else {
    if (wctx->compress) {
      wctx->target_blob_size = select_option(
	"compression_min_blob_size",
	comp_min_blob_size.load(),
	[&]() {
	  int val;
	  if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
	    return boost::optional<uint64_t>((uint64_t)val);
	  }
	  return boost::optional<uint64_t>();
	}
      );
    }
  }

  uint64_t max_bsize = max_blob_size.load();
  if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
    wctx->target_blob_size = max_bsize;
  }

  // set the min blob size floor at 2x the min_alloc_size, or else we
  // won't be able to allocate a smaller extent for the compressed
  // data.
  if (wctx->compress &&
      wctx->target_blob_size < min_alloc_size * 2) {
    wctx->target_blob_size = min_alloc_size * 2;
  }

  dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
	   << " target_blob_size 0x" << std::hex << wctx->target_blob_size
	   << std::dec << dendl;
}
int BlueStore::_do_gc(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  const GarbageCollector& gc,
  const WriteContext& wctx,
  uint64_t *dirty_start,
  uint64_t *dirty_end)
{
  auto& extents_to_collect = gc.get_extents_to_collect();

  WriteContext wctx_gc;
  wctx_gc.fork(wctx); // make a clone for garbage collection

  for (auto it = extents_to_collect.begin();
       it != extents_to_collect.end();
       ++it) {
    bufferlist bl;
    int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
    assert(r == (int)it->length);

    o->extent_map.fault_range(db, it->offset, it->length);
    _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
    logger->inc(l_bluestore_gc_merged, it->length);

    if (*dirty_start > it->offset) {
      *dirty_start = it->offset;
    }

    if (*dirty_end < it->offset + it->length) {
      *dirty_end = it->offset + it->length;
    }
  }

  dout(30) << __func__ << " alloc write" << dendl;
  int r = _do_alloc_write(txc, c, o, &wctx_gc);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
	 << dendl;
    return r;
  }

  _wctx_finish(txc, c, o, &wctx_gc);
  return 0;
}
int BlueStore::_do_write(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o,
  uint64_t offset,
  uint64_t length,
  bufferlist& bl,
  uint32_t fadvise_flags)
{
  int r = 0;

  dout(20) << __func__
	   << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length
	   << " - have 0x" << o->onode.size
	   << " (" << std::dec << o->onode.size << ")"
	   << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
	   << dendl;

  if (length == 0) {
    return 0;
  }

  uint64_t end = offset + length;

  GarbageCollector gc(c->store->cct);
  int64_t benefit;
  auto dirty_start = offset;
  auto dirty_end = end;

  WriteContext wctx;
  _choose_write_options(c, o, fadvise_flags, &wctx);
  o->extent_map.fault_range(db, offset, length);
  _do_write_data(txc, c, o, offset, length, bl, &wctx);
  r = _do_alloc_write(txc, c, o, &wctx);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
	 << dendl;
    goto out;
  }

  // NB: _wctx_finish() will empty old_extents
  // so we must do gc estimation before that
  benefit = gc.estimate(offset,
			length,
			o->extent_map,
			wctx.old_extents,
			min_alloc_size);

  _wctx_finish(txc, c, o, &wctx);
  if (end > o->onode.size) {
    dout(20) << __func__ << " extending size to 0x" << std::hex << end
	     << std::dec << dendl;
    o->onode.size = end;
  }

  if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
    if (!gc.get_extents_to_collect().empty()) {
      dout(20) << __func__ << " perform garbage collection, "
	       << "expected benefit = " << benefit << " AUs" << dendl;
      r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
      if (r < 0) {
	derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
	     << dendl;
	goto out;
      }
    }
  }

  o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
  o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);

  r = 0;

 out:
  return r;
}
int BlueStore::_write(TransContext *txc,
		      CollectionRef& c,
		      OnodeRef& o,
		      uint64_t offset, size_t length,
		      bufferlist& bl,
		      uint32_t fadvise_flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_zero(TransContext *txc,
		     CollectionRef& c,
		     OnodeRef& o,
		     uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;
  if (offset + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _assign_nid(txc, o);
    r = _do_zero(txc, c, o, offset, length);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_do_zero(TransContext *txc,
			CollectionRef& c,
			OnodeRef& o,
			uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  int r = 0;

  _dump_onode(o, 30);

  WriteContext wctx;
  o->extent_map.fault_range(db, offset, length);
  o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
  o->extent_map.dirty_range(offset, length);
  _wctx_finish(txc, c, o, &wctx);

  if (length > 0 && offset + length > o->onode.size) {
    o->onode.size = offset + length;
    dout(20) << __func__ << " extending size to " << offset + length
	     << dendl;
  }
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  return r;
}
void BlueStore::_do_truncate(
  TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
  set<SharedBlob*> *maybe_unshared_blobs)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << std::dec << dendl;

  _dump_onode(o, 30);

  if (offset == o->onode.size)
    return;

  if (offset < o->onode.size) {
    WriteContext wctx;
    uint64_t length = o->onode.size - offset;
    o->extent_map.fault_range(db, offset, length);
    o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
    o->extent_map.dirty_range(offset, length);
    _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);

    // if we have shards past EOF, ask for a reshard
    if (!o->onode.extent_map_shards.empty() &&
	o->onode.extent_map_shards.back().offset >= offset) {
      dout(10) << __func__ << "  request reshard past EOF" << dendl;
      if (offset) {
	o->extent_map.request_reshard(offset - 1, offset + length);
      } else {
	o->extent_map.request_reshard(0, length);
      }
    }
  }

  o->onode.size = offset;

  txc->write_onode(o);
}
int BlueStore::_truncate(TransContext *txc,
			 CollectionRef& c,
			 OnodeRef& o,
			 uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << std::dec
	   << dendl;
  int r = 0;
  if (offset >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
  } else {
    _do_truncate(txc, c, o, offset);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " 0x" << std::hex << offset << std::dec
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_do_remove(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef o)
{
  set<SharedBlob*> maybe_unshared_blobs;
  bool is_gen = !o->oid.is_no_gen();
  _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o->onode.nid);
  }
  o->exists = false;
  string key;
  for (auto &s : o->extent_map.shards) {
    dout(20) << __func__ << "  removing shard 0x" << std::hex
	     << s.shard_info->offset << std::dec << dendl;
    generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
      [&](const string& final_key) {
	txc->t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }
  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
  o->extent_map.clear();
  o->onode = bluestore_onode_t();
  _debug_obj_on_delete(o->oid);

  if (!is_gen || maybe_unshared_blobs.empty()) {
    return 0;
  }

  // see if we can unshare blobs still referenced by the head
  dout(10) << __func__ << " gen and maybe_unshared_blobs "
	   << maybe_unshared_blobs << dendl;
  ghobject_t nogen = o->oid;
  nogen.generation = ghobject_t::NO_GEN;
  OnodeRef h = c->onode_map.lookup(nogen);

  if (!h || !h->exists) {
    return 0;
  }

  dout(20) << __func__ << " checking for unshareable blobs on " << h
	   << " " << h->oid << dendl;
  map<SharedBlob*,bluestore_extent_ref_map_t> expect;
  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
	maybe_unshared_blobs.count(sb)) {
      if (b.is_compressed()) {
	expect[sb].get(0, b.get_ondisk_length());
      } else {
	b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
	    expect[sb].get(off, len);
	    return 0;
	  });
      }
    }
  }

  vector<SharedBlob*> unshared_blobs;
  unshared_blobs.reserve(maybe_unshared_blobs.size());
  for (auto& p : expect) {
    dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
    if (p.first->persistent->ref_map == p.second) {
      SharedBlob *sb = p.first;
      dout(20) << __func__ << "  unsharing " << *sb << dendl;
      unshared_blobs.push_back(sb);
      txc->unshare_blob(sb);
      uint64_t sbid = c->make_blob_unshared(sb);
      string key;
      get_shared_blob_key(sbid, &key);
      txc->t->rmkey(PREFIX_SHARED_BLOB, key);
    }
  }

  if (unshared_blobs.empty()) {
    return 0;
  }

  for (auto& e : h->extent_map.extent_map) {
    const bluestore_blob_t& b = e.blob->get_blob();
    SharedBlob *sb = e.blob->shared_blob.get();
    if (b.is_shared() &&
	std::find(unshared_blobs.begin(), unshared_blobs.end(),
		  sb) != unshared_blobs.end()) {
      dout(20) << __func__ << "  unsharing " << e << dendl;
      bluestore_blob_t& blob = e.blob->dirty_blob();
      blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
      h->extent_map.dirty_range(e.logical_offset, 1);
    }
  }
  txc->write_onode(h);

  return 0;
}
int BlueStore::_remove(TransContext *txc,
		       CollectionRef& c,
		       OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = _do_remove(txc, c, o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_setattr(TransContext *txc,
			CollectionRef& c,
			OnodeRef& o,
			const string& name,
			bufferptr& val)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " " << name << " (" << val.length() << " bytes)"
	   << dendl;
  int r = 0;
  if (val.is_partial()) {
    auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
						       val.length());
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  } else {
    auto& b = o->onode.attrs[name.c_str()] = val;
    b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " " << name << " (" << val.length() << " bytes)"
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_setattrs(TransContext *txc,
			 CollectionRef& c,
			 OnodeRef& o,
			 const map<string,bufferptr>& aset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " " << aset.size() << " keys"
	   << dendl;
  int r = 0;
  for (map<string,bufferptr>::const_iterator p = aset.begin();
       p != aset.end(); ++p) {
    if (p->second.is_partial()) {
      auto& b = o->onode.attrs[p->first.c_str()] =
	bufferptr(p->second.c_str(), p->second.length());
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    } else {
      auto& b = o->onode.attrs[p->first.c_str()] = p->second;
      b.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " " << aset.size() << " keys"
	   << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattr(TransContext *txc,
		       CollectionRef& c,
		       OnodeRef& o,
		       const string& name)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " " << name << dendl;
  int r = 0;
  auto it = o->onode.attrs.find(name.c_str());
  if (it == o->onode.attrs.end())
    goto out;

  o->onode.attrs.erase(it);
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " " << name << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattrs(TransContext *txc,
			CollectionRef& c,
			OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;

  if (o->onode.attrs.empty())
    goto out;

  o->onode.attrs.clear();
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
{
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
  string prefix, tail;
  get_omap_header(id, &prefix);
  get_omap_tail(id, &tail);
  it->lower_bound(prefix);
  while (it->valid()) {
    if (it->key() >= tail) {
      dout(30) << __func__ << "  stop at " << pretty_binary_string(tail)
	       << dendl;
      break;
    }
    txc->t->rmkey(PREFIX_OMAP, it->key());
    dout(30) << __func__ << "  rm " << pretty_binary_string(it->key()) << dendl;
    it->next();
  }
}
int BlueStore::_omap_clear(TransContext *txc,
			   CollectionRef& c,
			   OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o->onode.nid);
    o->onode.clear_omap_flag();
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_setkeys(TransContext *txc,
			     CollectionRef& c,
			     OnodeRef& o,
			     bufferlist &bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  bufferlist::iterator p = bl.begin();
  __u32 num;
  if (!o->onode.has_omap()) {
    o->onode.set_omap_flag();
    txc->write_onode(o);
  } else {
    txc->note_modified_object(o);
  }
  string final_key;
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  ::decode(num, p);
  while (num--) {
    string key;
    bufferlist value;
    ::decode(key, p);
    ::decode(value, p);
    final_key.resize(9); // keep prefix
    final_key += key;
    dout(30) << __func__ << "  " << pretty_binary_string(final_key)
	     << " <- " << key << dendl;
    txc->t->set(PREFIX_OMAP, final_key, value);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
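
// Key layout sketch: each omap entry is stored under PREFIX_OMAP with the
// key built as <8-byte encoded onode nid><'.'><user key>, which is why
// final_key.resize(9) above keeps exactly the nid-plus-separator prefix
// while the user-key portion is swapped out per entry. All of an object's
// omap rows are therefore contiguous in the kv store and bounded by the
// get_omap_header()/get_omap_tail() keys.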
int BlueStore::_omap_setheader(TransContext *txc,
			       CollectionRef& c,
			       OnodeRef &o,
			       bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  string key;
  if (!o->onode.has_omap()) {
    o->onode.set_omap_flag();
    txc->write_onode(o);
  } else {
    txc->note_modified_object(o);
  }
  get_omap_header(o->onode.nid, &key);
  txc->t->set(PREFIX_OMAP, key, bl);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkeys(TransContext *txc,
			    CollectionRef& c,
			    OnodeRef& o,
			    bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  bufferlist::iterator p = bl.begin();
  __u32 num;
  string final_key;

  if (!o->onode.has_omap()) {
    goto out;
  }
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  ::decode(num, p);
  while (num--) {
    string key;
    ::decode(key, p);
    final_key.resize(9); // keep prefix
    final_key += key;
    dout(30) << __func__ << "  rm " << pretty_binary_string(final_key)
	     << " <- " << key << dendl;
    txc->t->rmkey(PREFIX_OMAP, final_key);
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkey_range(TransContext *txc,
				 CollectionRef& c,
				 OnodeRef& o,
				 const string& first, const string& last)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  KeyValueDB::Iterator it;
  string key_first, key_last;
  int r = 0;
  if (!o->onode.has_omap()) {
    goto out;
  }
  o->flush();
  it = db->get_iterator(PREFIX_OMAP);
  get_omap_key(o->onode.nid, first, &key_first);
  get_omap_key(o->onode.nid, last, &key_last);
  it->lower_bound(key_first);
  while (it->valid()) {
    if (it->key() >= key_last) {
      dout(30) << __func__ << "  stop at " << pretty_binary_string(key_last)
	       << dendl;
      break;
    }
    txc->t->rmkey(PREFIX_OMAP, it->key());
    dout(30) << __func__ << "  rm " << pretty_binary_string(it->key()) << dendl;
    it->next();
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_set_alloc_hint(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& o,
  uint64_t expected_object_size,
  uint64_t expected_write_size,
  uint32_t flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
	   << " object_size " << expected_object_size
	   << " write_size " << expected_write_size
	   << " flags " << ceph_osd_alloc_hint_flag_string(flags)
	   << dendl;
  int r = 0;
  o->onode.expected_object_size = expected_object_size;
  o->onode.expected_write_size = expected_write_size;
  o->onode.alloc_hint_flags = flags;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
	   << " object_size " << expected_object_size
	   << " write_size " << expected_write_size
	   << " flags " << ceph_osd_alloc_hint_flag_string(flags)
	   << " = " << r << dendl;
  return r;
}

int BlueStore::_clone(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& oldo,
                      OnodeRef& newo)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << dendl;
  int r = 0;
  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
    derr << __func__ << " mismatched hash on " << oldo->oid
         << " and " << newo->oid << dendl;
    return -EINVAL;
  }

  _assign_nid(txc, newo);

  // clone data into newo
  _do_truncate(txc, c, newo, 0);
  if (cct->_conf->bluestore_clone_cow) {
    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
  }

  // clone attrs
  newo->onode.attrs = oldo->onode.attrs;

  // clone omap
  if (newo->onode.has_omap()) {
    dout(20) << __func__ << " clearing old omap data" << dendl;
    newo->flush();
    _do_omap_clear(txc, newo->onode.nid);
  }
  if (oldo->onode.has_omap()) {
    dout(20) << __func__ << " copying omap data" << dendl;
    if (!newo->onode.has_omap()) {
      newo->onode.set_omap_flag();
    }
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_header(oldo->onode.nid, &head);
    get_omap_tail(oldo->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << "  reached tail" << dendl;
        break;
      } else {
        dout(30) << __func__ << "  got header/data "
                 << pretty_binary_string(it->key()) << dendl;
        string key;
        rewrite_omap_key(newo->onode.nid, it->key(), &key);
        txc->t->set(PREFIX_OMAP, key, it->value());
      }
      it->next();
    }
  } else {
    newo->onode.clear_omap_flag();
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " = " << r << dendl;
  return r;
}
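
// COW clone of a byte range.  Each source blob overlapping the range is
// first made shared (so its backing extents become refcounted via the
// shared blob record under PREFIX_SHARED_BLOB), then a copy of the Blob is
// made and the destination gets trimmed Extent entries pointing at it; no
// object data is actually read or rewritten.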

int BlueStore::_do_clone_range(
  TransContext *txc,
  CollectionRef& c,
  OnodeRef& oldo,
  OnodeRef& newo,
  uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid
           << " 0x" << std::hex << srcoff << "~" << length << " -> "
           << " 0x" << dstoff << "~" << length << std::dec << dendl;
  oldo->extent_map.fault_range(db, srcoff, length);
  newo->extent_map.fault_range(db, dstoff, length);

  // hmm, this could go into an ExtentMap::dup() method.
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto &e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }
  int n = 0;
  uint64_t end = srcoff + length;
  uint32_t dirty_range_begin = 0;
  uint32_t dirty_range_end = 0;
  bool src_dirty = false;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << "  src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      // blob is already duped
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(_assign_blobid(txc), e.blob);
        if (!src_dirty) {
          src_dirty = true;
          dirty_range_begin = e.logical_offset;
        }
        assert(e.logical_end() > 0);
        // -1 to exclude next potential shard
        dirty_range_end = e.logical_end() - 1;
      } else {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << "    new " << *cb << dendl;
    }

    // dup extent
    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }
    Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
                            e.blob_offset + skip_front,
                            e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << "  dst " << *ne << dendl;
    ++n;
  }
  if (src_dirty) {
    oldo->extent_map.dirty_range(dirty_range_begin,
                                 dirty_range_end - dirty_range_begin);
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(dstoff, length);
  return 0;
}
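
// Range-clone entry point: validates the range against OBJECT_MAX_SIZE and
// the source object's size, then clones either by COW (zero the target
// range, then _do_clone_range) or by read/rewrite, mirroring _clone().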

int BlueStore::_clone_range(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& oldo,
                            OnodeRef& newo,
                            uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec << dendl;
  int r = 0;

  if (srcoff + length >= OBJECT_MAX_SIZE ||
      dstoff + length >= OBJECT_MAX_SIZE) {
    r = -E2BIG;
    goto out;
  }
  if (srcoff + length > oldo->onode.size) {
    r = -EINVAL;
    goto out;
  }

  _assign_nid(txc, newo);

  if (cct->_conf->bluestore_clone_cow) {
    _do_zero(txc, c, newo, dstoff, length);
    _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
    if (r < 0)
      goto out;
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec
           << " = " << r << dendl;
  return r;
}
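
// Rename is a key rewrite, not a data move: drop the old onode key and its
// extent shard keys, write the onode back under the new key, and rename
// the entry in the in-memory onode cache.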

int BlueStore::_rename(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& oldo,
                       OnodeRef& newo,
                       const ghobject_t& new_oid)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << new_oid << dendl;
  int r;
  ghobject_t old_oid = oldo->oid;
  mempool::bluestore_cache_other::string new_okey;

  if (newo) {
    if (newo->exists) {
      r = -EEXIST;
      goto out;
    }
    assert(txc->onodes.count(newo) == 0);
  }

  txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());

  // rewrite shards
  {
    oldo->extent_map.fault_range(db, 0, oldo->onode.size);
    get_object_key(cct, new_oid, &new_okey);
    string key;
    for (auto &s : oldo->extent_map.shards) {
      generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
        [&](const string& final_key) {
          txc->t->rmkey(PREFIX_OBJ, final_key);
        });
      s.dirty = true;
    }
  }

  newo = oldo;
  txc->write_onode(newo);

  // this adjusts oldo->{oid,key}, and resets oldo to a fresh empty
  // Onode in the old slot
  c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
           << new_oid << " = " << r << dendl;
  return r;
}
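
// A collection is a single cnode_t record under PREFIX_COLL, keyed by the
// stringified coll_t; cnode.bits records how many hash bits the collection
// owns.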

int BlueStore::_create_collection(
  TransContext *txc,
  const coll_t &cid,
  unsigned bits,
  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
  int r;
  bufferlist bl;

  {
    RWLock::WLocker l(coll_lock);
    if (*c) {
      r = -EEXIST;
      goto out;
    }
    c->reset(
      new Collection(
        this,
        cache_shards[cid.hash_to_shard(cache_shards.size())],
        cid));
    (*c)->exists = true;
    (*c)->cnode.bits = bits;
    coll_map[cid] = *c;
  }
  ::encode((*c)->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(cid), bl);
  r = 0;

 out:
  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
  return r;
}
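
// Removal must prove the collection is empty.  First scan the in-memory
// onode cache (any live onode means -ENOTEMPTY), counting cached but
// nonexistent entries; then list at most nonexistent_count + 1 objects
// from the db and verify each one is a cached, deleted onode.  Anything
// else implies a live object still exists in the db.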

int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
                                  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << dendl;
  int r;

  {
    RWLock::WLocker l(coll_lock);
    if (!*c) {
      r = -ENOENT;
      goto out;
    }
    size_t nonexistent_count = 0;
    assert((*c)->exists);
    if ((*c)->onode_map.map_any([&](OnodeRef o) {
          if (o->exists) {
            dout(10) << __func__ << " " << o->oid << " " << o
                     << " exists in onode_map" << dendl;
            return true;
          }
          ++nonexistent_count;
          return false;
        })) {
      r = -ENOTEMPTY;
      goto out;
    }

    vector<ghobject_t> ls;
    ghobject_t next;
    // Enumerate onodes in db, up to nonexistent_count + 1,
    // then check if all of them are marked as non-existent.
    // Bypass the check if the returned number is greater than
    // nonexistent_count.
    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
                         nonexistent_count + 1, &ls, &next);
    if (r >= 0) {
      bool exists = false; //ls.size() > nonexistent_count;
      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
        dout(10) << __func__ << " oid " << *it << dendl;
        auto onode = (*c)->onode_map.lookup(*it);
        exists = !onode || onode->exists;
        if (exists) {
          dout(10) << __func__ << " " << *it
                   << " exists in db" << dendl;
        }
      }
      if (!exists) {
        coll_map.erase(cid);
        txc->removed_collections.push_back(*c);
        (*c)->exists = false;
        c->reset();
        txc->t->rmkey(PREFIX_COLL, stringify(cid));
        r = 0;
      } else {
        dout(10) << __func__ << " " << cid
                 << " is non-empty" << dendl;
        r = -ENOTEMPTY;
      }
    }
  }

 out:
  dout(10) << __func__ << " " << cid << " = " << r << dendl;
  return r;
}

int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  RWLock::WLocker l(c->lock);
  RWLock::WLocker l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer.  this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split.  leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.

  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  assert(is_pg);

  // the destination should initially be empty.
  assert(d->onode_map.empty());
  assert(d->shared_blob_set.empty());
  assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits.  note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  ::encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}

// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";

int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

void BlueStore::DBHistogram::update_hist_entry(
  map<string, map<int, struct key_dist> > &key_hist,
  const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len =
    MAX(key_size, key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
}

void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}
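
// Walk every key/value pair in the db once, bucketing sizes per prefix and
// counting records per prefix; PREFIX_OBJ keys are split into onodes vs.
// extent shards by their trailing ONODE_KEY_SUFFIX byte.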

// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  utime_t start = ceph_clock_now();

  KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = MAX(max_key_size, key_size);
    max_value_size = MAX(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == "b") {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  utime_t duration = ceph_clock_now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}
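
// Internal flush used on shutdown: trim every cache shard down to nothing,
// then assert that no onodes or shared blobs were left behind (dumping any
// strays first so the failure is debuggable).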

void BlueStore::_flush_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : cache_shards) {
    i->trim_all();
    assert(i->empty());
  }
  for (auto& p : coll_map) {
    if (!p.second->onode_map.empty()) {
      derr << __func__ << " stray onodes on " << p.first << dendl;
      p.second->onode_map.dump(cct, 0);
    }
    if (!p.second->shared_blob_set.empty()) {
      derr << __func__ << " stray shared blobs on " << p.first << dendl;
      p.second->shared_blob_set.dump(cct, 0);
    }
    assert(p.second->onode_map.empty());
    assert(p.second->shared_blob_set.empty());
  }
  coll_map.clear();
}

// For external callers.
// We use a best-effort policy instead, e.g.,
// we don't care if there are still some pinned onodes/data in the cache
// after this command is completed.
void BlueStore::flush_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : cache_shards) {
    i->trim_all();
  }
}
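
// Zero-fill the head and/or tail of a write buffer; callers use this to pad
// writes out to block boundaries.  The added bytes are accounted in the
// l_bluestore_write_pad_bytes perf counter.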

void BlueStore::_apply_padding(uint64_t head_pad,
                               uint64_t tail_pad,
                               bufferlist& padded)
{
  if (head_pad) {
    padded.prepend_zero(head_pad);
  }
  if (tail_pad) {
    padded.append_zero(tail_pad);
  }
  if (head_pad || tail_pad) {
    dout(20) << __func__ << "  can pad head 0x" << std::hex << head_pad
             << " tail 0x" << tail_pad << std::dec << dendl;
    logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
  }
}

// ===========================================