1 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
16 #include <sys/types.h>
20 #include "include/cpp-btree/btree_set.h"
22 #include "BlueStore.h"
24 #include "include/compat.h"
25 #include "include/intarith.h"
26 #include "include/stringify.h"
27 #include "common/errno.h"
28 #include "common/safe_io.h"
29 #include "Allocator.h"
30 #include "FreelistManager.h"
32 #include "BlueRocksEnv.h"
33 #include "auth/Crypto.h"
34 #include "common/EventTrace.h"
36 #define dout_context cct
37 #define dout_subsys ceph_subsys_bluestore
39 using bid_t
= decltype(BlueStore::Blob::id
);
41 // bluestore_cache_onode
42 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode
, bluestore_onode
,
43 bluestore_cache_onode
);
45 // bluestore_cache_other
46 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer
, bluestore_buffer
,
47 bluestore_cache_other
);
48 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent
, bluestore_extent
,
49 bluestore_cache_other
);
50 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob
, bluestore_blob
,
51 bluestore_cache_other
);
52 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob
, bluestore_shared_blob
,
53 bluestore_cache_other
);
56 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext
, bluestore_transcontext
,
61 const string PREFIX_SUPER
= "S"; // field -> value
62 const string PREFIX_STAT
= "T"; // field -> value(int64 array)
63 const string PREFIX_COLL
= "C"; // collection name -> cnode_t
64 const string PREFIX_OBJ
= "O"; // object name -> onode_t
65 const string PREFIX_OMAP
= "M"; // u64 + keyname -> value
66 const string PREFIX_DEFERRED
= "L"; // id -> deferred_transaction_t
67 const string PREFIX_ALLOC
= "B"; // u64 offset -> u64 length (freelist)
68 const string PREFIX_SHARED_BLOB
= "X"; // u64 offset -> shared_blob_t
70 // write a label in the first block. always use this size. note that
71 // bluefs makes a matching assumption about the location of its
72 // superblock (always the second block of the device).
73 #define BDEV_LABEL_BLOCK_SIZE 4096
75 // reserve: label (4k) + bluefs super (4k), which means we start at 8k.
76 #define SUPER_RESERVED 8192
78 #define OBJECT_MAX_SIZE 0xffffffff // 32 bits
82 * extent map blob encoding
84 * we use the low bits of the blobid field to indicate some common scenarios
85 * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
87 #define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
88 #define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
89 #define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
90 #define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
91 #define BLOBID_SHIFT_BITS 4
94 * object name key structure
96 * encoded u8: shard + 2^7 (so that it sorts properly)
97 * encoded u64: poolid + 2^63 (so that it sorts properly)
98 * encoded u32: hash (bit reversed)
100 * escaped string: namespace
102 * escaped string: key or object name
103 * 1 char: '<', '=', or '>'. if =, then object key == object name, and
104 * we are done. otherwise, we are followed by the object name.
105 * escaped string: object name (unless '=' above)
108 * encoded u64: generation
111 #define ONODE_KEY_SUFFIX 'o'
120 #define EXTENT_SHARD_KEY_SUFFIX 'x'
123 * string encoding in the key
125 * The key string needs to lexicographically sort the same way that
126 * ghobject_t does. We do this by escaping anything <= to '#' with #
127 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
130 * We use ! as a terminator for strings; this works because it is < #
131 * and will get escaped if it is present in the string.
135 static void append_escaped(const string
&in
, S
*out
)
138 for (string::const_iterator i
= in
.begin(); i
!= in
.end(); ++i
) {
140 snprintf(hexbyte
, sizeof(hexbyte
), "#%02x", (uint8_t)*i
);
141 out
->append(hexbyte
);
142 } else if (*i
>= '~') {
143 snprintf(hexbyte
, sizeof(hexbyte
), "~%02x", (uint8_t)*i
);
144 out
->append(hexbyte
);
152 static int decode_escaped(const char *p
, string
*out
)
154 const char *orig_p
= p
;
155 while (*p
&& *p
!= '!') {
156 if (*p
== '#' || *p
== '~') {
158 int r
= sscanf(++p
, "%2x", &hex
);
161 out
->push_back((char)hex
);
164 out
->push_back(*p
++);
170 // some things we encode in binary (as le32 or le64); print the
171 // resulting key strings nicely
173 static string
pretty_binary_string(const S
& in
)
177 out
.reserve(in
.length() * 3);
178 enum { NONE
, HEX
, STRING
} mode
= NONE
;
179 unsigned from
= 0, i
;
180 for (i
=0; i
< in
.length(); ++i
) {
181 if ((in
[i
] < 32 || (unsigned char)in
[i
] > 126) ||
182 (mode
== HEX
&& in
.length() - i
>= 4 &&
183 ((in
[i
] < 32 || (unsigned char)in
[i
] > 126) ||
184 (in
[i
+1] < 32 || (unsigned char)in
[i
+1] > 126) ||
185 (in
[i
+2] < 32 || (unsigned char)in
[i
+2] > 126) ||
186 (in
[i
+3] < 32 || (unsigned char)in
[i
+3] > 126)))) {
187 if (mode
== STRING
) {
188 out
.append(in
.c_str() + from
, i
- from
);
195 if (in
.length() - i
>= 4) {
196 // print a whole u32 at once
197 snprintf(buf
, sizeof(buf
), "%08x",
198 (uint32_t)(((unsigned char)in
[i
] << 24) |
199 ((unsigned char)in
[i
+1] << 16) |
200 ((unsigned char)in
[i
+2] << 8) |
201 ((unsigned char)in
[i
+3] << 0)));
204 snprintf(buf
, sizeof(buf
), "%02x", (int)(unsigned char)in
[i
]);
208 if (mode
!= STRING
) {
215 if (mode
== STRING
) {
216 out
.append(in
.c_str() + from
, i
- from
);
223 static void _key_encode_shard(shard_id_t shard
, T
*key
)
225 key
->push_back((char)((uint8_t)shard
.id
+ (uint8_t)0x80));
// Inverse of _key_encode_shard(): read one biased byte from `key`
// and recover the signed shard id by subtracting the 0x80 bias.
// NOTE(review): the advanced-pointer return statement is elided in
// this view of the file — confirm against the full source.
228 static const char *_key_decode_shard(const char *key
, shard_id_t
*pshard
)
230 pshard
->id
= (uint8_t)*key
- (uint8_t)0x80;
234 static void get_coll_key_range(const coll_t
& cid
, int bits
,
235 string
*temp_start
, string
*temp_end
,
236 string
*start
, string
*end
)
244 if (cid
.is_pg(&pgid
)) {
245 _key_encode_shard(pgid
.shard
, start
);
246 *temp_start
= *start
;
248 _key_encode_u64(pgid
.pool() + 0x8000000000000000ull
, start
);
249 _key_encode_u64((-2ll - pgid
.pool()) + 0x8000000000000000ull
, temp_start
);
252 *temp_end
= *temp_start
;
254 uint32_t reverse_hash
= hobject_t::_reverse_bits(pgid
.ps());
255 _key_encode_u32(reverse_hash
, start
);
256 _key_encode_u32(reverse_hash
, temp_start
);
258 uint64_t end_hash
= reverse_hash
+ (1ull << (32 - bits
));
259 if (end_hash
> 0xffffffffull
)
260 end_hash
= 0xffffffffull
;
262 _key_encode_u32(end_hash
, end
);
263 _key_encode_u32(end_hash
, temp_end
);
265 _key_encode_shard(shard_id_t::NO_SHARD
, start
);
266 _key_encode_u64(-1ull + 0x8000000000000000ull
, start
);
268 _key_encode_u32(0, start
);
269 _key_encode_u32(0xffffffff, end
);
271 // no separate temp section
// Build the key for shared blob id `sbid` (presumably used under
// PREFIX_SHARED_BLOB — confirm against callers): the encoded u64 id.
// NOTE(review): some body lines (e.g. the opening brace) are elided
// in this view of the file.
277 static void get_shared_blob_key(uint64_t sbid
, string
*key
)
280 _key_encode_u64(sbid
, key
);
// Inverse of get_shared_blob_key(): decode the u64 shared-blob id
// out of `key` into *sbid and return a status code.  The length
// check below guards the decode; its failure-return line is elided
// in this view — confirm against the full source.
283 static int get_key_shared_blob(const string
& key
, uint64_t *sbid
)
285 const char *p
= key
.c_str();
286 if (key
.length() < sizeof(uint64_t))
288 p
= _key_decode_u64(p
, sbid
);
293 static int get_key_object(const S
& key
, ghobject_t
*oid
)
296 const char *p
= key
.c_str();
298 if (key
.length() < 1 + 8 + 4)
300 p
= _key_decode_shard(p
, &oid
->shard_id
);
303 p
= _key_decode_u64(p
, &pool
);
304 oid
->hobj
.pool
= pool
- 0x8000000000000000ull
;
307 p
= _key_decode_u32(p
, &hash
);
309 oid
->hobj
.set_bitwise_key_u32(hash
);
311 r
= decode_escaped(p
, &oid
->hobj
.nspace
);
317 r
= decode_escaped(p
, &k
);
324 oid
->hobj
.oid
.name
= k
;
325 } else if (*p
== '<' || *p
== '>') {
328 r
= decode_escaped(p
, &oid
->hobj
.oid
.name
);
332 oid
->hobj
.set_key(k
);
338 p
= _key_decode_u64(p
, &oid
->hobj
.snap
.val
);
339 p
= _key_decode_u64(p
, &oid
->generation
);
341 if (*p
!= ONODE_KEY_SUFFIX
) {
346 // if we get something other than a null terminator here,
347 // something went wrong.
355 static void get_object_key(CephContext
*cct
, const ghobject_t
& oid
, S
*key
)
359 size_t max_len
= 1 + 8 + 4 +
360 (oid
.hobj
.nspace
.length() * 3 + 1) +
361 (oid
.hobj
.get_key().length() * 3 + 1) +
362 1 + // for '<', '=', or '>'
363 (oid
.hobj
.oid
.name
.length() * 3 + 1) +
365 key
->reserve(max_len
);
367 _key_encode_shard(oid
.shard_id
, key
);
368 _key_encode_u64(oid
.hobj
.pool
+ 0x8000000000000000ull
, key
);
369 _key_encode_u32(oid
.hobj
.get_bitwise_key_u32(), key
);
371 append_escaped(oid
.hobj
.nspace
, key
);
373 if (oid
.hobj
.get_key().length()) {
374 // is a key... could be < = or >.
375 append_escaped(oid
.hobj
.get_key(), key
);
376 // (ASCII chars < = and > sort in that order, yay)
377 int r
= oid
.hobj
.get_key().compare(oid
.hobj
.oid
.name
);
379 key
->append(r
> 0 ? ">" : "<");
380 append_escaped(oid
.hobj
.oid
.name
, key
);
387 append_escaped(oid
.hobj
.oid
.name
, key
);
391 _key_encode_u64(oid
.hobj
.snap
, key
);
392 _key_encode_u64(oid
.generation
, key
);
394 key
->push_back(ONODE_KEY_SUFFIX
);
399 int r
= get_key_object(*key
, &t
);
401 derr
<< " r " << r
<< dendl
;
402 derr
<< "key " << pretty_binary_string(*key
) << dendl
;
403 derr
<< "oid " << oid
<< dendl
;
404 derr
<< " t " << t
<< dendl
;
405 assert(r
== 0 && t
== oid
);
411 // extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
412 // char lets us quickly test whether it is a shard key without decoding any
413 // of the prefix bytes.
// Compose an extent shard key: the onode key, followed by the
// encoded u32 shard offset, terminated by EXTENT_SHARD_KEY_SUFFIX
// so the key kind can be recognized from its last byte alone.
// The reserve() sizes for exactly onode key + 4-byte offset + suffix.
// NOTE(review): the output-parameter line and opening brace are
// elided in this view of the file.
415 static void get_extent_shard_key(const S
& onode_key
, uint32_t offset
,
419 key
->reserve(onode_key
.length() + 4 + 1);
420 key
->append(onode_key
.c_str(), onode_key
.size());
421 _key_encode_u32(offset
, key
);
422 key
->push_back(EXTENT_SHARD_KEY_SUFFIX
);
425 static void rewrite_extent_shard_key(uint32_t offset
, string
*key
)
427 assert(key
->size() > sizeof(uint32_t) + 1);
428 assert(*key
->rbegin() == EXTENT_SHARD_KEY_SUFFIX
);
429 _key_encode_u32(offset
, key
->size() - sizeof(uint32_t) - 1, key
);
// Build (or rebuild) the extent shard key for `offset` into *key —
// constructing it from onode_key when *key is empty, otherwise just
// rewriting the embedded u32 offset in place — and then hand the
// finished key to `apply`.  NOTE(review): the `else` line and the
// apply(*key) invocation are elided in this view of the file —
// confirm against the full source.
433 static void generate_extent_shard_key_and_apply(
437 std::function
<void(const string
& final_key
)> apply
)
439 if (key
->empty()) { // make full key
440 assert(!onode_key
.empty());
441 get_extent_shard_key(onode_key
, offset
, key
);
443 rewrite_extent_shard_key(offset
, key
);
// Split an extent shard key back into its onode-key prefix and the
// u32 shard offset (layout per get_extent_shard_key(): onode key +
// encoded u32 + EXTENT_SHARD_KEY_SUFFIX).  The asserts enforce the
// minimum length and the trailing suffix byte.  NOTE(review): the
// final return statement is elided in this view of the file.
448 int get_key_extent_shard(const string
& key
, string
*onode_key
, uint32_t *offset
)
450 assert(key
.size() > sizeof(uint32_t) + 1);
451 assert(*key
.rbegin() == EXTENT_SHARD_KEY_SUFFIX
);
452 int okey_len
= key
.size() - sizeof(uint32_t) - 1;
453 *onode_key
= key
.substr(0, okey_len
);
454 const char *p
= key
.data() + okey_len
;
455 p
= _key_decode_u32(p
, offset
);
459 static bool is_extent_shard_key(const string
& key
)
461 return *key
.rbegin() == EXTENT_SHARD_KEY_SUFFIX
;
// Key for an onode's omap header: begins with the encoded u64 omap
// id.  NOTE(review): any trailing separator byte appended after the
// id is elided in this view of the file — confirm against the full
// source before relying on the exact layout.
465 static void get_omap_header(uint64_t id
, string
*out
)
467 _key_encode_u64(id
, out
);
471 // hmm, I don't think there's any need to escape the user key since we
472 // have a clean prefix.
// Key for a single omap entry: the encoded u64 omap id followed by
// the (unescaped, per the note above) user key.  NOTE(review): the
// separator byte and the append of `key` are elided in this view of
// the file — confirm against the full source.
473 static void get_omap_key(uint64_t id
, const string
& key
, string
*out
)
475 _key_encode_u64(id
, out
);
480 static void rewrite_omap_key(uint64_t id
, string old
, string
*out
)
482 _key_encode_u64(id
, out
);
483 out
->append(old
.c_str() + out
->length(), old
.size() - out
->length());
486 static void decode_omap_key(const string
& key
, string
*user_key
)
488 *user_key
= key
.substr(sizeof(uint64_t) + 1);
// Upper-bound key for iterating an onode's omap entries: the encoded
// u64 id.  NOTE(review): the trailing sentinel byte appended after
// the id is elided in this view of the file — confirm against the
// full source.
491 static void get_omap_tail(uint64_t id
, string
*out
)
493 _key_encode_u64(id
, out
);
// Key for a deferred transaction (presumably under PREFIX_DEFERRED —
// confirm against callers): just the encoded u64 sequence number,
// so deferred entries iterate in submission order.
497 static void get_deferred_key(uint64_t seq
, string
*out
)
499 _key_encode_u64(seq
, out
);
505 struct Int64ArrayMergeOperator
: public KeyValueDB::MergeOperator
{
506 void merge_nonexistent(
507 const char *rdata
, size_t rlen
, std::string
*new_value
) override
{
508 *new_value
= std::string(rdata
, rlen
);
511 const char *ldata
, size_t llen
,
512 const char *rdata
, size_t rlen
,
513 std::string
*new_value
) override
{
514 assert(llen
== rlen
);
515 assert((rlen
% 8) == 0);
516 new_value
->resize(rlen
);
517 const __le64
* lv
= (const __le64
*)ldata
;
518 const __le64
* rv
= (const __le64
*)rdata
;
519 __le64
* nv
= &(__le64
&)new_value
->at(0);
520 for (size_t i
= 0; i
< rlen
>> 3; ++i
) {
521 nv
[i
] = lv
[i
] + rv
[i
];
524 // We use each operator name and each prefix to construct the
525 // overall RocksDB operator name for consistency check at open time.
526 string
name() const override
{
527 return "int64_array";
534 ostream
& operator<<(ostream
& out
, const BlueStore::Buffer
& b
)
536 out
<< "buffer(" << &b
<< " space " << b
.space
<< " 0x" << std::hex
537 << b
.offset
<< "~" << b
.length
<< std::dec
538 << " " << BlueStore::Buffer::get_state_name(b
.state
);
540 out
<< " " << BlueStore::Buffer::get_flag_name(b
.flags
);
546 void BlueStore::GarbageCollector::process_protrusive_extents(
547 const BlueStore::ExtentMap
& extent_map
,
548 uint64_t start_offset
,
550 uint64_t start_touch_offset
,
551 uint64_t end_touch_offset
,
552 uint64_t min_alloc_size
)
554 assert(start_offset
<= start_touch_offset
&& end_offset
>= end_touch_offset
);
556 uint64_t lookup_start_offset
= P2ALIGN(start_offset
, min_alloc_size
);
557 uint64_t lookup_end_offset
= ROUND_UP_TO(end_offset
, min_alloc_size
);
559 dout(30) << __func__
<< " (hex): [" << std::hex
560 << lookup_start_offset
<< ", " << lookup_end_offset
561 << ")" << std::dec
<< dendl
;
563 for (auto it
= extent_map
.seek_lextent(lookup_start_offset
);
564 it
!= extent_map
.extent_map
.end() &&
565 it
->logical_offset
< lookup_end_offset
;
567 uint64_t alloc_unit_start
= it
->logical_offset
/ min_alloc_size
;
568 uint64_t alloc_unit_end
= (it
->logical_end() - 1) / min_alloc_size
;
570 dout(30) << __func__
<< " " << *it
571 << "alloc_units: " << alloc_unit_start
<< ".." << alloc_unit_end
574 Blob
* b
= it
->blob
.get();
576 if (it
->logical_offset
>=start_touch_offset
&&
577 it
->logical_end() <= end_touch_offset
) {
578 // Process extents within the range affected by
579 // the current write request.
580 // Need to take into account if existing extents
581 // can be merged with them (uncompressed case)
582 if (!b
->get_blob().is_compressed()) {
583 if (blob_info_counted
&& used_alloc_unit
== alloc_unit_start
) {
584 --blob_info_counted
->expected_allocations
; // don't need to allocate
585 // new AU for compressed
586 // data since another
587 // collocated uncompressed
588 // blob already exists
589 dout(30) << __func__
<< " --expected:"
590 << alloc_unit_start
<< dendl
;
592 used_alloc_unit
= alloc_unit_end
;
593 blob_info_counted
= nullptr;
595 } else if (b
->get_blob().is_compressed()) {
597 // additionally we take compressed blobs that were not impacted
598 // by the write into account too
600 affected_blobs
.emplace(
601 b
, BlobInfo(b
->get_referenced_bytes())).first
->second
;
604 (used_alloc_unit
&& used_alloc_unit
== alloc_unit_start
) ? 0 : 1;
605 bi
.expected_allocations
+= alloc_unit_end
- alloc_unit_start
+ adjust
;
606 dout(30) << __func__
<< " expected_allocations="
607 << bi
.expected_allocations
<< " end_au:"
608 << alloc_unit_end
<< dendl
;
610 blob_info_counted
= &bi
;
611 used_alloc_unit
= alloc_unit_end
;
613 assert(it
->length
<= bi
.referenced_bytes
);
614 bi
.referenced_bytes
-= it
->length
;
615 dout(30) << __func__
<< " affected_blob:" << *b
616 << " unref 0x" << std::hex
<< it
->length
617 << " referenced = 0x" << bi
.referenced_bytes
618 << std::dec
<< dendl
;
619 // NOTE: we can't move specific blob to resulting GC list here
620 // when reference counter == 0 since subsequent extents might
621 // decrement its expected_allocation.
622 // Hence need to enumerate all the extents first.
623 if (!bi
.collect_candidate
) {
624 bi
.first_lextent
= it
;
625 bi
.collect_candidate
= true;
627 bi
.last_lextent
= it
;
629 if (blob_info_counted
&& used_alloc_unit
== alloc_unit_start
) {
630 // don't need to allocate new AU for compressed data since another
631 // collocated uncompressed blob already exists
632 --blob_info_counted
->expected_allocations
;
633 dout(30) << __func__
<< " --expected_allocations:"
634 << alloc_unit_start
<< dendl
;
636 used_alloc_unit
= alloc_unit_end
;
637 blob_info_counted
= nullptr;
641 for (auto b_it
= affected_blobs
.begin();
642 b_it
!= affected_blobs
.end();
644 Blob
* b
= b_it
->first
;
645 BlobInfo
& bi
= b_it
->second
;
646 if (bi
.referenced_bytes
== 0) {
647 uint64_t len_on_disk
= b_it
->first
->get_blob().get_ondisk_length();
648 int64_t blob_expected_for_release
=
649 ROUND_UP_TO(len_on_disk
, min_alloc_size
) / min_alloc_size
;
651 dout(30) << __func__
<< " " << *(b_it
->first
)
652 << " expected4release=" << blob_expected_for_release
653 << " expected_allocations=" << bi
.expected_allocations
655 int64_t benefit
= blob_expected_for_release
- bi
.expected_allocations
;
656 if (benefit
>= g_conf
->bluestore_gc_enable_blob_threshold
) {
657 if (bi
.collect_candidate
) {
658 auto it
= bi
.first_lextent
;
661 if (it
->blob
.get() == b
) {
662 extents_to_collect
.emplace_back(it
->logical_offset
, it
->length
);
664 bExit
= it
== bi
.last_lextent
;
668 expected_for_release
+= blob_expected_for_release
;
669 expected_allocations
+= bi
.expected_allocations
;
675 int64_t BlueStore::GarbageCollector::estimate(
676 uint64_t start_offset
,
678 const BlueStore::ExtentMap
& extent_map
,
679 const BlueStore::old_extent_map_t
& old_extents
,
680 uint64_t min_alloc_size
)
683 affected_blobs
.clear();
684 extents_to_collect
.clear();
685 used_alloc_unit
= boost::optional
<uint64_t >();
686 blob_info_counted
= nullptr;
688 gc_start_offset
= start_offset
;
689 gc_end_offset
= start_offset
+ length
;
691 uint64_t end_offset
= start_offset
+ length
;
693 for (auto it
= old_extents
.begin(); it
!= old_extents
.end(); ++it
) {
694 Blob
* b
= it
->e
.blob
.get();
695 if (b
->get_blob().is_compressed()) {
697 // update gc_start_offset/gc_end_offset if needed
698 gc_start_offset
= min(gc_start_offset
, (uint64_t)it
->e
.blob_start());
699 gc_end_offset
= max(gc_end_offset
, (uint64_t)it
->e
.blob_end());
701 auto o
= it
->e
.logical_offset
;
702 auto l
= it
->e
.length
;
704 uint64_t ref_bytes
= b
->get_referenced_bytes();
705 // micro optimization to bypass blobs that have no more references
706 if (ref_bytes
!= 0) {
707 dout(30) << __func__
<< " affected_blob:" << *b
708 << " unref 0x" << std::hex
<< o
<< "~" << l
709 << std::dec
<< dendl
;
710 affected_blobs
.emplace(b
, BlobInfo(ref_bytes
));
714 dout(30) << __func__
<< " gc range(hex): [" << std::hex
715 << gc_start_offset
<< ", " << gc_end_offset
716 << ")" << std::dec
<< dendl
;
718 // enumerate preceding extents to check if they reference affected blobs
719 if (gc_start_offset
< start_offset
|| gc_end_offset
> end_offset
) {
720 process_protrusive_extents(extent_map
,
727 return expected_for_release
- expected_allocations
;
732 BlueStore::Cache
*BlueStore::Cache::create(CephContext
* cct
, string type
,
733 PerfCounters
*logger
)
738 c
= new LRUCache(cct
);
739 else if (type
== "2q")
740 c
= new TwoQCache(cct
);
742 assert(0 == "unrecognized cache type");
748 void BlueStore::Cache::trim_all()
750 std::lock_guard
<std::recursive_mutex
> l(lock
);
754 void BlueStore::Cache::trim(
755 uint64_t target_bytes
,
756 float target_meta_ratio
,
757 float target_data_ratio
,
758 float bytes_per_onode
)
760 std::lock_guard
<std::recursive_mutex
> l(lock
);
761 uint64_t current_meta
= _get_num_onodes() * bytes_per_onode
;
762 uint64_t current_buffer
= _get_buffer_bytes();
763 uint64_t current
= current_meta
+ current_buffer
;
765 uint64_t target_meta
= target_bytes
* target_meta_ratio
;
766 uint64_t target_buffer
= target_bytes
* target_data_ratio
;
768 // correct for overflow or float imprecision
769 target_meta
= min(target_bytes
, target_meta
);
770 target_buffer
= min(target_bytes
- target_meta
, target_buffer
);
772 if (current
<= target_bytes
) {
774 << " shard target " << pretty_si_t(target_bytes
)
775 << " meta/data ratios " << target_meta_ratio
776 << " + " << target_data_ratio
<< " ("
777 << pretty_si_t(target_meta
) << " + "
778 << pretty_si_t(target_buffer
) << "), "
779 << " current " << pretty_si_t(current
) << " ("
780 << pretty_si_t(current_meta
) << " + "
781 << pretty_si_t(current_buffer
) << ")"
786 uint64_t need_to_free
= current
- target_bytes
;
787 uint64_t free_buffer
= 0;
788 uint64_t free_meta
= 0;
789 if (current_buffer
> target_buffer
) {
790 free_buffer
= current_buffer
- target_buffer
;
791 if (free_buffer
> need_to_free
) {
792 free_buffer
= need_to_free
;
795 free_meta
= need_to_free
- free_buffer
;
797 // start bounds at what we have now
798 uint64_t max_buffer
= current_buffer
- free_buffer
;
799 uint64_t max_meta
= current_meta
- free_meta
;
800 uint64_t max_onodes
= max_meta
/ bytes_per_onode
;
803 << " shard target " << pretty_si_t(target_bytes
)
804 << " ratio " << target_meta_ratio
<< " ("
805 << pretty_si_t(target_meta
) << " + "
806 << pretty_si_t(target_buffer
) << "), "
807 << " current " << pretty_si_t(current
) << " ("
808 << pretty_si_t(current_meta
) << " + "
809 << pretty_si_t(current_buffer
) << "),"
810 << " need_to_free " << pretty_si_t(need_to_free
) << " ("
811 << pretty_si_t(free_meta
) << " + "
812 << pretty_si_t(free_buffer
) << ")"
813 << " -> max " << max_onodes
<< " onodes + "
814 << max_buffer
<< " buffer"
816 _trim(max_onodes
, max_buffer
);
822 #define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
824 void BlueStore::LRUCache::_touch_onode(OnodeRef
& o
)
826 auto p
= onode_lru
.iterator_to(*o
);
828 onode_lru
.push_front(*o
);
831 void BlueStore::LRUCache::_trim(uint64_t onode_max
, uint64_t buffer_max
)
833 dout(20) << __func__
<< " onodes " << onode_lru
.size() << " / " << onode_max
834 << " buffers " << buffer_size
<< " / " << buffer_max
837 _audit("trim start");
840 while (buffer_size
> buffer_max
) {
841 auto i
= buffer_lru
.rbegin();
842 if (i
== buffer_lru
.rend()) {
843 // stop if buffer_lru is now empty
848 assert(b
->is_clean());
849 dout(20) << __func__
<< " rm " << *b
<< dendl
;
850 b
->space
->_rm_buffer(this, b
);
854 int num
= onode_lru
.size() - onode_max
;
856 return; // don't even try
858 auto p
= onode_lru
.end();
859 assert(p
!= onode_lru
.begin());
862 int max_skipped
= g_conf
->bluestore_cache_trim_max_skip_pinned
;
865 int refs
= o
->nref
.load();
867 dout(20) << __func__
<< " " << o
->oid
<< " has " << refs
868 << " refs, skipping" << dendl
;
869 if (++skipped
>= max_skipped
) {
870 dout(20) << __func__
<< " maximum skip pinned reached; stopping with "
871 << num
<< " left to trim" << dendl
;
875 if (p
== onode_lru
.begin()) {
883 dout(30) << __func__
<< " rm " << o
->oid
<< dendl
;
884 if (p
!= onode_lru
.begin()) {
885 onode_lru
.erase(p
--);
890 o
->get(); // paranoia
891 o
->c
->onode_map
.remove(o
->oid
);
898 void BlueStore::LRUCache::_audit(const char *when
)
900 dout(10) << __func__
<< " " << when
<< " start" << dendl
;
902 for (auto i
= buffer_lru
.begin(); i
!= buffer_lru
.end(); ++i
) {
905 if (s
!= buffer_size
) {
906 derr
<< __func__
<< " buffer_size " << buffer_size
<< " actual " << s
908 for (auto i
= buffer_lru
.begin(); i
!= buffer_lru
.end(); ++i
) {
909 derr
<< __func__
<< " " << *i
<< dendl
;
911 assert(s
== buffer_size
);
913 dout(20) << __func__
<< " " << when
<< " buffer_size " << buffer_size
920 #define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
923 void BlueStore::TwoQCache::_touch_onode(OnodeRef
& o
)
925 auto p
= onode_lru
.iterator_to(*o
);
927 onode_lru
.push_front(*o
);
930 void BlueStore::TwoQCache::_add_buffer(Buffer
*b
, int level
, Buffer
*near
)
932 dout(20) << __func__
<< " level " << level
<< " near " << near
934 << " which has cache_private " << b
->cache_private
<< dendl
;
936 b
->cache_private
= near
->cache_private
;
937 switch (b
->cache_private
) {
939 buffer_warm_in
.insert(buffer_warm_in
.iterator_to(*near
), *b
);
941 case BUFFER_WARM_OUT
:
942 assert(b
->is_empty());
943 buffer_warm_out
.insert(buffer_warm_out
.iterator_to(*near
), *b
);
946 buffer_hot
.insert(buffer_hot
.iterator_to(*near
), *b
);
949 assert(0 == "bad cache_private");
951 } else if (b
->cache_private
== BUFFER_NEW
) {
952 b
->cache_private
= BUFFER_WARM_IN
;
954 buffer_warm_in
.push_front(*b
);
956 // take caller hint to start at the back of the warm queue
957 buffer_warm_in
.push_back(*b
);
960 // we got a hint from discard
961 switch (b
->cache_private
) {
963 // stay in warm_in. move to front, even though 2Q doesn't actually
965 dout(20) << __func__
<< " move to front of warm " << *b
<< dendl
;
966 buffer_warm_in
.push_front(*b
);
968 case BUFFER_WARM_OUT
:
969 b
->cache_private
= BUFFER_HOT
;
970 // move to hot. fall-thru
972 dout(20) << __func__
<< " move to front of hot " << *b
<< dendl
;
973 buffer_hot
.push_front(*b
);
976 assert(0 == "bad cache_private");
979 if (!b
->is_empty()) {
980 buffer_bytes
+= b
->length
;
981 buffer_list_bytes
[b
->cache_private
] += b
->length
;
985 void BlueStore::TwoQCache::_rm_buffer(Buffer
*b
)
987 dout(20) << __func__
<< " " << *b
<< dendl
;
988 if (!b
->is_empty()) {
989 assert(buffer_bytes
>= b
->length
);
990 buffer_bytes
-= b
->length
;
991 assert(buffer_list_bytes
[b
->cache_private
] >= b
->length
);
992 buffer_list_bytes
[b
->cache_private
] -= b
->length
;
994 switch (b
->cache_private
) {
996 buffer_warm_in
.erase(buffer_warm_in
.iterator_to(*b
));
998 case BUFFER_WARM_OUT
:
999 buffer_warm_out
.erase(buffer_warm_out
.iterator_to(*b
));
1002 buffer_hot
.erase(buffer_hot
.iterator_to(*b
));
1005 assert(0 == "bad cache_private");
1009 void BlueStore::TwoQCache::_move_buffer(Cache
*srcc
, Buffer
*b
)
1011 TwoQCache
*src
= static_cast<TwoQCache
*>(srcc
);
1014 // preserve which list we're on (even if we can't preserve the order!)
1015 switch (b
->cache_private
) {
1016 case BUFFER_WARM_IN
:
1017 assert(!b
->is_empty());
1018 buffer_warm_in
.push_back(*b
);
1020 case BUFFER_WARM_OUT
:
1021 assert(b
->is_empty());
1022 buffer_warm_out
.push_back(*b
);
1025 assert(!b
->is_empty());
1026 buffer_hot
.push_back(*b
);
1029 assert(0 == "bad cache_private");
1031 if (!b
->is_empty()) {
1032 buffer_bytes
+= b
->length
;
1033 buffer_list_bytes
[b
->cache_private
] += b
->length
;
1037 void BlueStore::TwoQCache::_adjust_buffer_size(Buffer
*b
, int64_t delta
)
1039 dout(20) << __func__
<< " delta " << delta
<< " on " << *b
<< dendl
;
1040 if (!b
->is_empty()) {
1041 assert((int64_t)buffer_bytes
+ delta
>= 0);
1042 buffer_bytes
+= delta
;
1043 assert((int64_t)buffer_list_bytes
[b
->cache_private
] + delta
>= 0);
1044 buffer_list_bytes
[b
->cache_private
] += delta
;
1048 void BlueStore::TwoQCache::_trim(uint64_t onode_max
, uint64_t buffer_max
)
1050 dout(20) << __func__
<< " onodes " << onode_lru
.size() << " / " << onode_max
1051 << " buffers " << buffer_bytes
<< " / " << buffer_max
1054 _audit("trim start");
1057 if (buffer_bytes
> buffer_max
) {
1058 uint64_t kin
= buffer_max
* cct
->_conf
->bluestore_2q_cache_kin_ratio
;
1059 uint64_t khot
= buffer_max
- kin
;
1061 // pre-calculate kout based on average buffer size too,
1062 // which is typical(the warm_in and hot lists may change later)
1064 uint64_t buffer_num
= buffer_hot
.size() + buffer_warm_in
.size();
1066 uint64_t buffer_avg_size
= buffer_bytes
/ buffer_num
;
1067 assert(buffer_avg_size
);
1068 uint64_t calculated_buffer_num
= buffer_max
/ buffer_avg_size
;
1069 kout
= calculated_buffer_num
* cct
->_conf
->bluestore_2q_cache_kout_ratio
;
1072 if (buffer_list_bytes
[BUFFER_HOT
] < khot
) {
1073 // hot is small, give slack to warm_in
1074 kin
+= khot
- buffer_list_bytes
[BUFFER_HOT
];
1075 } else if (buffer_list_bytes
[BUFFER_WARM_IN
] < kin
) {
1076 // warm_in is small, give slack to hot
1077 khot
+= kin
- buffer_list_bytes
[BUFFER_WARM_IN
];
1080 // adjust warm_in list
1081 int64_t to_evict_bytes
= buffer_list_bytes
[BUFFER_WARM_IN
] - kin
;
1082 uint64_t evicted
= 0;
1084 while (to_evict_bytes
> 0) {
1085 auto p
= buffer_warm_in
.rbegin();
1086 if (p
== buffer_warm_in
.rend()) {
1087 // stop if warm_in list is now empty
1092 assert(b
->is_clean());
1093 dout(20) << __func__
<< " buffer_warm_in -> out " << *b
<< dendl
;
1094 assert(buffer_bytes
>= b
->length
);
1095 buffer_bytes
-= b
->length
;
1096 assert(buffer_list_bytes
[BUFFER_WARM_IN
] >= b
->length
);
1097 buffer_list_bytes
[BUFFER_WARM_IN
] -= b
->length
;
1098 to_evict_bytes
-= b
->length
;
1099 evicted
+= b
->length
;
1100 b
->state
= Buffer::STATE_EMPTY
;
1102 buffer_warm_in
.erase(buffer_warm_in
.iterator_to(*b
));
1103 buffer_warm_out
.push_front(*b
);
1104 b
->cache_private
= BUFFER_WARM_OUT
;
1108 dout(20) << __func__
<< " evicted " << prettybyte_t(evicted
)
1109 << " from warm_in list, done evicting warm_in buffers"
1114 to_evict_bytes
= buffer_list_bytes
[BUFFER_HOT
] - khot
;
1117 while (to_evict_bytes
> 0) {
1118 auto p
= buffer_hot
.rbegin();
1119 if (p
== buffer_hot
.rend()) {
1120 // stop if hot list is now empty
1125 dout(20) << __func__
<< " buffer_hot rm " << *b
<< dendl
;
1126 assert(b
->is_clean());
1127 // adjust evict size before buffer goes invalid
1128 to_evict_bytes
-= b
->length
;
1129 evicted
+= b
->length
;
1130 b
->space
->_rm_buffer(this, b
);
1134 dout(20) << __func__
<< " evicted " << prettybyte_t(evicted
)
1135 << " from hot list, done evicting hot buffers"
1139 // adjust warm out list too, if necessary
1140 int64_t num
= buffer_warm_out
.size() - kout
;
1142 Buffer
*b
= &*buffer_warm_out
.rbegin();
1143 assert(b
->is_empty());
1144 dout(20) << __func__
<< " buffer_warm_out rm " << *b
<< dendl
;
1145 b
->space
->_rm_buffer(this, b
);
1150 int num
= onode_lru
.size() - onode_max
;
1152 return; // don't even try
1154 auto p
= onode_lru
.end();
1155 assert(p
!= onode_lru
.begin());
1158 int max_skipped
= g_conf
->bluestore_cache_trim_max_skip_pinned
;
1161 dout(20) << __func__
<< " considering " << o
<< dendl
;
1162 int refs
= o
->nref
.load();
1164 dout(20) << __func__
<< " " << o
->oid
<< " has " << refs
1165 << " refs; skipping" << dendl
;
1166 if (++skipped
>= max_skipped
) {
1167 dout(20) << __func__
<< " maximum skip pinned reached; stopping with "
1168 << num
<< " left to trim" << dendl
;
1172 if (p
== onode_lru
.begin()) {
1180 dout(30) << __func__
<< " " << o
->oid
<< " num=" << num
<<" lru size="<<onode_lru
.size()<< dendl
;
1181 if (p
!= onode_lru
.begin()) {
1182 onode_lru
.erase(p
--);
1187 o
->get(); // paranoia
1188 o
->c
->onode_map
.remove(o
->oid
);
1195 void BlueStore::TwoQCache::_audit(const char *when
)
1197 dout(10) << __func__
<< " " << when
<< " start" << dendl
;
1199 for (auto i
= buffer_hot
.begin(); i
!= buffer_hot
.end(); ++i
) {
1203 uint64_t hot_bytes
= s
;
1204 if (hot_bytes
!= buffer_list_bytes
[BUFFER_HOT
]) {
1205 derr
<< __func__
<< " hot_list_bytes "
1206 << buffer_list_bytes
[BUFFER_HOT
]
1207 << " != actual " << hot_bytes
1209 assert(hot_bytes
== buffer_list_bytes
[BUFFER_HOT
]);
1212 for (auto i
= buffer_warm_in
.begin(); i
!= buffer_warm_in
.end(); ++i
) {
1216 uint64_t warm_in_bytes
= s
- hot_bytes
;
1217 if (warm_in_bytes
!= buffer_list_bytes
[BUFFER_WARM_IN
]) {
1218 derr
<< __func__
<< " warm_in_list_bytes "
1219 << buffer_list_bytes
[BUFFER_WARM_IN
]
1220 << " != actual " << warm_in_bytes
1222 assert(warm_in_bytes
== buffer_list_bytes
[BUFFER_WARM_IN
]);
1225 if (s
!= buffer_bytes
) {
1226 derr
<< __func__
<< " buffer_bytes " << buffer_bytes
<< " actual " << s
1228 assert(s
== buffer_bytes
);
1231 dout(20) << __func__
<< " " << when
<< " buffer_bytes " << buffer_bytes
1240 #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
1242 void BlueStore::BufferSpace::_clear(Cache
* cache
)
1244 // note: we already hold cache->lock
1245 ldout(cache
->cct
, 20) << __func__
<< dendl
;
1246 while (!buffer_map
.empty()) {
1247 _rm_buffer(cache
, buffer_map
.begin());
1251 int BlueStore::BufferSpace::_discard(Cache
* cache
, uint32_t offset
, uint32_t length
)
1253 // note: we already hold cache->lock
1254 ldout(cache
->cct
, 20) << __func__
<< std::hex
<< " 0x" << offset
<< "~" << length
1255 << std::dec
<< dendl
;
1256 int cache_private
= 0;
1257 cache
->_audit("discard start");
1258 auto i
= _data_lower_bound(offset
);
1259 uint32_t end
= offset
+ length
;
1260 while (i
!= buffer_map
.end()) {
1261 Buffer
*b
= i
->second
.get();
1262 if (b
->offset
>= end
) {
1265 if (b
->cache_private
> cache_private
) {
1266 cache_private
= b
->cache_private
;
1268 if (b
->offset
< offset
) {
1269 int64_t front
= offset
- b
->offset
;
1270 if (b
->end() > end
) {
1271 // drop middle (split)
1272 uint32_t tail
= b
->end() - end
;
1273 if (b
->data
.length()) {
1275 bl
.substr_of(b
->data
, b
->length
- tail
, tail
);
1276 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
);
1277 nb
->maybe_rebuild();
1278 _add_buffer(cache
, nb
, 0, b
);
1280 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, tail
),
1283 if (!b
->is_writing()) {
1284 cache
->_adjust_buffer_size(b
, front
- (int64_t)b
->length
);
1288 cache
->_audit("discard end 1");
1292 if (!b
->is_writing()) {
1293 cache
->_adjust_buffer_size(b
, front
- (int64_t)b
->length
);
1301 if (b
->end() <= end
) {
1302 // drop entire buffer
1303 _rm_buffer(cache
, i
++);
1307 uint32_t keep
= b
->end() - end
;
1308 if (b
->data
.length()) {
1310 bl
.substr_of(b
->data
, b
->length
- keep
, keep
);
1311 Buffer
*nb
= new Buffer(this, b
->state
, b
->seq
, end
, bl
);
1312 nb
->maybe_rebuild();
1313 _add_buffer(cache
, nb
, 0, b
);
1315 _add_buffer(cache
, new Buffer(this, b
->state
, b
->seq
, end
, keep
), 0, b
);
1317 _rm_buffer(cache
, i
);
1318 cache
->_audit("discard end 2");
1321 return cache_private
;
1324 void BlueStore::BufferSpace::read(
1326 uint32_t offset
, uint32_t length
,
1327 BlueStore::ready_regions_t
& res
,
1328 interval_set
<uint32_t>& res_intervals
)
1330 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
1332 res_intervals
.clear();
1333 uint32_t want_bytes
= length
;
1334 uint32_t end
= offset
+ length
;
1335 for (auto i
= _data_lower_bound(offset
);
1336 i
!= buffer_map
.end() && offset
< end
&& i
->first
< end
;
1338 Buffer
*b
= i
->second
.get();
1339 assert(b
->end() > offset
);
1340 if (b
->is_writing() || b
->is_clean()) {
1341 if (b
->offset
< offset
) {
1342 uint32_t skip
= offset
- b
->offset
;
1343 uint32_t l
= MIN(length
, b
->length
- skip
);
1344 res
[offset
].substr_of(b
->data
, skip
, l
);
1345 res_intervals
.insert(offset
, l
);
1348 if (!b
->is_writing()) {
1349 cache
->_touch_buffer(b
);
1353 if (b
->offset
> offset
) {
1354 uint32_t gap
= b
->offset
- offset
;
1355 if (length
<= gap
) {
1361 if (!b
->is_writing()) {
1362 cache
->_touch_buffer(b
);
1364 if (b
->length
> length
) {
1365 res
[offset
].substr_of(b
->data
, 0, length
);
1366 res_intervals
.insert(offset
, length
);
1369 res
[offset
].append(b
->data
);
1370 res_intervals
.insert(offset
, b
->length
);
1371 if (b
->length
== length
)
1373 offset
+= b
->length
;
1374 length
-= b
->length
;
1379 uint64_t hit_bytes
= res_intervals
.size();
1380 assert(hit_bytes
<= want_bytes
);
1381 uint64_t miss_bytes
= want_bytes
- hit_bytes
;
1382 cache
->logger
->inc(l_bluestore_buffer_hit_bytes
, hit_bytes
);
1383 cache
->logger
->inc(l_bluestore_buffer_miss_bytes
, miss_bytes
);
1386 void BlueStore::BufferSpace::finish_write(Cache
* cache
, uint64_t seq
)
1388 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
1390 auto i
= writing
.begin();
1391 while (i
!= writing
.end()) {
1401 assert(b
->is_writing());
1403 if (b
->flags
& Buffer::FLAG_NOCACHE
) {
1405 ldout(cache
->cct
, 20) << __func__
<< " discard " << *b
<< dendl
;
1406 buffer_map
.erase(b
->offset
);
1408 b
->state
= Buffer::STATE_CLEAN
;
1411 b
->data
.reassign_to_mempool(mempool::mempool_bluestore_cache_data
);
1412 cache
->_add_buffer(b
, 1, nullptr);
1413 ldout(cache
->cct
, 20) << __func__
<< " added " << *b
<< dendl
;
1417 cache
->_audit("finish_write end");
1420 void BlueStore::BufferSpace::split(Cache
* cache
, size_t pos
, BlueStore::BufferSpace
&r
)
1422 std::lock_guard
<std::recursive_mutex
> lk(cache
->lock
);
1423 if (buffer_map
.empty())
1426 auto p
= --buffer_map
.end();
1428 if (p
->second
->end() <= pos
)
1431 if (p
->second
->offset
< pos
) {
1432 ldout(cache
->cct
, 30) << __func__
<< " cut " << *p
->second
<< dendl
;
1433 size_t left
= pos
- p
->second
->offset
;
1434 size_t right
= p
->second
->length
- left
;
1435 if (p
->second
->data
.length()) {
1437 bl
.substr_of(p
->second
->data
, left
, right
);
1438 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
, 0, bl
),
1439 0, p
->second
.get());
1441 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
, 0, right
),
1442 0, p
->second
.get());
1444 cache
->_adjust_buffer_size(p
->second
.get(), -right
);
1445 p
->second
->truncate(left
);
1449 assert(p
->second
->end() > pos
);
1450 ldout(cache
->cct
, 30) << __func__
<< " move " << *p
->second
<< dendl
;
1451 if (p
->second
->data
.length()) {
1452 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
,
1453 p
->second
->offset
- pos
, p
->second
->data
),
1454 0, p
->second
.get());
1456 r
._add_buffer(cache
, new Buffer(&r
, p
->second
->state
, p
->second
->seq
,
1457 p
->second
->offset
- pos
, p
->second
->length
),
1458 0, p
->second
.get());
1460 if (p
== buffer_map
.begin()) {
1461 _rm_buffer(cache
, p
);
1464 _rm_buffer(cache
, p
--);
1467 assert(writing
.empty());
1473 #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
// OnodeSpace::add — register Onode `o` under `oid` in this collection's
// onode map.
// NOTE(review): this chunk is a garbled extraction; the dropped physical
// lines (the early return for the raced case, the map insertion, the
// final return and braces) are not visible here, so comments describe
// only what the surviving tokens show.
1475 BlueStore::OnodeRef
BlueStore::OnodeSpace::add(const ghobject_t
& oid
, OnodeRef o
)
// Serialize against concurrent cache users; cache->lock is recursive.
1477 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
// Two callers may race to add the same oid; probe for an existing entry.
1478 auto p
= onode_map
.find(oid
);
1479 if (p
!= onode_map
.end()) {
// Lost the race: log and presumably return the existing entry (the
// return statement was dropped by the extraction — TODO confirm).
1480 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " " << o
1481 << " raced, returning existing " << p
->second
1485 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " " << o
<< dendl
;
// Account the newly-added onode in the cache's LRU/2Q bookkeeping.
1487 cache
->_add_onode(o
, 1);
1491 BlueStore::OnodeRef
BlueStore::OnodeSpace::lookup(const ghobject_t
& oid
)
1493 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
1494 ldout(cache
->cct
, 30) << __func__
<< dendl
;
1495 ceph::unordered_map
<ghobject_t
,OnodeRef
>::iterator p
= onode_map
.find(oid
);
1496 if (p
== onode_map
.end()) {
1497 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " miss" << dendl
;
1498 cache
->logger
->inc(l_bluestore_onode_misses
);
1501 ldout(cache
->cct
, 30) << __func__
<< " " << oid
<< " hit " << p
->second
1503 cache
->_touch_onode(p
->second
);
1504 cache
->logger
->inc(l_bluestore_onode_hits
);
// OnodeSpace::clear — drop every cached onode belonging to this space.
// NOTE(review): garbled extraction; the trailing map cleanup and closing
// braces were dropped from this view.
1508 void BlueStore::OnodeSpace::clear()
// Hold the cache lock for the whole sweep.
1510 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
1511 ldout(cache
->cct
, 10) << __func__
<< dendl
;
// Unhook each onode from the cache-level structures (LRU/2Q lists).
1512 for (auto &p
: onode_map
) {
1513 cache
->_rm_onode(p
.second
);
1518 bool BlueStore::OnodeSpace::empty()
1520 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
1521 return onode_map
.empty();
1524 void BlueStore::OnodeSpace::rename(
1526 const ghobject_t
& old_oid
,
1527 const ghobject_t
& new_oid
,
1528 const mempool::bluestore_cache_other::string
& new_okey
)
1530 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
1531 ldout(cache
->cct
, 30) << __func__
<< " " << old_oid
<< " -> " << new_oid
1533 ceph::unordered_map
<ghobject_t
,OnodeRef
>::iterator po
, pn
;
1534 po
= onode_map
.find(old_oid
);
1535 pn
= onode_map
.find(new_oid
);
1538 assert(po
!= onode_map
.end());
1539 if (pn
!= onode_map
.end()) {
1540 ldout(cache
->cct
, 30) << __func__
<< " removing target " << pn
->second
1542 cache
->_rm_onode(pn
->second
);
1543 onode_map
.erase(pn
);
1545 OnodeRef o
= po
->second
;
1547 // install a non-existent onode at old location
1548 oldo
.reset(new Onode(o
->c
, old_oid
, o
->key
));
1550 cache
->_add_onode(po
->second
, 1);
1552 // add at new position and fix oid, key
1553 onode_map
.insert(make_pair(new_oid
, o
));
1554 cache
->_touch_onode(o
);
1559 bool BlueStore::OnodeSpace::map_any(std::function
<bool(OnodeRef
)> f
)
1561 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
);
1562 ldout(cache
->cct
, 20) << __func__
<< dendl
;
1563 for (auto& i
: onode_map
) {
1575 #define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
1577 ostream
& operator<<(ostream
& out
, const BlueStore::SharedBlob
& sb
)
1579 out
<< "SharedBlob(" << &sb
;
1582 out
<< " loaded " << *sb
.persistent
;
1584 out
<< " sbid 0x" << std::hex
<< sb
.sbid_unloaded
<< std::dec
;
// SharedBlob — construct an unloaded shared blob for collection `_coll`
// with shared-blob id `i`; the persistent record is loaded lazily later.
1589 BlueStore::SharedBlob::SharedBlob(uint64_t i
, Collection
*_coll
)
1590 : coll(_coll
), sbid_unloaded(i
)
// A zero shared-blob id is invalid here (unshared blobs use id 0).
1592 assert(sbid_unloaded
> 0);
// Register this blob with the collection's cache statistics.
// NOTE(review): the extraction dropped surrounding lines; there may be a
// null-cache guard around this call — TODO confirm.
1594 get_cache()->add_blob();
1598 BlueStore::SharedBlob::~SharedBlob()
1600 if (get_cache()) { // the dummy instances have a nullptr
1601 std::lock_guard
<std::recursive_mutex
> l(get_cache()->lock
);
1602 bc
._clear(get_cache());
1603 get_cache()->rm_blob();
1605 if (loaded
&& persistent
) {
1610 void BlueStore::SharedBlob::put()
1613 ldout(coll
->store
->cct
, 20) << __func__
<< " " << this
1614 << " removing self from set " << get_parent()
1617 if (get_parent()->remove(this)) {
1620 ldout(coll
->store
->cct
, 20)
1621 << __func__
<< " " << this << " lost race to remove myself from set"
// SharedBlob::get_ref — take references on `length` bytes starting at
// `offset` in the persistent ref_map.
// NOTE(review): garbled extraction; any precondition check on
// `persistent` being loaded was dropped from this view — TODO confirm.
1630 void BlueStore::SharedBlob::get_ref(uint64_t offset
, uint32_t length
)
1633 persistent
->ref_map
.get(offset
, length
);
// SharedBlob::put_ref — drop references on [offset, offset+length); the
// released extents are reported via `r`, and when the blob may have
// become fully unshared, `this` is recorded in *maybe_unshared so the
// caller can attempt an unshare later.
// NOTE(review): garbled extraction — the declarations of `r` and `maybe`
// and the enclosing braces were dropped from this view.
1636 void BlueStore::SharedBlob::put_ref(uint64_t offset
, uint32_t length
,
1638 set
<SharedBlob
*> *maybe_unshared
)
// Unshare detection is only computed when the caller asked for it.
1642 persistent
->ref_map
.put(offset
, length
, r
, maybe_unshared
? &maybe
: nullptr);
1643 if (maybe_unshared
&& maybe
) {
1644 maybe_unshared
->insert(this);
1651 #define dout_prefix *_dout << "bluestore.blob(" << this << ") "
1653 ostream
& operator<<(ostream
& out
, const BlueStore::Blob
& b
)
1655 out
<< "Blob(" << &b
;
1656 if (b
.is_spanning()) {
1657 out
<< " spanning " << b
.id
;
1659 out
<< " " << b
.get_blob() << " " << b
.get_blob_use_tracker()
1660 << " " << *b
.shared_blob
1665 void BlueStore::Blob::discard_unallocated(Collection
*coll
)
1667 if (blob
.is_shared()) {
1670 if (blob
.is_compressed()) {
1671 bool discard
= false;
1672 bool all_invalid
= true;
1673 for (auto e
: blob
.get_extents()) {
1674 if (!e
.is_valid()) {
1677 all_invalid
= false;
1680 assert(discard
== all_invalid
); // in case of compressed blob all
1681 // or none pextents are invalid.
1683 shared_blob
->bc
.discard(shared_blob
->get_cache(), 0, blob
.get_logical_length());
1687 for (auto e
: blob
.get_extents()) {
1688 if (!e
.is_valid()) {
1689 ldout(coll
->store
->cct
, 20) << __func__
<< " 0x" << std::hex
<< pos
1691 << std::dec
<< dendl
;
1692 shared_blob
->bc
.discard(shared_blob
->get_cache(), pos
, e
.length
);
1696 if (blob
.can_prune_tail()) {
1699 used_in_blob
.prune_tail(blob
.get_ondisk_length());
1700 auto cct
= coll
->store
->cct
; //used by dout
1701 dout(20) << __func__
<< " pruned tail, now " << blob
<< dendl
;
1706 void BlueStore::Blob::get_ref(
1711 // Caller has to initialize Blob's logical length prior to increment
1712 // references. Otherwise one is neither unable to determine required
1713 // amount of counters in case of per-au tracking nor obtain min_release_size
1714 // for single counter mode.
1715 assert(get_blob().get_logical_length() != 0);
1716 auto cct
= coll
->store
->cct
;
1717 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
1718 << std::dec
<< " " << *this << dendl
;
1720 if (used_in_blob
.is_empty()) {
1721 uint32_t min_release_size
=
1722 blob
.get_release_size(coll
->store
->min_alloc_size
);
1723 uint64_t l
= blob
.get_logical_length();
1724 dout(20) << __func__
<< " init 0x" << std::hex
<< l
<< ", " << min_release_size
1725 << std::dec
<< dendl
;
1726 used_in_blob
.init(l
, min_release_size
);
1733 bool BlueStore::Blob::put_ref(
1739 PExtentVector logical
;
1741 auto cct
= coll
->store
->cct
;
1742 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
1743 << std::dec
<< " " << *this << dendl
;
1745 bool empty
= used_in_blob
.put(
1750 // nothing to release
1751 if (!empty
&& logical
.empty()) {
1755 bluestore_blob_t
& b
= dirty_blob();
1756 return b
.release_extents(empty
, logical
, r
);
1759 bool BlueStore::Blob::try_reuse_blob(uint32_t min_alloc_size
,
1760 uint32_t target_blob_size
,
1762 uint32_t *length0
) {
1763 assert(min_alloc_size
);
1764 assert(target_blob_size
);
1765 if (!get_blob().is_mutable()) {
1769 uint32_t length
= *length0
;
1770 uint32_t end
= b_offset
+ length
;
1772 // Currently for the sake of simplicity we omit blob reuse if data is
1773 // unaligned with csum chunk. Later we can perform padding if needed.
1774 if (get_blob().has_csum() &&
1775 ((b_offset
% get_blob().get_csum_chunk_size()) != 0 ||
1776 (end
% get_blob().get_csum_chunk_size()) != 0)) {
1780 auto blen
= get_blob().get_logical_length();
1781 uint32_t new_blen
= blen
;
1783 // make sure target_blob_size isn't less than current blob len
1784 target_blob_size
= MAX(blen
, target_blob_size
);
1786 if (b_offset
>= blen
) {
1787 //new data totally stands out of the existing blob
1788 new_blen
= b_offset
+ length
;
1790 //new data overlaps with the existing blob
1791 new_blen
= MAX(blen
, length
+ b_offset
);
1792 if (!get_blob().is_unallocated(
1794 new_blen
> blen
? blen
- b_offset
: length
)) {
1798 if (new_blen
> blen
) {
1799 int64_t overflow
= int64_t(new_blen
) - target_blob_size
;
1800 // Unable to decrease the provided length to fit into max_blob_size
1801 if (overflow
>= length
) {
1805 // FIXME: in some cases we could reduce unused resolution
1806 if (get_blob().has_unused()) {
1811 new_blen
-= overflow
;
1815 if (new_blen
> blen
) {
1816 dirty_blob().add_tail(new_blen
);
1817 used_in_blob
.add_tail(new_blen
,
1818 blob
.get_release_size(min_alloc_size
));
1824 void BlueStore::Blob::split(Collection
*coll
, uint32_t blob_offset
, Blob
*r
)
1826 auto cct
= coll
->store
->cct
; //used by dout
1827 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
1828 << " start " << *this << dendl
;
1829 assert(blob
.can_split());
1830 assert(used_in_blob
.can_split());
1831 bluestore_blob_t
&lb
= dirty_blob();
1832 bluestore_blob_t
&rb
= r
->dirty_blob();
1836 &(r
->used_in_blob
));
1838 lb
.split(blob_offset
, rb
);
1839 shared_blob
->bc
.split(shared_blob
->get_cache(), blob_offset
, r
->shared_blob
->bc
);
1841 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
1842 << " finish " << *this << dendl
;
1843 dout(10) << __func__
<< " 0x" << std::hex
<< blob_offset
<< std::dec
1844 << " and " << *r
<< dendl
;
1847 #ifndef CACHE_BLOB_BL
1848 void BlueStore::Blob::decode(
1850 bufferptr::iterator
& p
,
1853 bool include_ref_map
)
1855 denc(blob
, p
, struct_v
);
1856 if (blob
.is_shared()) {
1859 if (include_ref_map
) {
1861 used_in_blob
.decode(p
);
1863 used_in_blob
.clear();
1864 bluestore_extent_ref_map_t legacy_ref_map
;
1865 legacy_ref_map
.decode(p
);
1866 for (auto r
: legacy_ref_map
.ref_map
) {
1870 r
.second
.refs
* r
.second
.length
);
1879 ostream
& operator<<(ostream
& out
, const BlueStore::Extent
& e
)
1881 return out
<< std::hex
<< "0x" << e
.logical_offset
<< "~" << e
.length
1882 << ": 0x" << e
.blob_offset
<< "~" << e
.length
<< std::dec
1887 BlueStore::OldExtent
* BlueStore::OldExtent::create(CollectionRef c
,
1892 OldExtent
* oe
= new OldExtent(lo
, o
, l
, b
);
1893 b
->put_ref(c
.get(), o
, l
, &(oe
->r
));
1894 oe
->blob_empty
= b
->get_referenced_bytes() == 0;
1901 #define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
1903 BlueStore::ExtentMap::ExtentMap(Onode
*o
)
1906 o
->c
->store
->cct
->_conf
->bluestore_extent_map_inline_shard_prealloc_size
) {
1909 void BlueStore::ExtentMap::update(KeyValueDB::Transaction t
,
1912 auto cct
= onode
->c
->store
->cct
; //used by dout
1913 dout(20) << __func__
<< " " << onode
->oid
<< (force
? " force" : "") << dendl
;
1914 if (onode
->onode
.extent_map_shards
.empty()) {
1915 if (inline_bl
.length() == 0) {
1917 // we need to encode inline_bl to measure encoded length
1918 bool never_happen
= encode_some(0, OBJECT_MAX_SIZE
, inline_bl
, &n
);
1919 assert(!never_happen
);
1920 size_t len
= inline_bl
.length();
1921 dout(20) << __func__
<< " inline shard " << len
<< " bytes from " << n
1922 << " extents" << dendl
;
1923 if (!force
&& len
> cct
->_conf
->bluestore_extent_map_shard_max_size
) {
1924 request_reshard(0, OBJECT_MAX_SIZE
);
1928 // will persist in the onode key.
1930 // pending shard update
1931 struct dirty_shard_t
{
1934 dirty_shard_t(Shard
*s
) : shard(s
) {}
1936 vector
<dirty_shard_t
> encoded_shards
;
1937 // allocate slots for all shards in a single call instead of
1938 // doing multiple allocations - one per each dirty shard
1939 encoded_shards
.reserve(shards
.size());
1941 auto p
= shards
.begin();
1943 while (p
!= shards
.end()) {
1944 assert(p
->shard_info
->offset
>= prev_p
->shard_info
->offset
);
1949 if (n
== shards
.end()) {
1950 endoff
= OBJECT_MAX_SIZE
;
1952 endoff
= n
->shard_info
->offset
;
1954 encoded_shards
.emplace_back(dirty_shard_t(&(*p
)));
1955 bufferlist
& bl
= encoded_shards
.back().bl
;
1956 if (encode_some(p
->shard_info
->offset
, endoff
- p
->shard_info
->offset
,
1959 derr
<< __func__
<< " encode_some needs reshard" << dendl
;
1963 size_t len
= bl
.length();
1965 dout(20) << __func__
<< " shard 0x" << std::hex
1966 << p
->shard_info
->offset
<< std::dec
<< " is " << len
1967 << " bytes (was " << p
->shard_info
->bytes
<< ") from "
1968 << p
->extents
<< " extents" << dendl
;
1971 if (len
> cct
->_conf
->bluestore_extent_map_shard_max_size
) {
1972 // we are big; reshard ourselves
1973 request_reshard(p
->shard_info
->offset
, endoff
);
1975 // avoid resharding the trailing shard, even if it is small
1976 else if (n
!= shards
.end() &&
1977 len
< g_conf
->bluestore_extent_map_shard_min_size
) {
1978 assert(endoff
!= OBJECT_MAX_SIZE
);
1979 if (p
== shards
.begin()) {
1980 // we are the first shard, combine with next shard
1981 request_reshard(p
->shard_info
->offset
, endoff
+ 1);
1983 // combine either with the previous shard or the next,
1984 // whichever is smaller
1985 if (prev_p
->shard_info
->bytes
> n
->shard_info
->bytes
) {
1986 request_reshard(p
->shard_info
->offset
, endoff
+ 1);
1988 request_reshard(prev_p
->shard_info
->offset
, endoff
);
1997 if (needs_reshard()) {
2001 // schedule DB update for dirty shards
2003 for (auto& it
: encoded_shards
) {
2004 it
.shard
->dirty
= false;
2005 it
.shard
->shard_info
->bytes
= it
.bl
.length();
2006 generate_extent_shard_key_and_apply(
2008 it
.shard
->shard_info
->offset
,
2010 [&](const string
& final_key
) {
2011 t
->set(PREFIX_OBJ
, final_key
, it
.bl
);
// allocate_spanning_blob_id — pick an id not yet used by any spanning
// blob. Fast path: one past the highest id in use. Otherwise probe from
// a pseudo-random starting id until a free one (or full wrap-around).
// NOTE(review): garbled extraction — the early return, the do/while
// header and the increment step were dropped from this view; comments
// describe only the surviving tokens.
2018 bid_t
BlueStore::ExtentMap::allocate_spanning_blob_id()
2020 if (spanning_blob_map
.empty())
// The map is ordered, so rbegin() holds the maximum id in use.
2022 bid_t bid
= spanning_blob_map
.rbegin()->first
+ 1;
2023 // bid is valid and available.
2026 // Find next unused bid;
// Random starting probe; bid_t is signed, hence the clamp below.
2027 bid
= rand() % (numeric_limits
<bid_t
>::max() + 1);
2028 const auto begin_bid
= bid
;
2030 if (!spanning_blob_map
.count(bid
))
// Wrap negative ids back to zero while scanning.
2034 if (bid
< 0) bid
= 0;
// Returning to the starting probe means the id space is exhausted.
2036 } while (bid
!= begin_bid
);
2037 assert(0 == "no available blob id");
2040 void BlueStore::ExtentMap::reshard(
2042 KeyValueDB::Transaction t
)
2044 auto cct
= onode
->c
->store
->cct
; // used by dout
2046 dout(10) << __func__
<< " 0x[" << std::hex
<< needs_reshard_begin
<< ","
2047 << needs_reshard_end
<< ")" << std::dec
2048 << " of " << onode
->onode
.extent_map_shards
.size()
2049 << " shards on " << onode
->oid
<< dendl
;
2050 for (auto& p
: spanning_blob_map
) {
2051 dout(20) << __func__
<< " spanning blob " << p
.first
<< " " << *p
.second
2054 // determine shard index range
2055 unsigned si_begin
= 0, si_end
= 0;
2056 if (!shards
.empty()) {
2057 while (si_begin
+ 1 < shards
.size() &&
2058 shards
[si_begin
+ 1].shard_info
->offset
<= needs_reshard_begin
) {
2061 needs_reshard_begin
= shards
[si_begin
].shard_info
->offset
;
2062 for (si_end
= si_begin
; si_end
< shards
.size(); ++si_end
) {
2063 if (shards
[si_end
].shard_info
->offset
>= needs_reshard_end
) {
2064 needs_reshard_end
= shards
[si_end
].shard_info
->offset
;
2068 if (si_end
== shards
.size()) {
2069 needs_reshard_end
= OBJECT_MAX_SIZE
;
2071 dout(20) << __func__
<< " shards [" << si_begin
<< "," << si_end
<< ")"
2072 << " over 0x[" << std::hex
<< needs_reshard_begin
<< ","
2073 << needs_reshard_end
<< ")" << std::dec
<< dendl
;
2076 fault_range(db
, needs_reshard_begin
, needs_reshard_end
);
2078 // we may need to fault in a larger interval later must have all
2079 // referring extents for spanning blobs loaded in order to have
2080 // accurate use_tracker values.
2081 uint32_t spanning_scan_begin
= needs_reshard_begin
;
2082 uint32_t spanning_scan_end
= needs_reshard_end
;
2086 for (unsigned i
= si_begin
; i
< si_end
; ++i
) {
2087 generate_extent_shard_key_and_apply(
2088 onode
->key
, shards
[i
].shard_info
->offset
, &key
,
2089 [&](const string
& final_key
) {
2090 t
->rmkey(PREFIX_OBJ
, final_key
);
2095 // calculate average extent size
2097 unsigned extents
= 0;
2098 if (onode
->onode
.extent_map_shards
.empty()) {
2099 bytes
= inline_bl
.length();
2100 extents
= extent_map
.size();
2102 for (unsigned i
= si_begin
; i
< si_end
; ++i
) {
2103 bytes
+= shards
[i
].shard_info
->bytes
;
2104 extents
+= shards
[i
].extents
;
2107 unsigned target
= cct
->_conf
->bluestore_extent_map_shard_target_size
;
2108 unsigned slop
= target
*
2109 cct
->_conf
->bluestore_extent_map_shard_target_size_slop
;
2110 unsigned extent_avg
= bytes
/ MAX(1, extents
);
2111 dout(20) << __func__
<< " extent_avg " << extent_avg
<< ", target " << target
2112 << ", slop " << slop
<< dendl
;
2115 unsigned estimate
= 0;
2116 unsigned offset
= needs_reshard_begin
;
2117 vector
<bluestore_onode_t::shard_info
> new_shard_info
;
2118 unsigned max_blob_end
= 0;
2119 Extent
dummy(needs_reshard_begin
);
2120 for (auto e
= extent_map
.lower_bound(dummy
);
2121 e
!= extent_map
.end();
2123 if (e
->logical_offset
>= needs_reshard_end
) {
2126 dout(30) << " extent " << *e
<< dendl
;
2128 // disfavor shard boundaries that span a blob
2129 bool would_span
= (e
->logical_offset
< max_blob_end
) || e
->blob_offset
;
2131 estimate
+ extent_avg
> target
+ (would_span
? slop
: 0)) {
2133 if (offset
== needs_reshard_begin
) {
2134 new_shard_info
.emplace_back(bluestore_onode_t::shard_info());
2135 new_shard_info
.back().offset
= offset
;
2136 dout(20) << __func__
<< " new shard 0x" << std::hex
<< offset
2137 << std::dec
<< dendl
;
2139 offset
= e
->logical_offset
;
2140 new_shard_info
.emplace_back(bluestore_onode_t::shard_info());
2141 new_shard_info
.back().offset
= offset
;
2142 dout(20) << __func__
<< " new shard 0x" << std::hex
<< offset
2143 << std::dec
<< dendl
;
2146 estimate
+= extent_avg
;
2147 unsigned bs
= e
->blob_start();
2148 if (bs
< spanning_scan_begin
) {
2149 spanning_scan_begin
= bs
;
2151 uint32_t be
= e
->blob_end();
2152 if (be
> max_blob_end
) {
2155 if (be
> spanning_scan_end
) {
2156 spanning_scan_end
= be
;
2159 if (new_shard_info
.empty() && (si_begin
> 0 ||
2160 si_end
< shards
.size())) {
2161 // we resharded a partial range; we must produce at least one output
2163 new_shard_info
.emplace_back(bluestore_onode_t::shard_info());
2164 new_shard_info
.back().offset
= needs_reshard_begin
;
2165 dout(20) << __func__
<< " new shard 0x" << std::hex
<< needs_reshard_begin
2166 << std::dec
<< " (singleton degenerate case)" << dendl
;
2169 auto& sv
= onode
->onode
.extent_map_shards
;
2170 dout(20) << __func__
<< " new " << new_shard_info
<< dendl
;
2171 dout(20) << __func__
<< " old " << sv
<< dendl
;
2173 // no old shards to keep
2174 sv
.swap(new_shard_info
);
2175 init_shards(true, true);
2177 // splice in new shards
2178 sv
.erase(sv
.begin() + si_begin
, sv
.begin() + si_end
);
2179 shards
.erase(shards
.begin() + si_begin
, shards
.begin() + si_end
);
2181 sv
.begin() + si_begin
,
2182 new_shard_info
.begin(),
2183 new_shard_info
.end());
2184 shards
.insert(shards
.begin() + si_begin
, new_shard_info
.size(), Shard());
2185 si_end
= si_begin
+ new_shard_info
.size();
2187 assert(sv
.size() == shards
.size());
2189 // note that we need to update every shard_info of shards here,
2190 // as sv might have been totally re-allocated above
2191 for (unsigned i
= 0; i
< shards
.size(); i
++) {
2192 shards
[i
].shard_info
= &sv
[i
];
2195 // mark newly added shards as dirty
2196 for (unsigned i
= si_begin
; i
< si_end
; ++i
) {
2197 shards
[i
].loaded
= true;
2198 shards
[i
].dirty
= true;
2201 dout(20) << __func__
<< " fin " << sv
<< dendl
;
2205 // no more shards; unspan all previously spanning blobs
2206 auto p
= spanning_blob_map
.begin();
2207 while (p
!= spanning_blob_map
.end()) {
2209 dout(30) << __func__
<< " un-spanning " << *p
->second
<< dendl
;
2210 p
= spanning_blob_map
.erase(p
);
2213 // identify new spanning blobs
2214 dout(20) << __func__
<< " checking spanning blobs 0x[" << std::hex
2215 << spanning_scan_begin
<< "," << spanning_scan_end
<< ")" << dendl
;
2216 if (spanning_scan_begin
< needs_reshard_begin
) {
2217 fault_range(db
, spanning_scan_begin
,
2218 needs_reshard_begin
- spanning_scan_begin
);
2220 if (spanning_scan_end
> needs_reshard_end
) {
2221 fault_range(db
, needs_reshard_end
,
2222 spanning_scan_end
- needs_reshard_end
);
2224 auto sp
= sv
.begin() + si_begin
;
2225 auto esp
= sv
.end();
2226 unsigned shard_start
= sp
->offset
;
2230 shard_end
= OBJECT_MAX_SIZE
;
2232 shard_end
= sp
->offset
;
2234 Extent
dummy(needs_reshard_begin
);
2235 for (auto e
= extent_map
.lower_bound(dummy
); e
!= extent_map
.end(); ++e
) {
2236 if (e
->logical_offset
>= needs_reshard_end
) {
2239 dout(30) << " extent " << *e
<< dendl
;
2240 while (e
->logical_offset
>= shard_end
) {
2241 shard_start
= shard_end
;
2245 shard_end
= OBJECT_MAX_SIZE
;
2247 shard_end
= sp
->offset
;
2249 dout(30) << __func__
<< " shard 0x" << std::hex
<< shard_start
2250 << " to 0x" << shard_end
<< std::dec
<< dendl
;
2252 if (e
->blob_escapes_range(shard_start
, shard_end
- shard_start
)) {
2253 if (!e
->blob
->is_spanning()) {
2254 // We have two options: (1) split the blob into pieces at the
2255 // shard boundaries (and adjust extents accordingly), or (2)
2256 // mark it spanning. We prefer to cut the blob if we can. Note that
2257 // we may have to split it multiple times--potentially at every
2259 bool must_span
= false;
2260 BlobRef b
= e
->blob
;
2261 if (b
->can_split()) {
2262 uint32_t bstart
= e
->blob_start();
2263 uint32_t bend
= e
->blob_end();
2264 for (const auto& sh
: shards
) {
2265 if (bstart
< sh
.shard_info
->offset
&&
2266 bend
> sh
.shard_info
->offset
) {
2267 uint32_t blob_offset
= sh
.shard_info
->offset
- bstart
;
2268 if (b
->can_split_at(blob_offset
)) {
2269 dout(20) << __func__
<< " splitting blob, bstart 0x"
2270 << std::hex
<< bstart
<< " blob_offset 0x"
2271 << blob_offset
<< std::dec
<< " " << *b
<< dendl
;
2272 b
= split_blob(b
, blob_offset
, sh
.shard_info
->offset
);
2273 // switch b to the new right-hand side, in case it
2274 // *also* has to get split.
2275 bstart
+= blob_offset
;
2276 onode
->c
->store
->logger
->inc(l_bluestore_blob_split
);
2287 auto bid
= allocate_spanning_blob_id();
2289 spanning_blob_map
[b
->id
] = b
;
2290 dout(20) << __func__
<< " adding spanning " << *b
<< dendl
;
2294 if (e
->blob
->is_spanning()) {
2295 spanning_blob_map
.erase(e
->blob
->id
);
2297 dout(30) << __func__
<< " un-spanning " << *e
->blob
<< dendl
;
2303 clear_needs_reshard();
2306 bool BlueStore::ExtentMap::encode_some(
2312 auto cct
= onode
->c
->store
->cct
; //used by dout
2313 Extent
dummy(offset
);
2314 auto start
= extent_map
.lower_bound(dummy
);
2315 uint32_t end
= offset
+ length
;
2317 __u8 struct_v
= 2; // Version 2 differs from v1 in blob's ref_map
2318 // serialization only. Hence there is no specific
2319 // handling at ExtentMap level.
2323 bool must_reshard
= false;
2324 for (auto p
= start
;
2325 p
!= extent_map
.end() && p
->logical_offset
< end
;
2327 assert(p
->logical_offset
>= offset
);
2328 p
->blob
->last_encoded_id
= -1;
2329 if (!p
->blob
->is_spanning() && p
->blob_escapes_range(offset
, length
)) {
2330 dout(30) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2331 << std::dec
<< " hit new spanning blob " << *p
<< dendl
;
2332 request_reshard(p
->blob_start(), p
->blob_end());
2333 must_reshard
= true;
2335 if (!must_reshard
) {
2336 denc_varint(0, bound
); // blobid
2337 denc_varint(0, bound
); // logical_offset
2338 denc_varint(0, bound
); // len
2339 denc_varint(0, bound
); // blob_offset
2341 p
->blob
->bound_encode(
2344 p
->blob
->shared_blob
->get_sbid(),
2352 denc(struct_v
, bound
);
2353 denc_varint(0, bound
); // number of extents
2356 auto app
= bl
.get_contiguous_appender(bound
);
2357 denc(struct_v
, app
);
2358 denc_varint(n
, app
);
2365 uint64_t prev_len
= 0;
2366 for (auto p
= start
;
2367 p
!= extent_map
.end() && p
->logical_offset
< end
;
2370 bool include_blob
= false;
2371 if (p
->blob
->is_spanning()) {
2372 blobid
= p
->blob
->id
<< BLOBID_SHIFT_BITS
;
2373 blobid
|= BLOBID_FLAG_SPANNING
;
2374 } else if (p
->blob
->last_encoded_id
< 0) {
2375 p
->blob
->last_encoded_id
= n
+ 1; // so it is always non-zero
2376 include_blob
= true;
2377 blobid
= 0; // the decoder will infer the id from n
2379 blobid
= p
->blob
->last_encoded_id
<< BLOBID_SHIFT_BITS
;
2381 if (p
->logical_offset
== pos
) {
2382 blobid
|= BLOBID_FLAG_CONTIGUOUS
;
2384 if (p
->blob_offset
== 0) {
2385 blobid
|= BLOBID_FLAG_ZEROOFFSET
;
2387 if (p
->length
== prev_len
) {
2388 blobid
|= BLOBID_FLAG_SAMELENGTH
;
2390 prev_len
= p
->length
;
2392 denc_varint(blobid
, app
);
2393 if ((blobid
& BLOBID_FLAG_CONTIGUOUS
) == 0) {
2394 denc_varint_lowz(p
->logical_offset
- pos
, app
);
2396 if ((blobid
& BLOBID_FLAG_ZEROOFFSET
) == 0) {
2397 denc_varint_lowz(p
->blob_offset
, app
);
2399 if ((blobid
& BLOBID_FLAG_SAMELENGTH
) == 0) {
2400 denc_varint_lowz(p
->length
, app
);
2402 pos
= p
->logical_end();
2404 p
->blob
->encode(app
, struct_v
, p
->blob
->shared_blob
->get_sbid(), false);
2408 /*derr << __func__ << bl << dendl;
2409 derr << __func__ << ":";
2416 unsigned BlueStore::ExtentMap::decode_some(bufferlist
& bl
)
2418 auto cct
= onode
->c
->store
->cct
; //used by dout
2420 derr << __func__ << ":";
2425 assert(bl
.get_num_buffers() <= 1);
2426 auto p
= bl
.front().begin_deep();
2429 // Version 2 differs from v1 in blob's ref_map
2430 // serialization only. Hence there is no specific
2431 // handling at ExtentMap level below.
2432 assert(struct_v
== 1 || struct_v
== 2);
2435 denc_varint(num
, p
);
2436 vector
<BlobRef
> blobs(num
);
2438 uint64_t prev_len
= 0;
2442 Extent
*le
= new Extent();
2444 denc_varint(blobid
, p
);
2445 if ((blobid
& BLOBID_FLAG_CONTIGUOUS
) == 0) {
2447 denc_varint_lowz(gap
, p
);
2450 le
->logical_offset
= pos
;
2451 if ((blobid
& BLOBID_FLAG_ZEROOFFSET
) == 0) {
2452 denc_varint_lowz(le
->blob_offset
, p
);
2454 le
->blob_offset
= 0;
2456 if ((blobid
& BLOBID_FLAG_SAMELENGTH
) == 0) {
2457 denc_varint_lowz(prev_len
, p
);
2459 le
->length
= prev_len
;
2461 if (blobid
& BLOBID_FLAG_SPANNING
) {
2462 dout(30) << __func__
<< " getting spanning blob "
2463 << (blobid
>> BLOBID_SHIFT_BITS
) << dendl
;
2464 le
->assign_blob(get_spanning_blob(blobid
>> BLOBID_SHIFT_BITS
));
2466 blobid
>>= BLOBID_SHIFT_BITS
;
2468 le
->assign_blob(blobs
[blobid
- 1]);
2471 Blob
*b
= new Blob();
2473 b
->decode(onode
->c
, p
, struct_v
, &sbid
, false);
2475 onode
->c
->open_shared_blob(sbid
, b
);
2478 // we build ref_map dynamically for non-spanning blobs
2486 extent_map
.insert(*le
);
2493 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p
)
2495 // Version 2 differs from v1 in blob's ref_map
2496 // serialization only. Hence there is no specific
2497 // handling at ExtentMap level.
2501 denc_varint((uint32_t)0, p
);
2502 size_t key_size
= 0;
2503 denc_varint((uint32_t)0, key_size
);
2504 p
+= spanning_blob_map
.size() * key_size
;
2505 for (const auto& i
: spanning_blob_map
) {
2506 i
.second
->bound_encode(p
, struct_v
, i
.second
->shared_blob
->get_sbid(), true);
// Serialize the spanning blob map (blobs that cross extent-map shard
// boundaries) into the contiguous appender p: count, then (id, blob) pairs.
2510 void BlueStore::ExtentMap::encode_spanning_blobs(
2511 bufferlist::contiguous_appender
& p
)
2513 // Version 2 differs from v1 in blob's ref_map
2514 // serialization only. Hence there is no specific
2515 // handling at ExtentMap level.
// Number of spanning blobs first.
2519 denc_varint(spanning_blob_map
.size(), p
);
2520 for (auto& i
: spanning_blob_map
) {
// Blob id key, then the blob payload (ref_map included: last arg true).
2521 denc_varint(i
.second
->id
, p
);
2522 i
.second
->encode(p
, struct_v
, i
.second
->shared_blob
->get_sbid(), true);
// Inverse of encode_spanning_blobs(): rebuild spanning_blob_map from p and
// re-open each blob's shared blob in the owning collection.
2526 void BlueStore::ExtentMap::decode_spanning_blobs(
2527 bufferptr::iterator
& p
)
2531 // Version 2 differs from v1 in blob's ref_map
2532 // serialization only. Hence there is no specific
2533 // handling at ExtentMap level.
// Only on-disk versions 1 and 2 are understood.
2534 assert(struct_v
== 1 || struct_v
== 2);
2539 BlobRef
b(new Blob());
2540 denc_varint(b
->id
, p
);
// Register the blob under its id before decoding the payload.
2541 spanning_blob_map
[b
->id
] = b
;
2543 b
->decode(onode
->c
, p
, struct_v
, &sbid
, true);
// Attach (or create) the SharedBlob instance for sbid.
2544 onode
->c
->open_shared_blob(sbid
, b
);
// (Re)build the in-memory shard array from the onode's persistent
// extent_map_shards, marking every shard with the given loaded/dirty state.
// NOTE(review): the loop index i is incremented on a line not visible in this
// sampled view.
2548 void BlueStore::ExtentMap::init_shards(bool loaded
, bool dirty
)
2550 shards
.resize(onode
->onode
.extent_map_shards
.size());
2552 for (auto &s
: onode
->onode
.extent_map_shards
) {
// Each in-memory shard points at its persistent shard_info entry.
2553 shards
[i
].shard_info
= &s
;
2554 shards
[i
].loaded
= loaded
;
2555 shards
[i
].dirty
= dirty
;
// Ensure every extent-map shard overlapping [offset, offset+length) is loaded
// from the KV store (PREFIX_OBJ) into memory, updating shard hit/miss
// perf counters.
2560 void BlueStore::ExtentMap::fault_range(
2565 auto cct
= onode
->c
->store
->cct
; //used by dout
2566 dout(30) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2567 << std::dec
<< dendl
;
// Shard indices covering the requested logical range.
2568 auto start
= seek_shard(offset
);
2569 auto last
= seek_shard(offset
+ length
);
2574 assert(last
>= start
);
2576 while (start
<= last
) {
2577 assert((size_t)start
< shards
.size());
2578 auto p
= &shards
[start
];
// Shard not yet loaded (branch condition not visible in this sampled view):
// fetch its serialized extents from the database.
2580 dout(30) << __func__
<< " opening shard 0x" << std::hex
2581 << p
->shard_info
->offset
<< std::dec
<< dendl
;
2583 generate_extent_shard_key_and_apply(
2584 onode
->key
, p
->shard_info
->offset
, &key
,
2585 [&](const string
& final_key
) {
2586 int r
= db
->get(PREFIX_OBJ
, final_key
, &v
);
// Missing shard key is fatal: metadata corruption.
2588 derr
<< __func__
<< " missing shard 0x" << std::hex
2589 << p
->shard_info
->offset
<< std::dec
<< " for " << onode
->oid
2595 p
->extents
= decode_some(v
);
2597 dout(20) << __func__
<< " open shard 0x" << std::hex
2598 << p
->shard_info
->offset
<< std::dec
2599 << " (" << v
.length() << " bytes)" << dendl
;
// A freshly loaded shard must be clean and match its recorded size.
2600 assert(p
->dirty
== false);
2601 assert(v
.length() == p
->shard_info
->bytes
);
2602 onode
->c
->store
->logger
->inc(l_bluestore_onode_shard_misses
);
// Shard already resident: count a cache hit.
2604 onode
->c
->store
->logger
->inc(l_bluestore_onode_shard_hits
);
// Mark every shard overlapping [offset, offset+length) dirty so it is
// re-encoded on the next update.  Shards must already be loaded
// (fault_range) — marking an unloaded shard dirty is a fatal logic error.
2610 void BlueStore::ExtentMap::dirty_range(
2614 auto cct
= onode
->c
->store
->cct
; //used by dout
2615 dout(30) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2616 << std::dec
<< dendl
;
// No shards means the extent map is stored inline in the onode.
2617 if (shards
.empty()) {
2618 dout(20) << __func__
<< " mark inline shard dirty" << dendl
;
2622 auto start
= seek_shard(offset
);
2623 auto last
= seek_shard(offset
+ length
);
2627 assert(last
>= start
);
2628 while (start
<= last
) {
2629 assert((size_t)start
< shards
.size());
2630 auto p
= &shards
[start
];
// Dirtying a shard that was never faulted in would lose updates.
2632 dout(20) << __func__
<< " shard 0x" << std::hex
<< p
->shard_info
->offset
2633 << std::dec
<< " is not loaded, can't mark dirty" << dendl
;
2634 assert(0 == "can't mark unloaded shard dirty");
2637 dout(20) << __func__
<< " mark shard 0x" << std::hex
2638 << p
->shard_info
->offset
<< std::dec
<< " dirty" << dendl
;
// Exact lookup of an extent by logical offset; returns extent_map.end() if
// no extent starts exactly at offset.  Uses a stack dummy as the search key.
2645 BlueStore::extent_map_t::iterator
BlueStore::ExtentMap::find(
2648 Extent
dummy(offset
);
2649 return extent_map
.find(dummy
);
// Return an iterator to the first extent whose logical range may include or
// follow offset: lower_bound, then step back one if the previous extent
// still covers offset (its logical_end() is checked below).
2652 BlueStore::extent_map_t::iterator
BlueStore::ExtentMap::seek_lextent(
2655 Extent
dummy(offset
);
2656 auto fp
= extent_map
.lower_bound(dummy
);
2657 if (fp
!= extent_map
.begin()) {
// After stepping back: if that extent ends at or before offset, it does not
// cover offset and the original lower_bound position is restored.
2659 if (fp
->logical_end() <= offset
) {
// Const overload of seek_lextent(); identical positioning logic on a
// read-only extent map.
2666 BlueStore::extent_map_t::const_iterator
BlueStore::ExtentMap::seek_lextent(
2667 uint64_t offset
) const
2669 Extent
dummy(offset
);
2670 auto fp
= extent_map
.lower_bound(dummy
);
2671 if (fp
!= extent_map
.begin()) {
// Previous extent ending at/before offset does not cover it; advance back.
2673 if (fp
->logical_end() <= offset
) {
// Return whether any logical extent overlaps [offset, offset+length).
2680 bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset
, uint64_t length
)
2682 auto fp
= seek_lextent(offset
);
// No extent, or the first candidate starts past the queried range -> false.
2683 if (fp
== extent_map
.end() || fp
->logical_offset
>= offset
+ length
) {
// Merge adjacent extents in [offset, offset+length) that reference the same
// blob contiguously (same blob, touching logical and blob offsets), without
// ever merging across a shard boundary.  Returns the number of extents
// removed (reflected in the l_bluestore_extent_compress counter).
2689 int BlueStore::ExtentMap::compress_extent_map(
2693 auto cct
= onode
->c
->store
->cct
; //used by dout
2694 if (extent_map
.empty())
2697 auto p
= seek_lextent(offset
);
2698 if (p
!= extent_map
.begin()) {
2699 --p
; // start to the left of offset
2701 // the caller should have just written to this region
2702 assert(p
!= extent_map
.end());
2704 // identify the *next* shard
2705 auto pshard
= shards
.begin();
2706 while (pshard
!= shards
.end() &&
2707 p
->logical_offset
>= pshard
->shard_info
->offset
) {
// shard_end is the first logical offset we must not merge across.
2711 if (pshard
!= shards
.end()) {
2712 shard_end
= pshard
->shard_info
->offset
;
2714 shard_end
= OBJECT_MAX_SIZE
;
// Walk extent pairs (p, n); n trails one ahead of p.
2718 for (++n
; n
!= extent_map
.end(); p
= n
++) {
2719 if (n
->logical_offset
> offset
+ length
) {
2720 break; // stop after end
// Greedily absorb n into p while they are logically and blob-contiguous
// and n does not cross into the next shard.
2722 while (n
!= extent_map
.end() &&
2723 p
->logical_end() == n
->logical_offset
&&
2724 p
->blob
== n
->blob
&&
2725 p
->blob_offset
+ p
->length
== n
->blob_offset
&&
2726 n
->logical_offset
< shard_end
) {
2727 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2728 << " next shard 0x" << shard_end
<< std::dec
2729 << " merging " << *p
<< " and " << *n
<< dendl
;
2730 p
->length
+= n
->length
;
2734 if (n
== extent_map
.end()) {
// Crossed into the next shard: advance pshard and recompute shard_end.
2737 if (n
->logical_offset
>= shard_end
) {
2738 assert(pshard
!= shards
.end());
2740 if (pshard
!= shards
.end()) {
2741 shard_end
= pshard
->shard_info
->offset
;
2743 shard_end
= OBJECT_MAX_SIZE
;
// Record how many extents were eliminated.
2747 if (removed
&& onode
) {
2748 onode
->c
->store
->logger
->inc(l_bluestore_extent_compress
, removed
);
// Remove the logical range [offset, offset+length) from the extent map.
// Extents wholly inside the range are dropped; extents straddling either
// edge are trimmed or split.  Every removed/trimmed piece is recorded in
// *old_extents so the caller can release the underlying blob references.
2753 void BlueStore::ExtentMap::punch_hole(
2757 old_extent_map_t
*old_extents
)
2759 auto p
= seek_lextent(offset
);
2760 uint64_t end
= offset
+ length
;
2761 while (p
!= extent_map
.end()) {
// Past the hole: nothing further overlaps.
2762 if (p
->logical_offset
>= end
) {
// Extent starts before the hole.
2765 if (p
->logical_offset
< offset
) {
2766 if (p
->logical_end() > end
) {
2767 // split and deref middle
2768 uint64_t front
= offset
- p
->logical_offset
;
2769 OldExtent
* oe
= OldExtent::create(c
, offset
, p
->blob_offset
+ front
,
2771 old_extents
->push_back(*oe
);
// Re-add the surviving tail piece after the hole.
2773 p
->blob_offset
+ front
+ length
,
2774 p
->length
- front
- length
,
// Extent overlaps only the front edge: trim its tail.
2780 assert(p
->logical_end() > offset
); // else seek_lextent bug
2781 uint64_t keep
= offset
- p
->logical_offset
;
2782 OldExtent
* oe
= OldExtent::create(c
, offset
, p
->blob_offset
+ keep
,
2783 p
->length
- keep
, p
->blob
);
2784 old_extents
->push_back(*oe
);
// Extent starts inside the hole.
2790 if (p
->logical_offset
+ p
->length
<= end
) {
2791 // deref whole lextent
2792 OldExtent
* oe
= OldExtent::create(c
, p
->logical_offset
, p
->blob_offset
,
2793 p
->length
, p
->blob
);
2794 old_extents
->push_back(*oe
);
// Extent overlaps the back edge: deref the front part, keep the tail.
2799 uint64_t keep
= p
->logical_end() - end
;
2800 OldExtent
* oe
= OldExtent::create(c
, p
->logical_offset
, p
->blob_offset
,
2801 p
->length
- keep
, p
->blob
);
2802 old_extents
->push_back(*oe
);
2804 add(end
, p
->blob_offset
+ p
->length
- keep
, keep
, p
->blob
);
// Map [logical_offset, logical_offset+length) onto blob b at blob_offset:
// take the blob ref, punch out any existing mapping (displaced pieces go to
// *old_extents), insert the new Extent, and request a reshard if the new
// extent spans a shard boundary.  Returns the inserted extent (return stmt
// not visible in this sampled view).
2810 BlueStore::Extent
*BlueStore::ExtentMap::set_lextent(
2812 uint64_t logical_offset
,
2813 uint64_t blob_offset
, uint64_t length
, BlobRef b
,
2814 old_extent_map_t
*old_extents
)
2816 // We need to have completely initialized Blob to increment its ref counters.
2817 assert(b
->get_blob().get_logical_length() != 0);
2819 // Do get_ref prior to punch_hole to prevent from putting reused blob into
2820 // old_extents list if we overwrite the blob totally.
2821 // This might happen during WAL overwrite.
2822 b
->get_ref(onode
->c
, blob_offset
, length
);
2825 punch_hole(c
, logical_offset
, length
, old_extents
);
2828 Extent
*le
= new Extent(logical_offset
, blob_offset
, length
, b
);
2829 extent_map
.insert(*le
);
// Extents must not straddle shard boundaries; schedule a reshard if so.
2830 if (spans_shard(logical_offset
, length
)) {
2831 request_reshard(logical_offset
, logical_offset
+ length
);
// Split blob lb at blob_offset into lb (left) and a new right-hand blob rb,
// then rewrite every extent in [pos, end_pos) that referenced the right half
// of lb to reference rb instead (splitting any extent that straddles pos).
// Returns the new right-hand blob.
2836 BlueStore::BlobRef
BlueStore::ExtentMap::split_blob(
2838 uint32_t blob_offset
,
2841 auto cct
= onode
->c
->store
->cct
; //used by dout
// Logical end of the region covered by the right half of lb.
2843 uint32_t end_pos
= pos
+ lb
->get_blob().get_logical_length() - blob_offset
;
2844 dout(20) << __func__
<< " 0x" << std::hex
<< pos
<< " end 0x" << end_pos
2845 << " blob_offset 0x" << blob_offset
<< std::dec
<< " " << *lb
2847 BlobRef rb
= onode
->c
->new_blob();
2848 lb
->split(onode
->c
, blob_offset
, rb
.get());
2850 for (auto ep
= seek_lextent(pos
);
2851 ep
!= extent_map
.end() && ep
->logical_offset
< end_pos
;
// Skip extents belonging to other blobs.
2853 if (ep
->blob
!= lb
) {
// Extent straddles the split point: carve off the part at/after pos into a
// new extent that references rb at blob offset 0.
2856 if (ep
->logical_offset
< pos
) {
2858 size_t left
= pos
- ep
->logical_offset
;
2859 Extent
*ne
= new Extent(pos
, 0, ep
->length
- left
, rb
);
2860 extent_map
.insert(*ne
);
2862 dout(30) << __func__
<< " split " << *ep
<< dendl
;
2863 dout(30) << __func__
<< " to " << *ne
<< dendl
;
// Extent wholly in the right half: rebase its blob_offset onto rb.
2866 assert(ep
->blob_offset
>= blob_offset
);
2869 ep
->blob_offset
-= blob_offset
;
2870 dout(30) << __func__
<< " adjusted " << *ep
<< dendl
;
2879 #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
// Block until all in-flight transactions touching this onode complete,
// i.e. until flushing_count drains to zero (waited on under flush_lock;
// the condition-variable wait itself is on a line not visible here).
2881 void BlueStore::Onode::flush()
2883 if (flushing_count
.load()) {
2884 ldout(c
->store
->cct
, 20) << __func__
<< " cnt:" << flushing_count
<< dendl
;
2885 std::unique_lock
<std::mutex
> l(flush_lock
);
2886 while (flushing_count
.load()) {
2890 ldout(c
->store
->cct
, 20) << __func__
<< " done" << dendl
;
2893 // =======================================================
/// Checks for writes to the same pextent within a blob.
/// Returns true if the min_alloc_size-aligned range [loffs, loffs_end)
/// overlaps any already-queued write in this context (each queued write is
/// also rounded out to min_alloc_size boundaries before comparison).
2897 bool BlueStore::WriteContext::has_conflict(
2901 uint64_t min_alloc_size
)
// Caller must pass pre-aligned bounds.
2903 assert((loffs
% min_alloc_size
) == 0);
2904 assert((loffs_end
% min_alloc_size
) == 0);
// NOTE(review): `auto w` copies each write_item per iteration; `const auto&`
// would avoid the copy — confirm write_item is cheap before leaving as-is.
2905 for (auto w
: writes
) {
2907 auto loffs2
= P2ALIGN(w
.logical_offset
, min_alloc_size
);
2908 auto loffs2_end
= ROUND_UP_TO( w
.logical_offset
+ w
.length0
, min_alloc_size
);
// Standard interval-overlap test on the aligned ranges.
2909 if ((loffs
<= loffs2
&& loffs_end
> loffs2
) ||
2910 (loffs
>= loffs2
&& loffs
< loffs2_end
)) {
2918 // =======================================================
2922 #define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
// Queue a deferred write of `length` bytes at `offset` for transaction
// `seq`: discard any overlapping queued data first, record the payload in
// iomap, and account the bytes in seq_bytes.
2924 void BlueStore::DeferredBatch::prepare_write(
2926 uint64_t seq
, uint64_t offset
, uint64_t length
,
2927 bufferlist::const_iterator
& blp
)
// Later writes win: drop any previously queued data in this range.
2929 _discard(cct
, offset
, length
);
2930 auto i
= iomap
.insert(make_pair(offset
, deferred_io()));
2931 assert(i
.second
); // this should be a new insertion
2932 i
.first
->second
.seq
= seq
;
// Copy the payload out of the caller's bufferlist iterator.
2933 blp
.copy(length
, i
.first
->second
.bl
);
2934 i
.first
->second
.bl
.reassign_to_mempool(
2935 mempool::mempool_bluestore_writing_deferred
);
2936 dout(20) << __func__
<< " seq " << seq
2937 << " 0x" << std::hex
<< offset
<< "~" << length
2938 << " crc " << i
.first
->second
.bl
.crc32c(-1)
2939 << std::dec
<< dendl
;
// Track total queued bytes per transaction seq (audited in debug builds).
2940 seq_bytes
[seq
] += length
;
2941 #ifdef DEBUG_DEFERRED
// Remove any queued deferred I/O overlapping [offset, offset+length).
// Overlapping entries are trimmed (head and/or tail kept) or dropped, and
// seq_bytes accounting is adjusted to match.
2946 void BlueStore::DeferredBatch::_discard(
2947 CephContext
*cct
, uint64_t offset
, uint64_t length
)
2949 generic_dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
2950 << std::dec
<< dendl
;
2951 auto p
= iomap
.lower_bound(offset
);
// First, handle an entry starting before `offset` that extends into the
// discarded range: keep its head, and possibly a tail past the range.
2952 if (p
!= iomap
.begin()) {
2954 auto end
= p
->first
+ p
->second
.bl
.length();
2957 head
.substr_of(p
->second
.bl
, 0, offset
- p
->first
);
2958 dout(20) << __func__
<< " keep head " << p
->second
.seq
2959 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
2960 << " -> 0x" << head
.length() << std::dec
<< dendl
;
2961 auto i
= seq_bytes
.find(p
->second
.seq
);
// Entry also extends past the discarded range: re-insert the tail piece
// keyed at offset+length under the same seq.
2962 if (end
> offset
+ length
) {
2964 tail
.substr_of(p
->second
.bl
, offset
+ length
- p
->first
,
2965 end
- (offset
+ length
));
2966 dout(20) << __func__
<< " keep tail " << p
->second
.seq
2967 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
2968 << " -> 0x" << tail
.length() << std::dec
<< dendl
;
2969 auto &n
= iomap
[offset
+ length
];
2971 n
.seq
= p
->second
.seq
;
// Only `length` bytes were actually removed from this entry.
2972 i
->second
-= length
;
// No tail: everything from offset to the entry's end was removed.
2974 i
->second
-= end
- offset
;
// Shrink the original entry to just its kept head.
2976 p
->second
.bl
.swap(head
);
// Second, walk entries starting inside the discarded range.
2980 while (p
!= iomap
.end()) {
2981 if (p
->first
>= offset
+ length
) {
2984 auto i
= seq_bytes
.find(p
->second
.seq
);
2985 auto end
= p
->first
+ p
->second
.bl
.length();
// Entry extends past the range: keep only its tail, rekeyed at
// offset+length.
2986 if (end
> offset
+ length
) {
2987 unsigned drop_front
= offset
+ length
- p
->first
;
2988 unsigned keep_tail
= end
- (offset
+ length
);
2989 dout(20) << __func__
<< " truncate front " << p
->second
.seq
2990 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
2991 << " drop_front 0x" << drop_front
<< " keep_tail 0x" << keep_tail
2992 << " to 0x" << (offset
+ length
) << "~" << keep_tail
2993 << std::dec
<< dendl
;
2994 auto &s
= iomap
[offset
+ length
];
2995 s
.seq
= p
->second
.seq
;
2996 s
.bl
.substr_of(p
->second
.bl
, drop_front
, keep_tail
);
2997 i
->second
-= drop_front
;
// Entry wholly inside the range: drop it entirely.
2999 dout(20) << __func__
<< " drop " << p
->second
.seq
3000 << " 0x" << std::hex
<< p
->first
<< "~" << p
->second
.bl
.length()
3001 << std::dec
<< dendl
;
3002 i
->second
-= p
->second
.bl
.length();
// Debug-only consistency check: recompute per-seq byte totals from iomap and
// verify they match seq_bytes exactly; also verify iomap entries are
// non-overlapping (monotonically increasing pos).
3008 void BlueStore::DeferredBatch::_audit(CephContext
*cct
)
3010 map
<uint64_t,int> sb
;
3011 for (auto p
: seq_bytes
) {
3012 sb
[p
.first
] = 0; // make sure we have the same set of keys
3015 for (auto& p
: iomap
) {
// Entries must not overlap: each must start at or after the previous end.
3016 assert(p
.first
>= pos
);
3017 sb
[p
.second
.seq
] += p
.second
.bl
.length();
3018 pos
= p
.first
+ p
.second
.bl
.length();
// Recomputed totals must match the running accounting.
3020 assert(sb
== seq_bytes
);
3027 #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
// Collection constructor: binds the collection to its owning store, cache
// shard, and coll_t id (remaining initializer-list entries not visible in
// this sampled view).
3029 BlueStore::Collection::Collection(BlueStore
*ns
, Cache
*c
, coll_t cid
)
3033 lock("BlueStore::Collection::lock", true, false),
// Attach a SharedBlob to blob b.  Non-shared blobs get a fresh anonymous
// SharedBlob; shared blobs are looked up by sbid in shared_blob_set and
// created/registered there on miss.
3039 void BlueStore::Collection::open_shared_blob(uint64_t sbid
, BlobRef b
)
3041 assert(!b
->shared_blob
);
3042 const bluestore_blob_t
& blob
= b
->get_blob();
// Fast path: blob is not shared, no registry involvement.
3043 if (!blob
.is_shared()) {
3044 b
->shared_blob
= new SharedBlob(this);
3048 b
->shared_blob
= shared_blob_set
.lookup(sbid
);
3049 if (b
->shared_blob
) {
3050 ldout(store
->cct
, 10) << __func__
<< " sbid 0x" << std::hex
<< sbid
3051 << std::dec
<< " had " << *b
->shared_blob
<< dendl
;
// Registry miss: create and register a new SharedBlob for this sbid.
3053 b
->shared_blob
= new SharedBlob(sbid
, this);
3054 shared_blob_set
.add(this, b
->shared_blob
.get());
3055 ldout(store
->cct
, 10) << __func__
<< " sbid 0x" << std::hex
<< sbid
3056 << std::dec
<< " opened " << *b
->shared_blob
// Ensure sb's persistent state (ref map) is loaded from the KV store
// (PREFIX_SHARED_BLOB).  A missing key is fatal — it indicates metadata
// corruption.
3061 void BlueStore::Collection::load_shared_blob(SharedBlobRef sb
)
3063 if (!sb
->is_loaded()) {
3067 auto sbid
= sb
->get_sbid();
3068 get_shared_blob_key(sbid
, &key
);
3069 int r
= store
->db
->get(PREFIX_SHARED_BLOB
, key
, &v
);
3071 lderr(store
->cct
) << __func__
<< " sbid 0x" << std::hex
<< sbid
3072 << std::dec
<< " not found at key "
3073 << pretty_binary_string(key
) << dendl
;
3074 assert(0 == "uh oh, missing shared_blob");
// Decode the stored bluestore_shared_blob_t into a fresh persistent record.
3078 sb
->persistent
= new bluestore_shared_blob_t(sbid
);
3079 bufferlist::iterator p
= v
.begin();
3080 ::decode(*(sb
->persistent
), p
);
3081 ldout(store
->cct
, 10) << __func__
<< " sbid 0x" << std::hex
<< sbid
3082 << std::dec
<< " loaded shared_blob " << *sb
<< dendl
;
// Convert blob b to shared: set FLAG_SHARED on the on-disk blob, create its
// persistent shared record under sbid, register it in shared_blob_set, and
// seed the shared ref map from the blob's current physical extents.
3086 void BlueStore::Collection::make_blob_shared(uint64_t sbid
, BlobRef b
)
3088 ldout(store
->cct
, 10) << __func__
<< " " << *b
<< dendl
;
// Must not already have loaded shared state.
3089 assert(!b
->shared_blob
->is_loaded());
3092 bluestore_blob_t
& blob
= b
->dirty_blob();
3093 blob
.set_flag(bluestore_blob_t::FLAG_SHARED
);
3095 // update shared blob
3096 b
->shared_blob
->loaded
= true;
3097 b
->shared_blob
->persistent
= new bluestore_shared_blob_t(sbid
);
3098 shared_blob_set
.add(this, b
->shared_blob
.get());
// Take an initial shared reference for each allocated pextent.
3099 for (auto p
: blob
.get_extents()) {
3101 b
->shared_blob
->get_ref(
3106 ldout(store
->cct
, 20) << __func__
<< " now " << *b
<< dendl
;
// Reverse of make_blob_shared: unregister sb from shared_blob_set and drop
// its persistent record (the blob flag clearing happens on a line not
// visible in this sampled view).  Returns the former sbid.
3109 uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob
*sb
)
3111 ldout(store
->cct
, 10) << __func__
<< " " << *sb
<< dendl
;
3112 assert(sb
->is_loaded());
3114 uint64_t sbid
= sb
->get_sbid();
3115 shared_blob_set
.remove(sb
);
// Raw owning pointer; deleted here and id reset so sb reads as unloaded.
3117 delete sb
->persistent
;
3118 sb
->sbid_unloaded
= 0;
3119 ldout(store
->cct
, 20) << __func__
<< " now " << *sb
<< dendl
;
// Look up (or, with create=true, construct) the Onode for oid.  Checks the
// in-memory onode cache first, then the KV store; decodes the stored onode
// and its spanning blobs / inline extent map on a hit.  Requires the
// collection lock (write lock when creating).
3123 BlueStore::OnodeRef
BlueStore::Collection::get_onode(
3124 const ghobject_t
& oid
,
3127 assert(create
? lock
.is_wlocked() : lock
.is_locked());
// Sanity: a PG collection must only hold objects that hash into it.
3130 if (cid
.is_pg(&pgid
)) {
3131 if (!oid
.match(cnode
.bits
, pgid
.ps())) {
3132 lderr(store
->cct
) << __func__
<< " oid " << oid
<< " not part of "
3133 << pgid
<< " bits " << cnode
.bits
<< dendl
;
// Cache hit returns immediately (return on a line not visible here).
3138 OnodeRef o
= onode_map
.lookup(oid
);
3142 mempool::bluestore_cache_other::string key
;
3143 get_object_key(store
->cct
, oid
, &key
);
3145 ldout(store
->cct
, 20) << __func__
<< " oid " << oid
<< " key "
3146 << pretty_binary_string(key
) << dendl
;
3149 int r
= store
->db
->get(PREFIX_OBJ
, key
.c_str(), key
.size(), &v
);
3150 ldout(store
->cct
, 20) << " r " << r
<< " v.len " << v
.length() << dendl
;
// Not in the database.
3152 if (v
.length() == 0) {
3153 assert(r
== -ENOENT
);
3154 if (!store
->cct
->_conf
->bluestore_debug_misc
&&
3158 // new object, new onode
3159 on
= new Onode(this, oid
, key
);
// Loaded from the database: decode onode metadata.
3163 on
= new Onode(this, oid
, key
);
3165 bufferptr::iterator p
= v
.front().begin_deep();
3166 on
->onode
.decode(p
);
3168 // initialize extent_map
3169 on
->extent_map
.decode_spanning_blobs(p
);
// No shards -> extent map is stored inline and decoded right away;
// otherwise shard metadata is set up for lazy fault-in.
3170 if (on
->onode
.extent_map_shards
.empty()) {
3171 denc(on
->extent_map
.inline_bl
, p
);
3172 on
->extent_map
.decode_some(on
->extent_map
.inline_bl
);
3174 on
->extent_map
.init_shards(false, false);
3178 return onode_map
.add(oid
, o
);
// Move cached onodes (and their shared blobs / buffers) that belong to the
// child collection `dest` out of this collection's cache, as part of a PG
// split.  Both cache shards are locked for the duration.
3181 void BlueStore::Collection::split_cache(
3184 ldout(store
->cct
, 10) << __func__
<< " to " << dest
<< dendl
;
3186 // lock (one or both) cache shards
// std::lock + adopt_lock acquires the two (possibly identical-shard)
// recursive mutexes deadlock-free.
3187 std::lock(cache
->lock
, dest
->cache
->lock
);
3188 std::lock_guard
<std::recursive_mutex
> l(cache
->lock
, std::adopt_lock
);
3189 std::lock_guard
<std::recursive_mutex
> l2(dest
->cache
->lock
, std::adopt_lock
);
3191 int destbits
= dest
->cnode
.bits
;
3193 bool is_pg
= dest
->cid
.is_pg(&destpg
);
3196 auto p
= onode_map
.onode_map
.begin();
3197 while (p
!= onode_map
.onode_map
.end()) {
// Keep onodes that hash to the parent; move the rest.
3198 if (!p
->second
->oid
.match(destbits
, destpg
.pgid
.ps())) {
3199 // onode does not belong to this child
3202 OnodeRef o
= p
->second
;
3203 ldout(store
->cct
, 20) << __func__
<< " moving " << o
<< " " << o
->oid
// Detach from this cache/map, attach to dest's.
3206 cache
->_rm_onode(p
->second
);
3207 p
= onode_map
.onode_map
.erase(p
);
3210 dest
->cache
->_add_onode(o
, 1);
3211 dest
->onode_map
.onode_map
[o
->oid
] = o
;
3212 dest
->onode_map
.cache
= dest
->cache
;
3214 // move over shared blobs and buffers. cover shared blobs from
3215 // both extent map and spanning blob map (the full extent map
3216 // may not be faulted in)
3217 vector
<SharedBlob
*> sbvec
;
3218 for (auto& e
: o
->extent_map
.extent_map
) {
3219 sbvec
.push_back(e
.blob
->shared_blob
.get());
3221 for (auto& b
: o
->extent_map
.spanning_blob_map
) {
3222 sbvec
.push_back(b
.second
->shared_blob
.get());
3224 for (auto sb
: sbvec
) {
// A shared blob can be reached twice (extent map + spanning map);
// skip ones already re-homed.
3225 if (sb
->coll
== dest
) {
3226 ldout(store
->cct
, 20) << __func__
<< " already moved " << *sb
3230 ldout(store
->cct
, 20) << __func__
<< " moving " << *sb
<< dendl
;
// Only registered (sbid != 0) shared blobs live in shared_blob_set.
3232 if (sb
->get_sbid()) {
3233 ldout(store
->cct
, 20) << __func__
3234 << " moving registration " << *sb
<< dendl
;
3235 shared_blob_set
.remove(sb
);
3236 dest
->shared_blob_set
.add(dest
, sb
);
// Cached buffers only need moving if the two collections live on
// different cache shards; in-flight (writing) buffers stay put.
3238 if (dest
->cache
!= cache
) {
3239 for (auto& i
: sb
->bc
.buffer_map
) {
3240 if (!i
.second
->is_writing()) {
3241 ldout(store
->cct
, 20) << __func__
<< " moving " << *i
.second
3243 dest
->cache
->_move_buffer(cache
, i
.second
.get());
3252 // =======================================================
// Background cache-trim thread body: periodically estimate bytes-per-onode
// from mempool accounting, derive a per-shard cache target from the
// configured cache size and meta/data ratios, trim every shard, refresh the
// cache perf counters, then sleep for bluestore_cache_trim_interval.
3254 void *BlueStore::MempoolThread::entry()
3256 Mutex::Locker
l(lock
);
// Current metadata footprint (onode + other cache mempools).
3258 uint64_t meta_bytes
=
3259 mempool::bluestore_cache_other::allocated_bytes() +
3260 mempool::bluestore_cache_onode::allocated_bytes();
3261 uint64_t onode_num
=
3262 mempool::bluestore_cache_onode::allocated_items();
// Too few onodes to derive a meaningful average.
3264 if (onode_num
< 2) {
3268 float bytes_per_onode
= (float)meta_bytes
/ (float)onode_num
;
3269 size_t num_shards
= store
->cache_shards
.size();
3270 float target_ratio
= store
->cache_meta_ratio
+ store
->cache_data_ratio
;
3271 // A little sloppy but should be close enough
3272 uint64_t shard_target
= target_ratio
* (store
->cct
->_conf
->bluestore_cache_size
/ num_shards
);
3274 for (auto i
: store
->cache_shards
) {
3275 i
->trim(shard_target
,
3276 store
->cache_meta_ratio
,
3277 store
->cache_data_ratio
,
3281 store
->_update_cache_logger();
// Sleep until the next trim interval (lock released while waiting).
3284 wait
+= store
->cct
->_conf
->bluestore_cache_trim_interval
;
3285 cond
.WaitInterval(lock
, wait
);
3291 // =======================================================
3296 #define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
// Construct an omap iterator over onode o: compute the head/tail KV key
// bounds from the onode's nid and position the underlying DB iterator at
// the first omap key.  No-op positioning if the onode has no omap.
3298 BlueStore::OmapIteratorImpl::OmapIteratorImpl(
3299 CollectionRef c
, OnodeRef o
, KeyValueDB::Iterator it
)
3300 : c(c
), o(o
), it(it
)
3302 RWLock::RLocker
l(c
->lock
);
3303 if (o
->onode
.has_omap()) {
// head = first possible key for this nid, tail = upper bound sentinel.
3304 get_omap_key(o
->onode
.nid
, string(), &head
);
3305 get_omap_tail(o
->onode
.nid
, &tail
);
3306 it
->lower_bound(head
);
// Reposition at the first omap key; with no omap, reset the DB iterator to
// an empty handle so subsequent valid() returns false.
3310 int BlueStore::OmapIteratorImpl::seek_to_first()
3312 RWLock::RLocker
l(c
->lock
);
3313 if (o
->onode
.has_omap()) {
3314 it
->lower_bound(head
);
3316 it
= KeyValueDB::Iterator();
// Position strictly after user key `after` by translating it into the
// nid-prefixed KV keyspace; with no omap, reset to an empty iterator.
3321 int BlueStore::OmapIteratorImpl::upper_bound(const string
& after
)
3323 RWLock::RLocker
l(c
->lock
);
3324 if (o
->onode
.has_omap()) {
3326 get_omap_key(o
->onode
.nid
, after
, &key
);
3327 ldout(c
->store
->cct
,20) << __func__
<< " after " << after
<< " key "
3328 << pretty_binary_string(key
) << dendl
;
3329 it
->upper_bound(key
);
3331 it
= KeyValueDB::Iterator();
// Position at or after user key `to` (same keyspace translation as
// upper_bound above); with no omap, reset to an empty iterator.
3336 int BlueStore::OmapIteratorImpl::lower_bound(const string
& to
)
3338 RWLock::RLocker
l(c
->lock
);
3339 if (o
->onode
.has_omap()) {
3341 get_omap_key(o
->onode
.nid
, to
, &key
);
3342 ldout(c
->store
->cct
,20) << __func__
<< " to " << to
<< " key "
3343 << pretty_binary_string(key
) << dendl
;
3344 it
->lower_bound(key
);
3346 it
= KeyValueDB::Iterator();
// True while the DB iterator holds a key belonging to this onode's omap,
// i.e. it is valid and has not walked past the tail sentinel.
3351 bool BlueStore::OmapIteratorImpl::valid()
3353 RWLock::RLocker
l(c
->lock
);
3354 bool r
= o
->onode
.has_omap() && it
&& it
->valid() &&
3355 it
->raw_key().second
<= tail
;
3356 if (it
&& it
->valid()) {
3357 ldout(c
->store
->cct
,20) << __func__
<< " is at "
3358 << pretty_binary_string(it
->raw_key().second
)
// Advance the iterator one omap entry (advance and return on lines not
// visible in this sampled view); no-op error if the onode has no omap.
3364 int BlueStore::OmapIteratorImpl::next(bool validate
)
3366 RWLock::RLocker
l(c
->lock
);
3367 if (o
->onode
.has_omap()) {
3375 string
BlueStore::OmapIteratorImpl::key()
3377 RWLock::RLocker
l(c
->lock
);
3378 assert(it
->valid());
3379 string db_key
= it
->raw_key().second
;
3381 decode_omap_key(db_key
, &user_key
);
3385 bufferlist
BlueStore::OmapIteratorImpl::value()
3387 RWLock::RLocker
l(c
->lock
);
3388 assert(it
->valid());
3393 // =====================================
3396 #define dout_prefix *_dout << "bluestore(" << path << ") "
// AIO completion trampoline: the block device invokes this with the store
// (priv) and the per-I/O AioContext (priv2); dispatch to the context.
3399 static void aio_cb(void *priv
, void *priv2
)
3401 BlueStore
*store
= static_cast<BlueStore
*>(priv
);
3402 BlueStore::AioContext
*c
= static_cast<BlueStore::AioContext
*>(priv2
);
3403 c
->aio_finish(store
);
// Primary constructor: wires up throttles (deferred throttle capacity is
// base + deferred budget), background threads, registers as a config
// observer, and starts with a single cache shard.
3406 BlueStore::BlueStore(CephContext
*cct
, const string
& path
)
3407 : ObjectStore(cct
, path
),
3408 throttle_bytes(cct
, "bluestore_throttle_bytes",
3409 cct
->_conf
->bluestore_throttle_bytes
),
3410 throttle_deferred_bytes(cct
, "bluestore_throttle_deferred_bytes",
3411 cct
->_conf
->bluestore_throttle_bytes
+
3412 cct
->_conf
->bluestore_throttle_deferred_bytes
),
3413 kv_sync_thread(this),
3414 kv_finalize_thread(this),
3415 mempool_thread(this)
3418 cct
->_conf
->add_observer(this);
3419 set_cache_shards(1);
// Test/alternate constructor with an explicit min_alloc_size; otherwise
// mirrors the primary constructor, and additionally sizes the finisher pool
// (sharded per osd_op_num_shards when bluestore_shard_finishers is set).
3422 BlueStore::BlueStore(CephContext
*cct
,
3424 uint64_t _min_alloc_size
)
3425 : ObjectStore(cct
, path
),
3426 throttle_bytes(cct
, "bluestore_throttle_bytes",
3427 cct
->_conf
->bluestore_throttle_bytes
),
3428 throttle_deferred_bytes(cct
, "bluestore_throttle_deferred_bytes",
3429 cct
->_conf
->bluestore_throttle_bytes
+
3430 cct
->_conf
->bluestore_throttle_deferred_bytes
),
3431 kv_sync_thread(this),
3432 kv_finalize_thread(this),
3433 min_alloc_size(_min_alloc_size
),
// min_alloc_size must be a power of two for ctz() to yield its log2.
3434 min_alloc_size_order(ctz(_min_alloc_size
)),
3435 mempool_thread(this)
3438 cct
->_conf
->add_observer(this);
3439 set_cache_shards(1);
3441 if (cct
->_conf
->bluestore_shard_finishers
) {
3442 m_finisher_num
= cct
->_conf
->osd_op_num_shards
;
3445 for (int i
= 0; i
< m_finisher_num
; ++i
) {
3447 oss
<< "finisher-" << i
;
3448 Finisher
*f
= new Finisher(cct
, oss
.str(), "finisher");
3449 finishers
.push_back(f
);
// Destructor: tear down finishers, unregister the config observer, and
// assert the store was properly shut down (no open bluefs/fds) before
// releasing cache shards.
3453 BlueStore::~BlueStore()
3455 for (auto f
: finishers
) {
3460 cct
->_conf
->remove_observer(this);
// umount()/close paths must have run already.
3464 assert(bluefs
== NULL
);
3465 assert(fsid_fd
< 0);
3466 assert(path_fd
< 0);
3467 for (auto i
: cache_shards
) {
3470 cache_shards
.clear();
// Config-observer interface: the list of options whose runtime changes this
// store reacts to (dispatched in handle_conf_change below).
3473 const char **BlueStore::get_tracked_conf_keys() const
3475 static const char* KEYS
[] = {
3476 "bluestore_csum_type",
3477 "bluestore_compression_mode",
3478 "bluestore_compression_algorithm",
3479 "bluestore_compression_min_blob_size",
3480 "bluestore_compression_min_blob_size_ssd",
3481 "bluestore_compression_min_blob_size_hdd",
3482 "bluestore_compression_max_blob_size",
3483 "bluestore_compression_max_blob_size_ssd",
3484 "bluestore_compression_max_blob_size_hdd",
3485 "bluestore_max_alloc_size",
3486 "bluestore_prefer_deferred_size",
3487 "bluestore_deferred_batch_ops",
3488 "bluestore_deferred_batch_ops_hdd",
3489 "bluestore_deferred_batch_ops_ssd",
3490 "bluestore_throttle_bytes",
3491 "bluestore_throttle_deferred_bytes",
3492 "bluestore_throttle_cost_per_io_hdd",
3493 "bluestore_throttle_cost_per_io_ssd",
3494 "bluestore_throttle_cost_per_io",
3495 "bluestore_max_blob_size",
3496 "bluestore_max_blob_size_ssd",
3497 "bluestore_max_blob_size_hdd",
// React to runtime changes of tracked options: recompute csum/compression/
// blob-size/deferred settings (some only after startup, once bdev exists)
// and resize the throttles in place.
3503 void BlueStore::handle_conf_change(const struct md_config_t
*conf
,
3504 const std::set
<std::string
> &changed
)
3506 if (changed
.count("bluestore_csum_type")) {
3509 if (changed
.count("bluestore_compression_mode") ||
3510 changed
.count("bluestore_compression_algorithm") ||
3511 changed
.count("bluestore_compression_min_blob_size") ||
3512 changed
.count("bluestore_compression_max_blob_size")) {
3517 if (changed
.count("bluestore_max_blob_size") ||
3518 changed
.count("bluestore_max_blob_size_ssd") ||
3519 changed
.count("bluestore_max_blob_size_hdd")) {
3521 // only after startup
3525 if (changed
.count("bluestore_prefer_deferred_size") ||
3526 changed
.count("bluestore_max_alloc_size") ||
3527 changed
.count("bluestore_deferred_batch_ops") ||
3528 changed
.count("bluestore_deferred_batch_ops_hdd") ||
3529 changed
.count("bluestore_deferred_batch_ops_ssd")) {
3531 // only after startup
3535 if (changed
.count("bluestore_throttle_cost_per_io") ||
3536 changed
.count("bluestore_throttle_cost_per_io_hdd") ||
3537 changed
.count("bluestore_throttle_cost_per_io_ssd")) {
3539 _set_throttle_params();
// Throttle maxima are adjusted live; deferred max tracks the sum of both
// budgets, so either option changing updates it.
3542 if (changed
.count("bluestore_throttle_bytes")) {
3543 throttle_bytes
.reset_max(conf
->bluestore_throttle_bytes
);
3544 throttle_deferred_bytes
.reset_max(
3545 conf
->bluestore_throttle_bytes
+ conf
->bluestore_throttle_deferred_bytes
);
3547 if (changed
.count("bluestore_throttle_deferred_bytes")) {
3548 throttle_deferred_bytes
.reset_max(
3549 conf
->bluestore_throttle_bytes
+ conf
->bluestore_throttle_deferred_bytes
);
// Derive compression settings from config: min/max blob sizes (explicit
// override, else hdd/ssd default by device rotational-ness), compression
// mode, and the compressor plugin instance.
3553 void BlueStore::_set_compression()
// NOTE(review): this branch tests bluestore_compression_MAX_blob_size but
// assigns comp_MIN_blob_size from it; upstream reads the *min* option here.
// Likely a defect (or an artifact of this sampled text) — confirm against
// the canonical source before relying on it.
3555 if (cct
->_conf
->bluestore_compression_max_blob_size
) {
3556 comp_min_blob_size
= cct
->_conf
->bluestore_compression_max_blob_size
;
3559 if (bdev
->is_rotational()) {
3560 comp_min_blob_size
= cct
->_conf
->bluestore_compression_min_blob_size_hdd
;
3562 comp_min_blob_size
= cct
->_conf
->bluestore_compression_min_blob_size_ssd
;
3566 if (cct
->_conf
->bluestore_compression_max_blob_size
) {
3567 comp_max_blob_size
= cct
->_conf
->bluestore_compression_max_blob_size
;
3570 if (bdev
->is_rotational()) {
3571 comp_max_blob_size
= cct
->_conf
->bluestore_compression_max_blob_size_hdd
;
3573 comp_max_blob_size
= cct
->_conf
->bluestore_compression_max_blob_size_ssd
;
// Parse the compression mode; unknown values fall back to 'none'.
3577 auto m
= Compressor::get_comp_mode_type(cct
->_conf
->bluestore_compression_mode
);
3581 derr
<< __func__
<< " unrecognized value '"
3582 << cct
->_conf
->bluestore_compression_mode
3583 << "' for bluestore_compression_mode, reverting to 'none'"
3585 comp_mode
= Compressor::COMP_NONE
;
3588 compressor
= nullptr;
// Instantiate the configured compressor plugin, if any.
3590 auto& alg_name
= cct
->_conf
->bluestore_compression_algorithm
;
3591 if (!alg_name
.empty()) {
3592 compressor
= Compressor::create(cct
, alg_name
);
3594 derr
<< __func__
<< " unable to initialize " << alg_name
.c_str() << " compressor"
3599 dout(10) << __func__
<< " mode " << Compressor::get_comp_mode_name(comp_mode
)
3600 << " alg " << (compressor
? compressor
->get_type_name() : "(none)")
// Resolve bluestore_csum_type to a Checksummer type, defaulting to
// CSUM_NONE on an unrecognized string.
3604 void BlueStore::_set_csum()
3606 csum_type
= Checksummer::CSUM_NONE
;
3607 int t
= Checksummer::get_csum_string_type(cct
->_conf
->bluestore_csum_type
);
// Only accept values that parse to a real checksum type.
3608 if (t
> Checksummer::CSUM_NONE
)
3611 dout(10) << __func__
<< " csum_type "
3612 << Checksummer::get_csum_type_string(csum_type
)
// Pick the per-IO throttle cost: explicit override if set, otherwise the
// hdd/ssd default based on whether the device is rotational.
3616 void BlueStore::_set_throttle_params()
3618 if (cct
->_conf
->bluestore_throttle_cost_per_io
) {
3619 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io
;
3622 if (bdev
->is_rotational()) {
3623 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io_hdd
;
3625 throttle_cost_per_io
= cct
->_conf
->bluestore_throttle_cost_per_io_ssd
;
3629 dout(10) << __func__
<< " throttle_cost_per_io " << throttle_cost_per_io
// Pick max_blob_size: explicit override if set, otherwise the hdd/ssd
// default based on whether the device is rotational.
3632 void BlueStore::_set_blob_size()
3634 if (cct
->_conf
->bluestore_max_blob_size
) {
3635 max_blob_size
= cct
->_conf
->bluestore_max_blob_size
;
3638 if (bdev
->is_rotational()) {
3639 max_blob_size
= cct
->_conf
->bluestore_max_blob_size_hdd
;
3641 max_blob_size
= cct
->_conf
->bluestore_max_blob_size_ssd
;
3644 dout(10) << __func__
<< " max_blob_size 0x" << std::hex
<< max_blob_size
3645 << std::dec
<< dendl
;
// Validate and apply the cache partitioning ratios: meta + kv must each be
// in (0, 1.0] and sum to <= 1.0; data gets the remainder (clamped at 0 for
// floating-point slop).  Returns nonzero on invalid config (error returns
// are on lines not visible in this sampled view).
// NOTE(review): the derr messages concatenate __func__ with no separating
// space — cosmetic log defect; runtime strings intentionally left untouched.
3648 int BlueStore::_set_cache_sizes()
3650 cache_meta_ratio
= cct
->_conf
->bluestore_cache_meta_ratio
;
3651 cache_kv_ratio
= cct
->_conf
->bluestore_cache_kv_ratio
;
// Data ratio is whatever meta and kv leave over.
3653 (double)1.0 - (double)cache_meta_ratio
- (double)cache_kv_ratio
;
3655 if (cache_meta_ratio
<= 0 || cache_meta_ratio
> 1.0) {
3656 derr
<< __func__
<< "bluestore_cache_meta_ratio (" << cache_meta_ratio
3657 << ") must be in range (0,1.0]" << dendl
;
3660 if (cache_kv_ratio
<= 0 || cache_kv_ratio
> 1.0) {
3661 derr
<< __func__
<< "bluestore_cache_kv_ratio (" << cache_kv_ratio
3662 << ") must be in range (0,1.0]" << dendl
;
3665 if (cache_meta_ratio
+ cache_kv_ratio
> 1.0) {
3666 derr
<< __func__
<< "bluestore_cache_meta_ratio (" << cache_meta_ratio
3667 << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
3668 << ") = " << cache_meta_ratio
+ cache_kv_ratio
<< "; must be <= 1.0"
3672 if (cache_data_ratio
< 0) {
3673 // deal with floating point imprecision
3674 cache_data_ratio
= 0;
3676 dout(1) << __func__
<< " meta " << cache_meta_ratio
3677 << " kv " << cache_kv_ratio
3678 << " data " << cache_data_ratio
3683 void BlueStore::_init_logger()
3685 PerfCountersBuilder
b(cct
, "bluestore",
3686 l_bluestore_first
, l_bluestore_last
);
3687 b
.add_time_avg(l_bluestore_kv_flush_lat
, "kv_flush_lat",
3688 "Average kv_thread flush latency",
3689 "fl_l", PerfCountersBuilder::PRIO_INTERESTING
);
3690 b
.add_time_avg(l_bluestore_kv_commit_lat
, "kv_commit_lat",
3691 "Average kv_thread commit latency");
3692 b
.add_time_avg(l_bluestore_kv_lat
, "kv_lat",
3693 "Average kv_thread sync latency",
3694 "k_l", PerfCountersBuilder::PRIO_INTERESTING
);
3695 b
.add_time_avg(l_bluestore_state_prepare_lat
, "state_prepare_lat",
3696 "Average prepare state latency");
3697 b
.add_time_avg(l_bluestore_state_aio_wait_lat
, "state_aio_wait_lat",
3698 "Average aio_wait state latency",
3699 "io_l", PerfCountersBuilder::PRIO_INTERESTING
);
3700 b
.add_time_avg(l_bluestore_state_io_done_lat
, "state_io_done_lat",
3701 "Average io_done state latency");
3702 b
.add_time_avg(l_bluestore_state_kv_queued_lat
, "state_kv_queued_lat",
3703 "Average kv_queued state latency");
3704 b
.add_time_avg(l_bluestore_state_kv_committing_lat
, "state_kv_commiting_lat",
3705 "Average kv_commiting state latency");
3706 b
.add_time_avg(l_bluestore_state_kv_done_lat
, "state_kv_done_lat",
3707 "Average kv_done state latency");
3708 b
.add_time_avg(l_bluestore_state_deferred_queued_lat
, "state_deferred_queued_lat",
3709 "Average deferred_queued state latency");
3710 b
.add_time_avg(l_bluestore_state_deferred_aio_wait_lat
, "state_deferred_aio_wait_lat",
3711 "Average aio_wait state latency");
3712 b
.add_time_avg(l_bluestore_state_deferred_cleanup_lat
, "state_deferred_cleanup_lat",
3713 "Average cleanup state latency");
3714 b
.add_time_avg(l_bluestore_state_finishing_lat
, "state_finishing_lat",
3715 "Average finishing state latency");
3716 b
.add_time_avg(l_bluestore_state_done_lat
, "state_done_lat",
3717 "Average done state latency");
3718 b
.add_time_avg(l_bluestore_throttle_lat
, "throttle_lat",
3719 "Average submit throttle latency",
3720 "th_l", PerfCountersBuilder::PRIO_CRITICAL
);
3721 b
.add_time_avg(l_bluestore_submit_lat
, "submit_lat",
3722 "Average submit latency",
3723 "s_l", PerfCountersBuilder::PRIO_CRITICAL
);
3724 b
.add_time_avg(l_bluestore_commit_lat
, "commit_lat",
3725 "Average commit latency",
3726 "c_l", PerfCountersBuilder::PRIO_CRITICAL
);
3727 b
.add_time_avg(l_bluestore_read_lat
, "read_lat",
3728 "Average read latency",
3729 "r_l", PerfCountersBuilder::PRIO_CRITICAL
);
3730 b
.add_time_avg(l_bluestore_read_onode_meta_lat
, "read_onode_meta_lat",
3731 "Average read onode metadata latency");
3732 b
.add_time_avg(l_bluestore_read_wait_aio_lat
, "read_wait_aio_lat",
3733 "Average read latency");
3734 b
.add_time_avg(l_bluestore_compress_lat
, "compress_lat",
3735 "Average compress latency");
3736 b
.add_time_avg(l_bluestore_decompress_lat
, "decompress_lat",
3737 "Average decompress latency");
3738 b
.add_time_avg(l_bluestore_csum_lat
, "csum_lat",
3739 "Average checksum latency");
3740 b
.add_u64_counter(l_bluestore_compress_success_count
, "compress_success_count",
3741 "Sum for beneficial compress ops");
3742 b
.add_u64_counter(l_bluestore_compress_rejected_count
, "compress_rejected_count",
3743 "Sum for compress ops rejected due to low net gain of space");
3744 b
.add_u64_counter(l_bluestore_write_pad_bytes
, "write_pad_bytes",
3745 "Sum for write-op padded bytes");
3746 b
.add_u64_counter(l_bluestore_deferred_write_ops
, "deferred_write_ops",
3747 "Sum for deferred write op");
3748 b
.add_u64_counter(l_bluestore_deferred_write_bytes
, "deferred_write_bytes",
3749 "Sum for deferred write bytes", "def");
3750 b
.add_u64_counter(l_bluestore_write_penalty_read_ops
, "write_penalty_read_ops",
3751 "Sum for write penalty read ops");
3752 b
.add_u64(l_bluestore_allocated
, "bluestore_allocated",
3753 "Sum for allocated bytes");
3754 b
.add_u64(l_bluestore_stored
, "bluestore_stored",
3755 "Sum for stored bytes");
3756 b
.add_u64(l_bluestore_compressed
, "bluestore_compressed",
3757 "Sum for stored compressed bytes");
3758 b
.add_u64(l_bluestore_compressed_allocated
, "bluestore_compressed_allocated",
3759 "Sum for bytes allocated for compressed data");
3760 b
.add_u64(l_bluestore_compressed_original
, "bluestore_compressed_original",
3761 "Sum for original bytes that were compressed");
3763 b
.add_u64(l_bluestore_onodes
, "bluestore_onodes",
3764 "Number of onodes in cache");
3765 b
.add_u64_counter(l_bluestore_onode_hits
, "bluestore_onode_hits",
3766 "Sum for onode-lookups hit in the cache");
3767 b
.add_u64_counter(l_bluestore_onode_misses
, "bluestore_onode_misses",
3768 "Sum for onode-lookups missed in the cache");
3769 b
.add_u64_counter(l_bluestore_onode_shard_hits
, "bluestore_onode_shard_hits",
3770 "Sum for onode-shard lookups hit in the cache");
3771 b
.add_u64_counter(l_bluestore_onode_shard_misses
,
3772 "bluestore_onode_shard_misses",
3773 "Sum for onode-shard lookups missed in the cache");
3774 b
.add_u64(l_bluestore_extents
, "bluestore_extents",
3775 "Number of extents in cache");
3776 b
.add_u64(l_bluestore_blobs
, "bluestore_blobs",
3777 "Number of blobs in cache");
3778 b
.add_u64(l_bluestore_buffers
, "bluestore_buffers",
3779 "Number of buffers in cache");
3780 b
.add_u64(l_bluestore_buffer_bytes
, "bluestore_buffer_bytes",
3781 "Number of buffer bytes in cache");
3782 b
.add_u64(l_bluestore_buffer_hit_bytes
, "bluestore_buffer_hit_bytes",
3783 "Sum for bytes of read hit in the cache");
3784 b
.add_u64(l_bluestore_buffer_miss_bytes
, "bluestore_buffer_miss_bytes",
3785 "Sum for bytes of read missed in the cache");
3787 b
.add_u64_counter(l_bluestore_write_big
, "bluestore_write_big",
3788 "Large aligned writes into fresh blobs");
3789 b
.add_u64_counter(l_bluestore_write_big_bytes
, "bluestore_write_big_bytes",
3790 "Large aligned writes into fresh blobs (bytes)");
3791 b
.add_u64_counter(l_bluestore_write_big_blobs
, "bluestore_write_big_blobs",
3792 "Large aligned writes into fresh blobs (blobs)");
3793 b
.add_u64_counter(l_bluestore_write_small
, "bluestore_write_small",
3794 "Small writes into existing or sparse small blobs");
3795 b
.add_u64_counter(l_bluestore_write_small_bytes
, "bluestore_write_small_bytes",
3796 "Small writes into existing or sparse small blobs (bytes)");
3797 b
.add_u64_counter(l_bluestore_write_small_unused
,
3798 "bluestore_write_small_unused",
3799 "Small writes into unused portion of existing blob");
3800 b
.add_u64_counter(l_bluestore_write_small_deferred
,
3801 "bluestore_write_small_deferred",
3802 "Small overwrites using deferred");
3803 b
.add_u64_counter(l_bluestore_write_small_pre_read
,
3804 "bluestore_write_small_pre_read",
3805 "Small writes that required we read some data (possibly "
3806 "cached) to fill out the block");
3807 b
.add_u64_counter(l_bluestore_write_small_new
, "bluestore_write_small_new",
3808 "Small write into new (sparse) blob");
3810 b
.add_u64_counter(l_bluestore_txc
, "bluestore_txc", "Transactions committed");
3811 b
.add_u64_counter(l_bluestore_onode_reshard
, "bluestore_onode_reshard",
3812 "Onode extent map reshard events");
3813 b
.add_u64_counter(l_bluestore_blob_split
, "bluestore_blob_split",
3814 "Sum for blob splitting due to resharding");
3815 b
.add_u64_counter(l_bluestore_extent_compress
, "bluestore_extent_compress",
3816 "Sum for extents that have been removed due to compression");
3817 b
.add_u64_counter(l_bluestore_gc_merged
, "bluestore_gc_merged",
3818 "Sum for extents that have been merged due to garbage "
3820 logger
= b
.create_perf_counters();
3821 cct
->get_perfcounters_collection()->add(logger
);
3824 int BlueStore::_reload_logger()
3826 struct store_statfs_t store_statfs
;
3828 int r
= statfs(&store_statfs
);
3830 logger
->set(l_bluestore_allocated
, store_statfs
.allocated
);
3831 logger
->set(l_bluestore_stored
, store_statfs
.stored
);
3832 logger
->set(l_bluestore_compressed
, store_statfs
.compressed
);
3833 logger
->set(l_bluestore_compressed_allocated
, store_statfs
.compressed_allocated
);
3834 logger
->set(l_bluestore_compressed_original
, store_statfs
.compressed_original
);
3839 void BlueStore::_shutdown_logger()
3841 cct
->get_perfcounters_collection()->remove(logger
);
3845 int BlueStore::get_block_device_fsid(CephContext
* cct
, const string
& path
,
3848 bluestore_bdev_label_t label
;
3849 int r
= _read_bdev_label(cct
, path
, &label
);
3852 *fsid
= label
.osd_uuid
;
3856 int BlueStore::_open_path()
3858 // initial sanity check
3859 int r
= _set_cache_sizes();
3864 assert(path_fd
< 0);
3865 path_fd
= ::open(path
.c_str(), O_DIRECTORY
);
3868 derr
<< __func__
<< " unable to open " << path
<< ": " << cpp_strerror(r
)
3875 void BlueStore::_close_path()
3877 VOID_TEMP_FAILURE_RETRY(::close(path_fd
));
3881 int BlueStore::_write_bdev_label(string path
, bluestore_bdev_label_t label
)
3883 dout(10) << __func__
<< " path " << path
<< " label " << label
<< dendl
;
3885 ::encode(label
, bl
);
3886 uint32_t crc
= bl
.crc32c(-1);
3888 assert(bl
.length() <= BDEV_LABEL_BLOCK_SIZE
);
3889 bufferptr
z(BDEV_LABEL_BLOCK_SIZE
- bl
.length());
3891 bl
.append(std::move(z
));
3893 int fd
= ::open(path
.c_str(), O_WRONLY
);
3896 derr
<< __func__
<< " failed to open " << path
<< ": " << cpp_strerror(fd
)
3900 int r
= bl
.write_fd(fd
);
3902 derr
<< __func__
<< " failed to write to " << path
3903 << ": " << cpp_strerror(r
) << dendl
;
3905 VOID_TEMP_FAILURE_RETRY(::close(fd
));
3909 int BlueStore::_read_bdev_label(CephContext
* cct
, string path
,
3910 bluestore_bdev_label_t
*label
)
3912 dout(10) << __func__
<< dendl
;
3913 int fd
= ::open(path
.c_str(), O_RDONLY
);
3916 derr
<< __func__
<< " failed to open " << path
<< ": " << cpp_strerror(fd
)
3921 int r
= bl
.read_fd(fd
, BDEV_LABEL_BLOCK_SIZE
);
3922 VOID_TEMP_FAILURE_RETRY(::close(fd
));
3924 derr
<< __func__
<< " failed to read from " << path
3925 << ": " << cpp_strerror(r
) << dendl
;
3929 uint32_t crc
, expected_crc
;
3930 bufferlist::iterator p
= bl
.begin();
3932 ::decode(*label
, p
);
3934 t
.substr_of(bl
, 0, p
.get_off());
3936 ::decode(expected_crc
, p
);
3938 catch (buffer::error
& e
) {
3939 derr
<< __func__
<< " unable to decode label at offset " << p
.get_off()
3944 if (crc
!= expected_crc
) {
3945 derr
<< __func__
<< " bad crc on label, expected " << expected_crc
3946 << " != actual " << crc
<< dendl
;
3949 dout(10) << __func__
<< " got " << *label
<< dendl
;
3953 int BlueStore::_check_or_set_bdev_label(
3954 string path
, uint64_t size
, string desc
, bool create
)
3956 bluestore_bdev_label_t label
;
3958 label
.osd_uuid
= fsid
;
3960 label
.btime
= ceph_clock_now();
3961 label
.description
= desc
;
3962 int r
= _write_bdev_label(path
, label
);
3966 int r
= _read_bdev_label(cct
, path
, &label
);
3969 if (cct
->_conf
->bluestore_debug_permit_any_bdev_label
) {
3970 dout(20) << __func__
<< " bdev " << path
<< " fsid " << label
.osd_uuid
3971 << " and fsid " << fsid
<< " check bypassed" << dendl
;
3973 else if (label
.osd_uuid
!= fsid
) {
3974 derr
<< __func__
<< " bdev " << path
<< " fsid " << label
.osd_uuid
3975 << " does not match our fsid " << fsid
<< dendl
;
3982 void BlueStore::_set_alloc_sizes(void)
3984 min_alloc_size_order
= ctz(min_alloc_size
);
3985 assert(min_alloc_size
== 1u << min_alloc_size_order
);
3987 max_alloc_size
= cct
->_conf
->bluestore_max_alloc_size
;
3989 if (cct
->_conf
->bluestore_prefer_deferred_size
) {
3990 prefer_deferred_size
= cct
->_conf
->bluestore_prefer_deferred_size
;
3993 if (bdev
->is_rotational()) {
3994 prefer_deferred_size
= cct
->_conf
->bluestore_prefer_deferred_size_hdd
;
3996 prefer_deferred_size
= cct
->_conf
->bluestore_prefer_deferred_size_ssd
;
4000 if (cct
->_conf
->bluestore_deferred_batch_ops
) {
4001 deferred_batch_ops
= cct
->_conf
->bluestore_deferred_batch_ops
;
4004 if (bdev
->is_rotational()) {
4005 deferred_batch_ops
= cct
->_conf
->bluestore_deferred_batch_ops_hdd
;
4007 deferred_batch_ops
= cct
->_conf
->bluestore_deferred_batch_ops_ssd
;
4011 dout(10) << __func__
<< " min_alloc_size 0x" << std::hex
<< min_alloc_size
4012 << std::dec
<< " order " << min_alloc_size_order
4013 << " max_alloc_size 0x" << std::hex
<< max_alloc_size
4014 << " prefer_deferred_size 0x" << prefer_deferred_size
4016 << " deferred_batch_ops " << deferred_batch_ops
4020 int BlueStore::_open_bdev(bool create
)
4022 assert(bdev
== NULL
);
4023 string p
= path
+ "/block";
4024 bdev
= BlockDevice::create(cct
, p
, aio_cb
, static_cast<void*>(this));
4025 int r
= bdev
->open(p
);
4029 if (bdev
->supported_bdev_label()) {
4030 r
= _check_or_set_bdev_label(p
, bdev
->get_size(), "main", create
);
4035 // initialize global block parameters
4036 block_size
= bdev
->get_block_size();
4037 block_mask
= ~(block_size
- 1);
4038 block_size_order
= ctz(block_size
);
4039 assert(block_size
== 1u << block_size_order
);
4050 void BlueStore::_close_bdev()
4058 int BlueStore::_open_fm(bool create
)
4061 fm
= FreelistManager::create(cct
, freelist_type
, db
, PREFIX_ALLOC
);
4064 // initialize freespace
4065 dout(20) << __func__
<< " initializing freespace" << dendl
;
4066 KeyValueDB::Transaction t
= db
->get_transaction();
4069 bl
.append(freelist_type
);
4070 t
->set(PREFIX_SUPER
, "freelist_type", bl
);
4072 fm
->create(bdev
->get_size(), t
);
4074 // allocate superblock reserved space. note that we do not mark
4075 // bluefs space as allocated in the freelist; we instead rely on
4077 fm
->allocate(0, SUPER_RESERVED
, t
);
4079 uint64_t reserved
= 0;
4080 if (cct
->_conf
->bluestore_bluefs
) {
4081 assert(bluefs_extents
.num_intervals() == 1);
4082 interval_set
<uint64_t>::iterator p
= bluefs_extents
.begin();
4083 reserved
= p
.get_start() + p
.get_len();
4084 dout(20) << __func__
<< " reserved 0x" << std::hex
<< reserved
<< std::dec
4085 << " for bluefs" << dendl
;
4087 ::encode(bluefs_extents
, bl
);
4088 t
->set(PREFIX_SUPER
, "bluefs_extents", bl
);
4089 dout(20) << __func__
<< " bluefs_extents 0x" << std::hex
<< bluefs_extents
4090 << std::dec
<< dendl
;
4092 reserved
= SUPER_RESERVED
;
4095 if (cct
->_conf
->bluestore_debug_prefill
> 0) {
4096 uint64_t end
= bdev
->get_size() - reserved
;
4097 dout(1) << __func__
<< " pre-fragmenting freespace, using "
4098 << cct
->_conf
->bluestore_debug_prefill
<< " with max free extent "
4099 << cct
->_conf
->bluestore_debug_prefragment_max
<< dendl
;
4100 uint64_t start
= P2ROUNDUP(reserved
, min_alloc_size
);
4101 uint64_t max_b
= cct
->_conf
->bluestore_debug_prefragment_max
/ min_alloc_size
;
4102 float r
= cct
->_conf
->bluestore_debug_prefill
;
4106 while (!stop
&& start
< end
) {
4107 uint64_t l
= (rand() % max_b
+ 1) * min_alloc_size
;
4108 if (start
+ l
> end
) {
4110 l
= P2ALIGN(l
, min_alloc_size
);
4112 assert(start
+ l
<= end
);
4114 uint64_t u
= 1 + (uint64_t)(r
* (double)l
);
4115 u
= P2ROUNDUP(u
, min_alloc_size
);
4116 if (start
+ l
+ u
> end
) {
4117 u
= end
- (start
+ l
);
4118 // trim to align so we don't overflow again
4119 u
= P2ALIGN(u
, min_alloc_size
);
4122 assert(start
+ l
+ u
<= end
);
4124 dout(20) << " free 0x" << std::hex
<< start
<< "~" << l
4125 << " use 0x" << u
<< std::dec
<< dendl
;
4128 // break if u has been trimmed to nothing
4132 fm
->allocate(start
+ l
, u
, t
);
4136 db
->submit_transaction_sync(t
);
4141 derr
<< __func__
<< " freelist init failed: " << cpp_strerror(r
) << dendl
;
4149 void BlueStore::_close_fm()
4151 dout(10) << __func__
<< dendl
;
4158 int BlueStore::_open_alloc()
4160 assert(alloc
== NULL
);
4161 assert(bdev
->get_size());
4162 alloc
= Allocator::create(cct
, cct
->_conf
->bluestore_allocator
,
4166 lderr(cct
) << __func__
<< " Allocator::unknown alloc type "
4167 << cct
->_conf
->bluestore_allocator
4172 uint64_t num
= 0, bytes
= 0;
4174 dout(1) << __func__
<< " opening allocation metadata" << dendl
;
4175 // initialize from freelist
4176 fm
->enumerate_reset();
4177 uint64_t offset
, length
;
4178 while (fm
->enumerate_next(&offset
, &length
)) {
4179 alloc
->init_add_free(offset
, length
);
4183 dout(1) << __func__
<< " loaded " << pretty_si_t(bytes
)
4184 << " in " << num
<< " extents"
4187 // also mark bluefs space as allocated
4188 for (auto e
= bluefs_extents
.begin(); e
!= bluefs_extents
.end(); ++e
) {
4189 alloc
->init_rm_free(e
.get_start(), e
.get_len());
4191 dout(10) << __func__
<< " marked bluefs_extents 0x" << std::hex
4192 << bluefs_extents
<< std::dec
<< " as allocated" << dendl
;
4197 void BlueStore::_close_alloc()
4205 int BlueStore::_open_fsid(bool create
)
4207 assert(fsid_fd
< 0);
4211 fsid_fd
= ::openat(path_fd
, "fsid", flags
, 0644);
4214 derr
<< __func__
<< " " << cpp_strerror(err
) << dendl
;
4220 int BlueStore::_read_fsid(uuid_d
*uuid
)
4223 memset(fsid_str
, 0, sizeof(fsid_str
));
4224 int ret
= safe_read(fsid_fd
, fsid_str
, sizeof(fsid_str
));
4226 derr
<< __func__
<< " failed: " << cpp_strerror(ret
) << dendl
;
4233 if (!uuid
->parse(fsid_str
)) {
4234 derr
<< __func__
<< " unparsable uuid " << fsid_str
<< dendl
;
4240 int BlueStore::_write_fsid()
4242 int r
= ::ftruncate(fsid_fd
, 0);
4245 derr
<< __func__
<< " fsid truncate failed: " << cpp_strerror(r
) << dendl
;
4248 string str
= stringify(fsid
) + "\n";
4249 r
= safe_write(fsid_fd
, str
.c_str(), str
.length());
4251 derr
<< __func__
<< " fsid write failed: " << cpp_strerror(r
) << dendl
;
4254 r
= ::fsync(fsid_fd
);
4257 derr
<< __func__
<< " fsid fsync failed: " << cpp_strerror(r
) << dendl
;
4263 void BlueStore::_close_fsid()
4265 VOID_TEMP_FAILURE_RETRY(::close(fsid_fd
));
4269 int BlueStore::_lock_fsid()
4272 memset(&l
, 0, sizeof(l
));
4274 l
.l_whence
= SEEK_SET
;
4275 int r
= ::fcntl(fsid_fd
, F_SETLK
, &l
);
4278 derr
<< __func__
<< " failed to lock " << path
<< "/fsid"
4279 << " (is another ceph-osd still running?)"
4280 << cpp_strerror(err
) << dendl
;
4286 bool BlueStore::is_rotational()
4289 return bdev
->is_rotational();
4292 bool rotational
= true;
4293 int r
= _open_path();
4296 r
= _open_fsid(false);
4299 r
= _read_fsid(&fsid
);
4305 r
= _open_bdev(false);
4308 rotational
= bdev
->is_rotational();
4318 bool BlueStore::test_mount_in_use()
4320 // most error conditions mean the mount is not in use (e.g., because
4321 // it doesn't exist). only if we fail to lock do we conclude it is
4324 int r
= _open_path();
4327 r
= _open_fsid(false);
4332 ret
= true; // if we can't lock, it is in use
4339 int BlueStore::_open_db(bool create
)
4343 string fn
= path
+ "/db";
4346 ceph::shared_ptr
<Int64ArrayMergeOperator
> merge_op(new Int64ArrayMergeOperator
);
4350 kv_backend
= cct
->_conf
->bluestore_kvbackend
;
4352 r
= read_meta("kv_backend", &kv_backend
);
4354 derr
<< __func__
<< " unable to read 'kv_backend' meta" << dendl
;
4358 dout(10) << __func__
<< " kv_backend = " << kv_backend
<< dendl
;
4362 do_bluefs
= cct
->_conf
->bluestore_bluefs
;
4365 r
= read_meta("bluefs", &s
);
4367 derr
<< __func__
<< " unable to read 'bluefs' meta" << dendl
;
4372 } else if (s
== "0") {
4375 derr
<< __func__
<< " bluefs = " << s
<< " : not 0 or 1, aborting"
4380 dout(10) << __func__
<< " do_bluefs = " << do_bluefs
<< dendl
;
4382 rocksdb::Env
*env
= NULL
;
4384 dout(10) << __func__
<< " initializing bluefs" << dendl
;
4385 if (kv_backend
!= "rocksdb") {
4386 derr
<< " backend must be rocksdb to use bluefs" << dendl
;
4389 bluefs
= new BlueFS(cct
);
4394 bfn
= path
+ "/block.db";
4395 if (::stat(bfn
.c_str(), &st
) == 0) {
4396 r
= bluefs
->add_block_device(BlueFS::BDEV_DB
, bfn
);
4398 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
4399 << cpp_strerror(r
) << dendl
;
4403 if (bluefs
->bdev_support_label(BlueFS::BDEV_DB
)) {
4404 r
= _check_or_set_bdev_label(
4406 bluefs
->get_block_device_size(BlueFS::BDEV_DB
),
4407 "bluefs db", create
);
4410 << " check block device(" << bfn
<< ") label returned: "
4411 << cpp_strerror(r
) << dendl
;
4416 bluefs
->add_block_extent(
4419 bluefs
->get_block_device_size(BlueFS::BDEV_DB
) - SUPER_RESERVED
);
4421 bluefs_shared_bdev
= BlueFS::BDEV_SLOW
;
4422 bluefs_single_shared_device
= false;
4423 } else if (::lstat(bfn
.c_str(), &st
) == -1) {
4424 bluefs_shared_bdev
= BlueFS::BDEV_DB
;
4426 //symlink exist is bug
4427 derr
<< __func__
<< " " << bfn
<< " link target doesn't exist" << dendl
;
4433 bfn
= path
+ "/block";
4434 r
= bluefs
->add_block_device(bluefs_shared_bdev
, bfn
);
4436 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
4437 << cpp_strerror(r
) << dendl
;
4441 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4443 bdev
->get_size() * (cct
->_conf
->bluestore_bluefs_min_ratio
+
4444 cct
->_conf
->bluestore_bluefs_gift_ratio
);
4445 initial
= MAX(initial
, cct
->_conf
->bluestore_bluefs_min
);
4446 // align to bluefs's alloc_size
4447 initial
= P2ROUNDUP(initial
, cct
->_conf
->bluefs_alloc_size
);
4448 // put bluefs in the middle of the device in case it is an HDD
4449 uint64_t start
= P2ALIGN((bdev
->get_size() - initial
) / 2,
4450 cct
->_conf
->bluefs_alloc_size
);
4451 bluefs
->add_block_extent(bluefs_shared_bdev
, start
, initial
);
4452 bluefs_extents
.insert(start
, initial
);
4455 bfn
= path
+ "/block.wal";
4456 if (::stat(bfn
.c_str(), &st
) == 0) {
4457 r
= bluefs
->add_block_device(BlueFS::BDEV_WAL
, bfn
);
4459 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
4460 << cpp_strerror(r
) << dendl
;
4464 if (bluefs
->bdev_support_label(BlueFS::BDEV_WAL
)) {
4465 r
= _check_or_set_bdev_label(
4467 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
),
4468 "bluefs wal", create
);
4470 derr
<< __func__
<< " check block device(" << bfn
4471 << ") label returned: " << cpp_strerror(r
) << dendl
;
4477 bluefs
->add_block_extent(
4478 BlueFS::BDEV_WAL
, BDEV_LABEL_BLOCK_SIZE
,
4479 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
) -
4480 BDEV_LABEL_BLOCK_SIZE
);
4482 cct
->_conf
->set_val("rocksdb_separate_wal_dir", "true");
4483 bluefs_single_shared_device
= false;
4484 } else if (::lstat(bfn
.c_str(), &st
) == -1) {
4485 cct
->_conf
->set_val("rocksdb_separate_wal_dir", "false");
4487 //symlink exist is bug
4488 derr
<< __func__
<< " " << bfn
<< " link target doesn't exist" << dendl
;
4496 r
= bluefs
->mount();
4498 derr
<< __func__
<< " failed bluefs mount: " << cpp_strerror(r
) << dendl
;
4501 if (cct
->_conf
->bluestore_bluefs_env_mirror
) {
4502 rocksdb::Env
*a
= new BlueRocksEnv(bluefs
);
4503 rocksdb::Env
*b
= rocksdb::Env::Default();
4505 string cmd
= "rm -rf " + path
+ "/db " +
4506 path
+ "/db.slow " +
4508 int r
= system(cmd
.c_str());
4511 env
= new rocksdb::EnvMirror(b
, a
, false, true);
4513 env
= new BlueRocksEnv(bluefs
);
4515 // simplify the dir names, too, as "seen" by rocksdb
4519 if (bluefs_shared_bdev
== BlueFS::BDEV_SLOW
) {
4520 // we have both block.db and block; tell rocksdb!
4521 // note: the second (last) size value doesn't really matter
4522 ostringstream db_paths
;
4523 uint64_t db_size
= bluefs
->get_block_device_size(BlueFS::BDEV_DB
);
4524 uint64_t slow_size
= bluefs
->get_block_device_size(BlueFS::BDEV_SLOW
);
4525 db_paths
<< fn
<< ","
4526 << (uint64_t)(db_size
* 95 / 100) << " "
4527 << fn
+ ".slow" << ","
4528 << (uint64_t)(slow_size
* 95 / 100);
4529 cct
->_conf
->set_val("rocksdb_db_paths", db_paths
.str(), false);
4530 dout(10) << __func__
<< " set rocksdb_db_paths to "
4531 << cct
->_conf
->get_val
<std::string
>("rocksdb_db_paths") << dendl
;
4536 if (cct
->_conf
->rocksdb_separate_wal_dir
)
4537 env
->CreateDir(fn
+ ".wal");
4538 if (cct
->_conf
->get_val
<std::string
>("rocksdb_db_paths").length())
4539 env
->CreateDir(fn
+ ".slow");
4541 } else if (create
) {
4542 int r
= ::mkdir(fn
.c_str(), 0755);
4545 if (r
< 0 && r
!= -EEXIST
) {
4546 derr
<< __func__
<< " failed to create " << fn
<< ": " << cpp_strerror(r
)
4552 if (cct
->_conf
->rocksdb_separate_wal_dir
) {
4553 string walfn
= path
+ "/db.wal";
4554 r
= ::mkdir(walfn
.c_str(), 0755);
4557 if (r
< 0 && r
!= -EEXIST
) {
4558 derr
<< __func__
<< " failed to create " << walfn
4559 << ": " << cpp_strerror(r
)
4566 db
= KeyValueDB::create(cct
,
4569 static_cast<void*>(env
));
4571 derr
<< __func__
<< " error creating db" << dendl
;
4577 // delete env manually here since we can't depend on db to do this
4584 FreelistManager::setup_merge_operators(db
);
4585 db
->set_merge_operator(PREFIX_STAT
, merge_op
);
4587 db
->set_cache_size(cct
->_conf
->bluestore_cache_size
* cache_kv_ratio
);
4589 if (kv_backend
== "rocksdb")
4590 options
= cct
->_conf
->bluestore_rocksdb_options
;
4593 r
= db
->create_and_open(err
);
4597 derr
<< __func__
<< " erroring opening db: " << err
.str() << dendl
;
4607 dout(1) << __func__
<< " opened " << kv_backend
4608 << " path " << fn
<< " options " << options
<< dendl
;
4618 void BlueStore::_close_db()
4630 int BlueStore::_reconcile_bluefs_freespace()
4632 dout(10) << __func__
<< dendl
;
4633 interval_set
<uint64_t> bset
;
4634 int r
= bluefs
->get_block_extents(bluefs_shared_bdev
, &bset
);
4636 if (bset
== bluefs_extents
) {
4637 dout(10) << __func__
<< " we agree bluefs has 0x" << std::hex
<< bset
4638 << std::dec
<< dendl
;
4641 dout(10) << __func__
<< " bluefs says 0x" << std::hex
<< bset
<< std::dec
4643 dout(10) << __func__
<< " super says 0x" << std::hex
<< bluefs_extents
4644 << std::dec
<< dendl
;
4646 interval_set
<uint64_t> overlap
;
4647 overlap
.intersection_of(bset
, bluefs_extents
);
4649 bset
.subtract(overlap
);
4650 if (!bset
.empty()) {
4651 derr
<< __func__
<< " bluefs extra 0x" << std::hex
<< bset
<< std::dec
4656 interval_set
<uint64_t> super_extra
;
4657 super_extra
= bluefs_extents
;
4658 super_extra
.subtract(overlap
);
4659 if (!super_extra
.empty()) {
4660 // This is normal: it can happen if we commit to give extents to
4661 // bluefs and we crash before bluefs commits that it owns them.
4662 dout(10) << __func__
<< " super extra " << super_extra
<< dendl
;
4663 for (interval_set
<uint64_t>::iterator p
= super_extra
.begin();
4664 p
!= super_extra
.end();
4666 bluefs
->add_block_extent(bluefs_shared_bdev
, p
.get_start(), p
.get_len());
4673 int BlueStore::_balance_bluefs_freespace(PExtentVector
*extents
)
4678 vector
<pair
<uint64_t,uint64_t>> bluefs_usage
; // <free, total> ...
4679 bluefs
->get_usage(&bluefs_usage
);
4680 assert(bluefs_usage
.size() > bluefs_shared_bdev
);
4682 // fixme: look at primary bdev only for now
4683 uint64_t bluefs_free
= bluefs_usage
[bluefs_shared_bdev
].first
;
4684 uint64_t bluefs_total
= bluefs_usage
[bluefs_shared_bdev
].second
;
4685 float bluefs_free_ratio
= (float)bluefs_free
/ (float)bluefs_total
;
4687 uint64_t my_free
= alloc
->get_free();
4688 uint64_t total
= bdev
->get_size();
4689 float my_free_ratio
= (float)my_free
/ (float)total
;
4691 uint64_t total_free
= bluefs_free
+ my_free
;
4693 float bluefs_ratio
= (float)bluefs_free
/ (float)total_free
;
4695 dout(10) << __func__
4696 << " bluefs " << pretty_si_t(bluefs_free
)
4697 << " free (" << bluefs_free_ratio
4698 << ") bluestore " << pretty_si_t(my_free
)
4699 << " free (" << my_free_ratio
4700 << "), bluefs_ratio " << bluefs_ratio
4704 uint64_t reclaim
= 0;
4705 if (bluefs_ratio
< cct
->_conf
->bluestore_bluefs_min_ratio
) {
4706 gift
= cct
->_conf
->bluestore_bluefs_gift_ratio
* total_free
;
4707 dout(10) << __func__
<< " bluefs_ratio " << bluefs_ratio
4708 << " < min_ratio " << cct
->_conf
->bluestore_bluefs_min_ratio
4709 << ", should gift " << pretty_si_t(gift
) << dendl
;
4710 } else if (bluefs_ratio
> cct
->_conf
->bluestore_bluefs_max_ratio
) {
4711 reclaim
= cct
->_conf
->bluestore_bluefs_reclaim_ratio
* total_free
;
4712 if (bluefs_total
- reclaim
< cct
->_conf
->bluestore_bluefs_min
)
4713 reclaim
= bluefs_total
- cct
->_conf
->bluestore_bluefs_min
;
4714 dout(10) << __func__
<< " bluefs_ratio " << bluefs_ratio
4715 << " > max_ratio " << cct
->_conf
->bluestore_bluefs_max_ratio
4716 << ", should reclaim " << pretty_si_t(reclaim
) << dendl
;
4718 if (bluefs_total
< cct
->_conf
->bluestore_bluefs_min
&&
4719 cct
->_conf
->bluestore_bluefs_min
<
4720 (uint64_t)(cct
->_conf
->bluestore_bluefs_max_ratio
* total_free
)) {
4721 uint64_t g
= cct
->_conf
->bluestore_bluefs_min
- bluefs_total
;
4722 dout(10) << __func__
<< " bluefs_total " << bluefs_total
4723 << " < min " << cct
->_conf
->bluestore_bluefs_min
4724 << ", should gift " << pretty_si_t(g
) << dendl
;
4731 // round up to alloc size
4732 gift
= P2ROUNDUP(gift
, cct
->_conf
->bluefs_alloc_size
);
4734 // hard cap to fit into 32 bits
4735 gift
= MIN(gift
, 1ull<<31);
4736 dout(10) << __func__
<< " gifting " << gift
4737 << " (" << pretty_si_t(gift
) << ")" << dendl
;
4739 // fixme: just do one allocation to start...
4740 int r
= alloc
->reserve(gift
);
4743 AllocExtentVector exts
;
4744 int64_t alloc_len
= alloc
->allocate(gift
, cct
->_conf
->bluefs_alloc_size
,
4747 if (alloc_len
< (int64_t)gift
) {
4748 derr
<< __func__
<< " allocate failed on 0x" << std::hex
<< gift
4749 << " min_alloc_size 0x" << min_alloc_size
<< std::dec
<< dendl
;
4751 assert(0 == "allocate failed, wtf");
4754 for (auto& p
: exts
) {
4755 bluestore_pextent_t e
= bluestore_pextent_t(p
);
4756 dout(1) << __func__
<< " gifting " << e
<< " to bluefs" << dendl
;
4757 extents
->push_back(e
);
4764 // reclaim from bluefs?
4766 // round up to alloc size
4767 reclaim
= P2ROUNDUP(reclaim
, cct
->_conf
->bluefs_alloc_size
);
4769 // hard cap to fit into 32 bits
4770 reclaim
= MIN(reclaim
, 1ull<<31);
4771 dout(10) << __func__
<< " reclaiming " << reclaim
4772 << " (" << pretty_si_t(reclaim
) << ")" << dendl
;
4774 while (reclaim
> 0) {
4775 // NOTE: this will block and do IO.
4776 AllocExtentVector extents
;
4777 int r
= bluefs
->reclaim_blocks(bluefs_shared_bdev
, reclaim
,
4780 derr
<< __func__
<< " failed to reclaim space from bluefs"
4784 for (auto e
: extents
) {
4785 bluefs_extents
.erase(e
.offset
, e
.length
);
4786 bluefs_extents_reclaiming
.insert(e
.offset
, e
.length
);
4787 reclaim
-= e
.length
;
4797 void BlueStore::_commit_bluefs_freespace(
4798 const PExtentVector
& bluefs_gift_extents
)
4800 dout(10) << __func__
<< dendl
;
4801 for (auto& p
: bluefs_gift_extents
) {
4802 bluefs
->add_block_extent(bluefs_shared_bdev
, p
.offset
, p
.length
);
4806 int BlueStore::_open_collections(int *errors
)
4808 assert(coll_map
.empty());
4809 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_COLL
);
4810 for (it
->upper_bound(string());
4814 if (cid
.parse(it
->key())) {
4818 cache_shards
[cid
.hash_to_shard(cache_shards
.size())],
4820 bufferlist bl
= it
->value();
4821 bufferlist::iterator p
= bl
.begin();
4823 ::decode(c
->cnode
, p
);
4824 } catch (buffer::error
& e
) {
4825 derr
<< __func__
<< " failed to decode cnode, key:"
4826 << pretty_binary_string(it
->key()) << dendl
;
4829 dout(20) << __func__
<< " opened " << cid
<< " " << c
<< dendl
;
4832 derr
<< __func__
<< " unrecognized collection " << it
->key() << dendl
;
4840 void BlueStore::open_statfs()
4843 int r
= db
->get(PREFIX_STAT
, "bluestore_statfs", &bl
);
4845 if (size_t(bl
.length()) >= sizeof(vstatfs
.values
)) {
4846 auto it
= bl
.begin();
4850 dout(10) << __func__
<< " store_statfs is corrupt, using empty" << dendl
;
4854 dout(10) << __func__
<< " store_statfs missed, using empty" << dendl
;
4858 int BlueStore::_setup_block_symlink_or_file(
4864 dout(20) << __func__
<< " name " << name
<< " path " << epath
4865 << " size " << size
<< " create=" << (int)create
<< dendl
;
4870 if (epath
.length()) {
4871 r
= ::symlinkat(epath
.c_str(), path_fd
, name
.c_str());
4874 derr
<< __func__
<< " failed to create " << name
<< " symlink to "
4875 << epath
<< ": " << cpp_strerror(r
) << dendl
;
4879 if (!epath
.compare(0, strlen(SPDK_PREFIX
), SPDK_PREFIX
)) {
4880 int fd
= ::openat(path_fd
, epath
.c_str(), flags
, 0644);
4883 derr
<< __func__
<< " failed to open " << epath
<< " file: "
4884 << cpp_strerror(r
) << dendl
;
4887 string serial_number
= epath
.substr(strlen(SPDK_PREFIX
));
4888 r
= ::write(fd
, serial_number
.c_str(), serial_number
.size());
4889 assert(r
== (int)serial_number
.size());
4890 dout(1) << __func__
<< " created " << name
<< " symlink to "
4892 VOID_TEMP_FAILURE_RETRY(::close(fd
));
4896 int fd
= ::openat(path_fd
, name
.c_str(), flags
, 0644);
4898 // block file is present
4900 int r
= ::fstat(fd
, &st
);
4902 S_ISREG(st
.st_mode
) && // if it is a regular file
4903 st
.st_size
== 0) { // and is 0 bytes
4904 r
= ::ftruncate(fd
, size
);
4907 derr
<< __func__
<< " failed to resize " << name
<< " file to "
4908 << size
<< ": " << cpp_strerror(r
) << dendl
;
4909 VOID_TEMP_FAILURE_RETRY(::close(fd
));
4913 if (cct
->_conf
->bluestore_block_preallocate_file
) {
4914 #ifdef HAVE_POSIX_FALLOCATE
4915 r
= ::posix_fallocate(fd
, 0, size
);
4917 derr
<< __func__
<< " failed to prefallocate " << name
<< " file to "
4918 << size
<< ": " << cpp_strerror(r
) << dendl
;
4919 VOID_TEMP_FAILURE_RETRY(::close(fd
));
4923 char data
[1024*128];
4924 for (uint64_t off
= 0; off
< size
; off
+= sizeof(data
)) {
4925 if (off
+ sizeof(data
) > size
)
4926 r
= ::write(fd
, data
, size
- off
);
4928 r
= ::write(fd
, data
, sizeof(data
));
4931 derr
<< __func__
<< " failed to prefallocate w/ write " << name
<< " file to "
4932 << size
<< ": " << cpp_strerror(r
) << dendl
;
4933 VOID_TEMP_FAILURE_RETRY(::close(fd
));
4939 dout(1) << __func__
<< " resized " << name
<< " file to "
4940 << pretty_si_t(size
) << "B" << dendl
;
4942 VOID_TEMP_FAILURE_RETRY(::close(fd
));
4946 derr
<< __func__
<< " failed to open " << name
<< " file: "
4947 << cpp_strerror(r
) << dendl
;
4955 int BlueStore::mkfs()
4957 dout(1) << __func__
<< " path " << path
<< dendl
;
4963 r
= read_meta("mkfs_done", &done
);
4965 dout(1) << __func__
<< " already created" << dendl
;
4966 if (cct
->_conf
->bluestore_fsck_on_mkfs
) {
4967 r
= fsck(cct
->_conf
->bluestore_fsck_on_mkfs_deep
);
4969 derr
<< __func__
<< " fsck found fatal error: " << cpp_strerror(r
)
4974 derr
<< __func__
<< " fsck found " << r
<< " errors" << dendl
;
4978 return r
; // idempotent
4984 r
= read_meta("type", &type
);
4986 if (type
!= "bluestore") {
4987 derr
<< __func__
<< " expected bluestore, but type is " << type
<< dendl
;
4991 r
= write_meta("type", "bluestore");
4997 freelist_type
= "bitmap";
5003 r
= _open_fsid(true);
5009 goto out_close_fsid
;
5011 r
= _read_fsid(&old_fsid
);
5012 if (r
< 0 || old_fsid
.is_zero()) {
5013 if (fsid
.is_zero()) {
5014 fsid
.generate_random();
5015 dout(1) << __func__
<< " generated fsid " << fsid
<< dendl
;
5017 dout(1) << __func__
<< " using provided fsid " << fsid
<< dendl
;
5019 // we'll write it later.
5021 if (!fsid
.is_zero() && fsid
!= old_fsid
) {
5022 derr
<< __func__
<< " on-disk fsid " << old_fsid
5023 << " != provided " << fsid
<< dendl
;
5025 goto out_close_fsid
;
5030 r
= _setup_block_symlink_or_file("block", cct
->_conf
->bluestore_block_path
,
5031 cct
->_conf
->bluestore_block_size
,
5032 cct
->_conf
->bluestore_block_create
);
5034 goto out_close_fsid
;
5035 if (cct
->_conf
->bluestore_bluefs
) {
5036 r
= _setup_block_symlink_or_file("block.wal", cct
->_conf
->bluestore_block_wal_path
,
5037 cct
->_conf
->bluestore_block_wal_size
,
5038 cct
->_conf
->bluestore_block_wal_create
);
5040 goto out_close_fsid
;
5041 r
= _setup_block_symlink_or_file("block.db", cct
->_conf
->bluestore_block_db_path
,
5042 cct
->_conf
->bluestore_block_db_size
,
5043 cct
->_conf
->bluestore_block_db_create
);
5045 goto out_close_fsid
;
5048 r
= _open_bdev(true);
5050 goto out_close_fsid
;
5054 goto out_close_bdev
;
5061 KeyValueDB::Transaction t
= db
->get_transaction();
5064 ::encode((uint64_t)0, bl
);
5065 t
->set(PREFIX_SUPER
, "nid_max", bl
);
5066 t
->set(PREFIX_SUPER
, "blobid_max", bl
);
5069 // choose min_alloc_size
5070 if (cct
->_conf
->bluestore_min_alloc_size
) {
5071 min_alloc_size
= cct
->_conf
->bluestore_min_alloc_size
;
5074 if (bdev
->is_rotational()) {
5075 min_alloc_size
= cct
->_conf
->bluestore_min_alloc_size_hdd
;
5077 min_alloc_size
= cct
->_conf
->bluestore_min_alloc_size_ssd
;
5083 ::encode((uint64_t)min_alloc_size
, bl
);
5084 t
->set(PREFIX_SUPER
, "min_alloc_size", bl
);
5087 ondisk_format
= latest_ondisk_format
;
5088 _prepare_ondisk_format_super(t
);
5089 db
->submit_transaction_sync(t
);
5096 r
= write_meta("kv_backend", cct
->_conf
->bluestore_kvbackend
);
5098 goto out_close_alloc
;
5099 r
= write_meta("bluefs", stringify((int)cct
->_conf
->bluestore_bluefs
));
5101 goto out_close_alloc
;
5103 if (fsid
!= old_fsid
) {
5106 derr
<< __func__
<< " error writing fsid: " << cpp_strerror(r
) << dendl
;
5107 goto out_close_alloc
;
5125 cct
->_conf
->bluestore_fsck_on_mkfs
) {
5126 int rc
= fsck(cct
->_conf
->bluestore_fsck_on_mkfs_deep
);
5130 derr
<< __func__
<< " fsck found " << rc
<< " errors" << dendl
;
5136 // indicate success by writing the 'mkfs_done' file
5137 r
= write_meta("mkfs_done", "yes");
5141 derr
<< __func__
<< " failed, " << cpp_strerror(r
) << dendl
;
5143 dout(0) << __func__
<< " success" << dendl
;
5148 void BlueStore::set_cache_shards(unsigned num
)
5150 dout(10) << __func__
<< " " << num
<< dendl
;
5151 size_t old
= cache_shards
.size();
5153 cache_shards
.resize(num
);
5154 for (unsigned i
= old
; i
< num
; ++i
) {
5155 cache_shards
[i
] = Cache::create(cct
, cct
->_conf
->bluestore_cache_type
,
5160 int BlueStore::_mount(bool kv_only
)
5162 dout(1) << __func__
<< " path " << path
<< dendl
;
5166 int r
= read_meta("type", &type
);
5168 derr
<< __func__
<< " failed to load os-type: " << cpp_strerror(r
)
5173 if (type
!= "bluestore") {
5174 derr
<< __func__
<< " expected bluestore, but type is " << type
<< dendl
;
5179 if (cct
->_conf
->bluestore_fsck_on_mount
) {
5180 int rc
= fsck(cct
->_conf
->bluestore_fsck_on_mount_deep
);
5184 derr
<< __func__
<< " fsck found " << rc
<< " errors" << dendl
;
5189 int r
= _open_path();
5192 r
= _open_fsid(false);
5196 r
= _read_fsid(&fsid
);
5204 r
= _open_bdev(false);
5208 r
= _open_db(false);
5215 r
= _open_super_meta();
5219 r
= _open_fm(false);
5227 r
= _open_collections();
5231 r
= _reload_logger();
5236 r
= _reconcile_bluefs_freespace();
5243 r
= _deferred_replay();
5247 mempool_thread
.init();
5272 int BlueStore::umount()
5275 dout(1) << __func__
<< dendl
;
5278 _osr_unregister_all();
5280 mempool_thread
.shutdown();
5282 dout(20) << __func__
<< " stopping kv thread" << dendl
;
5284 _reap_collections();
5286 dout(20) << __func__
<< " closing" << dendl
;
5296 if (cct
->_conf
->bluestore_fsck_on_umount
) {
5297 int rc
= fsck(cct
->_conf
->bluestore_fsck_on_umount_deep
);
5301 derr
<< __func__
<< " fsck found " << rc
<< " errors" << dendl
;
5308 static void apply(uint64_t off
,
5310 uint64_t granularity
,
5311 BlueStore::mempool_dynamic_bitset
&bitset
,
5313 std::function
<void(uint64_t,
5314 BlueStore::mempool_dynamic_bitset
&)> f
) {
5315 auto end
= ROUND_UP_TO(off
+ len
, granularity
);
5317 uint64_t pos
= off
/ granularity
;
5323 int BlueStore::_fsck_check_extents(
5324 const ghobject_t
& oid
,
5325 const PExtentVector
& extents
,
5327 mempool_dynamic_bitset
&used_blocks
,
5328 store_statfs_t
& expected_statfs
)
5330 dout(30) << __func__
<< " oid " << oid
<< " extents " << extents
<< dendl
;
5332 for (auto e
: extents
) {
5335 expected_statfs
.allocated
+= e
.length
;
5337 expected_statfs
.compressed_allocated
+= e
.length
;
5339 bool already
= false;
5341 e
.offset
, e
.length
, block_size
, used_blocks
, __func__
,
5342 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
5349 derr
<< " " << oid
<< " extent " << e
5350 << " or a subset is already allocated" << dendl
;
5353 if (e
.end() > bdev
->get_size()) {
5354 derr
<< " " << oid
<< " extent " << e
5355 << " past end of block device" << dendl
;
5362 int BlueStore::fsck(bool deep
)
5364 dout(1) << __func__
<< (deep
? " (deep)" : " (shallow)") << " start" << dendl
;
5367 typedef btree::btree_set
<
5368 uint64_t,std::less
<uint64_t>,
5369 mempool::bluestore_fsck::pool_allocator
<uint64_t>> uint64_t_btree_t
;
5370 uint64_t_btree_t used_nids
;
5371 uint64_t_btree_t used_omap_head
;
5372 uint64_t_btree_t used_sbids
;
5374 mempool_dynamic_bitset used_blocks
;
5375 KeyValueDB::Iterator it
;
5376 store_statfs_t expected_statfs
, actual_statfs
;
5378 list
<ghobject_t
> oids
;
5380 bluestore_extent_ref_map_t ref_map
;
5383 mempool::bluestore_fsck::map
<uint64_t,sb_info_t
> sb_info
;
5385 uint64_t num_objects
= 0;
5386 uint64_t num_extents
= 0;
5387 uint64_t num_blobs
= 0;
5388 uint64_t num_spanning_blobs
= 0;
5389 uint64_t num_shared_blobs
= 0;
5390 uint64_t num_sharded_objects
= 0;
5391 uint64_t num_object_shards
= 0;
5393 utime_t start
= ceph_clock_now();
5395 int r
= _open_path();
5398 r
= _open_fsid(false);
5402 r
= _read_fsid(&fsid
);
5410 r
= _open_bdev(false);
5414 r
= _open_db(false);
5418 r
= _open_super_meta();
5422 r
= _open_fm(false);
5430 r
= _open_collections(&errors
);
5434 mempool_thread
.init();
5436 // we need finishers and kv_{sync,finalize}_thread *just* for replay
5438 r
= _deferred_replay();
5443 used_blocks
.resize(bdev
->get_size() / block_size
);
5445 0, SUPER_RESERVED
, block_size
, used_blocks
, "0~SUPER_RESERVED",
5446 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
5452 for (auto e
= bluefs_extents
.begin(); e
!= bluefs_extents
.end(); ++e
) {
5454 e
.get_start(), e
.get_len(), block_size
, used_blocks
, "bluefs",
5455 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
5468 // get expected statfs; fill unaffected fields to be able to compare
5470 statfs(&actual_statfs
);
5471 expected_statfs
.total
= actual_statfs
.total
;
5472 expected_statfs
.available
= actual_statfs
.available
;
5475 dout(1) << __func__
<< " walking object keyspace" << dendl
;
5476 it
= db
->get_iterator(PREFIX_OBJ
);
5480 mempool::bluestore_fsck::list
<string
> expecting_shards
;
5481 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
5482 if (g_conf
->bluestore_debug_fsck_abort
) {
5485 dout(30) << " key " << pretty_binary_string(it
->key()) << dendl
;
5486 if (is_extent_shard_key(it
->key())) {
5487 while (!expecting_shards
.empty() &&
5488 expecting_shards
.front() < it
->key()) {
5489 derr
<< __func__
<< " error: missing shard key "
5490 << pretty_binary_string(expecting_shards
.front())
5493 expecting_shards
.pop_front();
5495 if (!expecting_shards
.empty() &&
5496 expecting_shards
.front() == it
->key()) {
5498 expecting_shards
.pop_front();
5504 get_key_extent_shard(it
->key(), &okey
, &offset
);
5505 derr
<< __func__
<< " error: stray shard 0x" << std::hex
<< offset
5506 << std::dec
<< dendl
;
5507 if (expecting_shards
.empty()) {
5508 derr
<< __func__
<< " error: " << pretty_binary_string(it
->key())
5509 << " is unexpected" << dendl
;
5513 while (expecting_shards
.front() > it
->key()) {
5514 derr
<< __func__
<< " error: saw " << pretty_binary_string(it
->key())
5516 derr
<< __func__
<< " error: exp "
5517 << pretty_binary_string(expecting_shards
.front()) << dendl
;
5519 expecting_shards
.pop_front();
5520 if (expecting_shards
.empty()) {
5528 int r
= get_key_object(it
->key(), &oid
);
5530 derr
<< __func__
<< " error: bad object key "
5531 << pretty_binary_string(it
->key()) << dendl
;
5536 oid
.shard_id
!= pgid
.shard
||
5537 oid
.hobj
.pool
!= (int64_t)pgid
.pool() ||
5538 !c
->contains(oid
)) {
5540 for (ceph::unordered_map
<coll_t
, CollectionRef
>::iterator p
=
5542 p
!= coll_map
.end();
5544 if (p
->second
->contains(oid
)) {
5550 derr
<< __func__
<< " error: stray object " << oid
5551 << " not owned by any collection" << dendl
;
5555 c
->cid
.is_pg(&pgid
);
5556 dout(20) << __func__
<< " collection " << c
->cid
<< dendl
;
5559 if (!expecting_shards
.empty()) {
5560 for (auto &k
: expecting_shards
) {
5561 derr
<< __func__
<< " error: missing shard key "
5562 << pretty_binary_string(k
) << dendl
;
5565 expecting_shards
.clear();
5568 dout(10) << __func__
<< " " << oid
<< dendl
;
5569 RWLock::RLocker
l(c
->lock
);
5570 OnodeRef o
= c
->get_onode(oid
, false);
5572 if (o
->onode
.nid
> nid_max
) {
5573 derr
<< __func__
<< " error: " << oid
<< " nid " << o
->onode
.nid
5574 << " > nid_max " << nid_max
<< dendl
;
5577 if (used_nids
.count(o
->onode
.nid
)) {
5578 derr
<< __func__
<< " error: " << oid
<< " nid " << o
->onode
.nid
5579 << " already in use" << dendl
;
5581 continue; // go for next object
5583 used_nids
.insert(o
->onode
.nid
);
5586 num_spanning_blobs
+= o
->extent_map
.spanning_blob_map
.size();
5587 o
->extent_map
.fault_range(db
, 0, OBJECT_MAX_SIZE
);
5590 if (!o
->extent_map
.shards
.empty()) {
5591 ++num_sharded_objects
;
5592 num_object_shards
+= o
->extent_map
.shards
.size();
5594 for (auto& s
: o
->extent_map
.shards
) {
5595 dout(20) << __func__
<< " shard " << *s
.shard_info
<< dendl
;
5596 expecting_shards
.push_back(string());
5597 get_extent_shard_key(o
->key
, s
.shard_info
->offset
,
5598 &expecting_shards
.back());
5599 if (s
.shard_info
->offset
>= o
->onode
.size
) {
5600 derr
<< __func__
<< " error: " << oid
<< " shard 0x" << std::hex
5601 << s
.shard_info
->offset
<< " past EOF at 0x" << o
->onode
.size
5602 << std::dec
<< dendl
;
5607 map
<BlobRef
,bluestore_blob_t::unused_t
> referenced
;
5609 mempool::bluestore_fsck::map
<BlobRef
,
5610 bluestore_blob_use_tracker_t
> ref_map
;
5611 for (auto& l
: o
->extent_map
.extent_map
) {
5612 dout(20) << __func__
<< " " << l
<< dendl
;
5613 if (l
.logical_offset
< pos
) {
5614 derr
<< __func__
<< " error: " << oid
<< " lextent at 0x"
5615 << std::hex
<< l
.logical_offset
5616 << " overlaps with the previous, which ends at 0x" << pos
5617 << std::dec
<< dendl
;
5620 if (o
->extent_map
.spans_shard(l
.logical_offset
, l
.length
)) {
5621 derr
<< __func__
<< " error: " << oid
<< " lextent at 0x"
5622 << std::hex
<< l
.logical_offset
<< "~" << l
.length
5623 << " spans a shard boundary"
5624 << std::dec
<< dendl
;
5627 pos
= l
.logical_offset
+ l
.length
;
5628 expected_statfs
.stored
+= l
.length
;
5630 const bluestore_blob_t
& blob
= l
.blob
->get_blob();
5632 auto& ref
= ref_map
[l
.blob
];
5633 if (ref
.is_empty()) {
5634 uint32_t min_release_size
= blob
.get_release_size(min_alloc_size
);
5635 uint32_t l
= blob
.get_logical_length();
5636 ref
.init(l
, min_release_size
);
5642 if (blob
.has_unused()) {
5643 auto p
= referenced
.find(l
.blob
);
5644 bluestore_blob_t::unused_t
*pu
;
5645 if (p
== referenced
.end()) {
5646 pu
= &referenced
[l
.blob
];
5650 uint64_t blob_len
= blob
.get_logical_length();
5651 assert((blob_len
% (sizeof(*pu
)*8)) == 0);
5652 assert(l
.blob_offset
+ l
.length
<= blob_len
);
5653 uint64_t chunk_size
= blob_len
/ (sizeof(*pu
)*8);
5654 uint64_t start
= l
.blob_offset
/ chunk_size
;
5656 ROUND_UP_TO(l
.blob_offset
+ l
.length
, chunk_size
) / chunk_size
;
5657 for (auto i
= start
; i
< end
; ++i
) {
5662 for (auto &i
: referenced
) {
5663 dout(20) << __func__
<< " referenced 0x" << std::hex
<< i
.second
5664 << std::dec
<< " for " << *i
.first
<< dendl
;
5665 const bluestore_blob_t
& blob
= i
.first
->get_blob();
5666 if (i
.second
& blob
.unused
) {
5667 derr
<< __func__
<< " error: " << oid
<< " blob claims unused 0x"
5668 << std::hex
<< blob
.unused
5669 << " but extents reference 0x" << i
.second
5670 << " on blob " << *i
.first
<< dendl
;
5673 if (blob
.has_csum()) {
5674 uint64_t blob_len
= blob
.get_logical_length();
5675 uint64_t unused_chunk_size
= blob_len
/ (sizeof(blob
.unused
)*8);
5676 unsigned csum_count
= blob
.get_csum_count();
5677 unsigned csum_chunk_size
= blob
.get_csum_chunk_size();
5678 for (unsigned p
= 0; p
< csum_count
; ++p
) {
5679 unsigned pos
= p
* csum_chunk_size
;
5680 unsigned firstbit
= pos
/ unused_chunk_size
; // [firstbit,lastbit]
5681 unsigned lastbit
= (pos
+ csum_chunk_size
- 1) / unused_chunk_size
;
5682 unsigned mask
= 1u << firstbit
;
5683 for (unsigned b
= firstbit
+ 1; b
<= lastbit
; ++b
) {
5686 if ((blob
.unused
& mask
) == mask
) {
5687 // this csum chunk region is marked unused
5688 if (blob
.get_csum_item(p
) != 0) {
5689 derr
<< __func__
<< " error: " << oid
5690 << " blob claims csum chunk 0x" << std::hex
<< pos
5691 << "~" << csum_chunk_size
5692 << " is unused (mask 0x" << mask
<< " of unused 0x"
5693 << blob
.unused
<< ") but csum is non-zero 0x"
5694 << blob
.get_csum_item(p
) << std::dec
<< " on blob "
5695 << *i
.first
<< dendl
;
5702 for (auto &i
: ref_map
) {
5704 const bluestore_blob_t
& blob
= i
.first
->get_blob();
5705 bool equal
= i
.first
->get_blob_use_tracker().equal(i
.second
);
5707 derr
<< __func__
<< " error: " << oid
<< " blob " << *i
.first
5708 << " doesn't match expected ref_map " << i
.second
<< dendl
;
5711 if (blob
.is_compressed()) {
5712 expected_statfs
.compressed
+= blob
.get_compressed_payload_length();
5713 expected_statfs
.compressed_original
+=
5714 i
.first
->get_referenced_bytes();
5716 if (blob
.is_shared()) {
5717 if (i
.first
->shared_blob
->get_sbid() > blobid_max
) {
5718 derr
<< __func__
<< " error: " << oid
<< " blob " << blob
5719 << " sbid " << i
.first
->shared_blob
->get_sbid() << " > blobid_max "
5720 << blobid_max
<< dendl
;
5722 } else if (i
.first
->shared_blob
->get_sbid() == 0) {
5723 derr
<< __func__
<< " error: " << oid
<< " blob " << blob
5724 << " marked as shared but has uninitialized sbid"
5728 sb_info_t
& sbi
= sb_info
[i
.first
->shared_blob
->get_sbid()];
5729 sbi
.sb
= i
.first
->shared_blob
;
5730 sbi
.oids
.push_back(oid
);
5731 sbi
.compressed
= blob
.is_compressed();
5732 for (auto e
: blob
.get_extents()) {
5734 sbi
.ref_map
.get(e
.offset
, e
.length
);
5738 errors
+= _fsck_check_extents(oid
, blob
.get_extents(),
5739 blob
.is_compressed(),
5746 int r
= _do_read(c
.get(), o
, 0, o
->onode
.size
, bl
, 0);
5749 derr
<< __func__
<< " error: " << oid
<< " error during read: "
5750 << cpp_strerror(r
) << dendl
;
5754 if (o
->onode
.has_omap()) {
5755 if (used_omap_head
.count(o
->onode
.nid
)) {
5756 derr
<< __func__
<< " error: " << oid
<< " omap_head " << o
->onode
.nid
5757 << " already in use" << dendl
;
5760 used_omap_head
.insert(o
->onode
.nid
);
5765 dout(1) << __func__
<< " checking shared_blobs" << dendl
;
5766 it
= db
->get_iterator(PREFIX_SHARED_BLOB
);
5768 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
5769 string key
= it
->key();
5771 if (get_key_shared_blob(key
, &sbid
)) {
5772 derr
<< __func__
<< " error: bad key '" << key
5773 << "' in shared blob namespace" << dendl
;
5777 auto p
= sb_info
.find(sbid
);
5778 if (p
== sb_info
.end()) {
5779 derr
<< __func__
<< " error: found stray shared blob data for sbid 0x"
5780 << std::hex
<< sbid
<< std::dec
<< dendl
;
5784 sb_info_t
& sbi
= p
->second
;
5785 bluestore_shared_blob_t
shared_blob(sbid
);
5786 bufferlist bl
= it
->value();
5787 bufferlist::iterator blp
= bl
.begin();
5788 ::decode(shared_blob
, blp
);
5789 dout(20) << __func__
<< " " << *sbi
.sb
<< " " << shared_blob
<< dendl
;
5790 if (shared_blob
.ref_map
!= sbi
.ref_map
) {
5791 derr
<< __func__
<< " error: shared blob 0x" << std::hex
<< sbid
5792 << std::dec
<< " ref_map " << shared_blob
.ref_map
5793 << " != expected " << sbi
.ref_map
<< dendl
;
5796 PExtentVector extents
;
5797 for (auto &r
: shared_blob
.ref_map
.ref_map
) {
5798 extents
.emplace_back(bluestore_pextent_t(r
.first
, r
.second
.length
));
5800 errors
+= _fsck_check_extents(p
->second
.oids
.front(),
5802 p
->second
.compressed
,
5803 used_blocks
, expected_statfs
);
5808 for (auto &p
: sb_info
) {
5809 derr
<< __func__
<< " error: shared_blob 0x" << p
.first
5810 << " key is missing (" << *p
.second
.sb
<< ")" << dendl
;
5813 if (!(actual_statfs
== expected_statfs
)) {
5814 derr
<< __func__
<< " error: actual " << actual_statfs
5815 << " != expected " << expected_statfs
<< dendl
;
5819 dout(1) << __func__
<< " checking for stray omap data" << dendl
;
5820 it
= db
->get_iterator(PREFIX_OMAP
);
5822 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
5824 _key_decode_u64(it
->key().c_str(), &omap_head
);
5825 if (used_omap_head
.count(omap_head
) == 0) {
5826 derr
<< __func__
<< " error: found stray omap data on omap_head "
5827 << omap_head
<< dendl
;
5833 dout(1) << __func__
<< " checking deferred events" << dendl
;
5834 it
= db
->get_iterator(PREFIX_DEFERRED
);
5836 for (it
->lower_bound(string()); it
->valid(); it
->next()) {
5837 bufferlist bl
= it
->value();
5838 bufferlist::iterator p
= bl
.begin();
5839 bluestore_deferred_transaction_t wt
;
5842 } catch (buffer::error
& e
) {
5843 derr
<< __func__
<< " error: failed to decode deferred txn "
5844 << pretty_binary_string(it
->key()) << dendl
;
5848 dout(20) << __func__
<< " deferred " << wt
.seq
5849 << " ops " << wt
.ops
.size()
5850 << " released 0x" << std::hex
<< wt
.released
<< std::dec
<< dendl
;
5851 for (auto e
= wt
.released
.begin(); e
!= wt
.released
.end(); ++e
) {
5853 e
.get_start(), e
.get_len(), block_size
, used_blocks
, "deferred",
5854 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
5862 dout(1) << __func__
<< " checking freelist vs allocated" << dendl
;
5864 // remove bluefs_extents from used set since the freelist doesn't
5865 // know they are allocated.
5866 for (auto e
= bluefs_extents
.begin(); e
!= bluefs_extents
.end(); ++e
) {
5868 e
.get_start(), e
.get_len(), block_size
, used_blocks
, "bluefs_extents",
5869 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
5874 fm
->enumerate_reset();
5875 uint64_t offset
, length
;
5876 while (fm
->enumerate_next(&offset
, &length
)) {
5877 bool intersects
= false;
5879 offset
, length
, block_size
, used_blocks
, "free",
5880 [&](uint64_t pos
, mempool_dynamic_bitset
&bs
) {
5889 derr
<< __func__
<< " error: free extent 0x" << std::hex
<< offset
5890 << "~" << length
<< std::dec
5891 << " intersects allocated blocks" << dendl
;
5895 size_t count
= used_blocks
.count();
5896 if (used_blocks
.size() != count
) {
5897 assert(used_blocks
.size() > count
);
5898 derr
<< __func__
<< " error: leaked some space;"
5899 << (used_blocks
.size() - count
) * min_alloc_size
5900 << " bytes leaked" << dendl
;
5906 mempool_thread
.shutdown();
5913 it
.reset(); // before db is closed
5922 // fatal errors take precedence
5926 dout(2) << __func__
<< " " << num_objects
<< " objects, "
5927 << num_sharded_objects
<< " of them sharded. "
5929 dout(2) << __func__
<< " " << num_extents
<< " extents to "
5930 << num_blobs
<< " blobs, "
5931 << num_spanning_blobs
<< " spanning, "
5932 << num_shared_blobs
<< " shared."
5935 utime_t duration
= ceph_clock_now() - start
;
5936 dout(1) << __func__
<< " finish with " << errors
<< " errors in "
5937 << duration
<< " seconds" << dendl
;
5941 void BlueStore::collect_metadata(map
<string
,string
> *pm
)
5943 dout(10) << __func__
<< dendl
;
5944 bdev
->collect_metadata("bluestore_bdev_", pm
);
5946 (*pm
)["bluefs"] = "1";
5947 (*pm
)["bluefs_single_shared_device"] = stringify((int)bluefs_single_shared_device
);
5948 bluefs
->collect_metadata(pm
);
5950 (*pm
)["bluefs"] = "0";
5954 int BlueStore::statfs(struct store_statfs_t
*buf
)
5957 buf
->total
= bdev
->get_size();
5958 buf
->available
= alloc
->get_free();
5961 // part of our shared device is "free" according to BlueFS
5962 // Don't include bluestore_bluefs_min because that space can't
5963 // be used for any other purpose.
5964 buf
->available
+= bluefs
->get_free(bluefs_shared_bdev
) - cct
->_conf
->bluestore_bluefs_min
;
5966 // include dedicated db, too, if that isn't the shared device.
5967 if (bluefs_shared_bdev
!= BlueFS::BDEV_DB
) {
5968 buf
->total
+= bluefs
->get_total(BlueFS::BDEV_DB
);
5973 std::lock_guard
<std::mutex
> l(vstatfs_lock
);
5975 buf
->allocated
= vstatfs
.allocated();
5976 buf
->stored
= vstatfs
.stored();
5977 buf
->compressed
= vstatfs
.compressed();
5978 buf
->compressed_original
= vstatfs
.compressed_original();
5979 buf
->compressed_allocated
= vstatfs
.compressed_allocated();
5982 dout(20) << __func__
<< *buf
<< dendl
;
5989 BlueStore::CollectionRef
BlueStore::_get_collection(const coll_t
& cid
)
5991 RWLock::RLocker
l(coll_lock
);
5992 ceph::unordered_map
<coll_t
,CollectionRef
>::iterator cp
= coll_map
.find(cid
);
5993 if (cp
== coll_map
.end())
5994 return CollectionRef();
5998 void BlueStore::_queue_reap_collection(CollectionRef
& c
)
6000 dout(10) << __func__
<< " " << c
<< " " << c
->cid
<< dendl
;
6001 std::lock_guard
<std::mutex
> l(reap_lock
);
6002 removed_collections
.push_back(c
);
6005 void BlueStore::_reap_collections()
6007 list
<CollectionRef
> removed_colls
;
6009 std::lock_guard
<std::mutex
> l(reap_lock
);
6010 removed_colls
.swap(removed_collections
);
6013 bool all_reaped
= true;
6015 for (list
<CollectionRef
>::iterator p
= removed_colls
.begin();
6016 p
!= removed_colls
.end();
6018 CollectionRef c
= *p
;
6019 dout(10) << __func__
<< " " << c
<< " " << c
->cid
<< dendl
;
6020 if (c
->onode_map
.map_any([&](OnodeRef o
) {
6022 if (o
->flushing_count
.load()) {
6023 dout(10) << __func__
<< " " << c
<< " " << c
->cid
<< " " << o
->oid
6024 << " flush_txns " << o
->flushing_count
<< dendl
;
6032 c
->onode_map
.clear();
6033 dout(10) << __func__
<< " " << c
<< " " << c
->cid
<< " done" << dendl
;
6037 dout(10) << __func__
<< " all reaped" << dendl
;
6041 void BlueStore::_update_cache_logger()
6043 uint64_t num_onodes
= 0;
6044 uint64_t num_extents
= 0;
6045 uint64_t num_blobs
= 0;
6046 uint64_t num_buffers
= 0;
6047 uint64_t num_buffer_bytes
= 0;
6048 for (auto c
: cache_shards
) {
6049 c
->add_stats(&num_onodes
, &num_extents
, &num_blobs
,
6050 &num_buffers
, &num_buffer_bytes
);
6052 logger
->set(l_bluestore_onodes
, num_onodes
);
6053 logger
->set(l_bluestore_extents
, num_extents
);
6054 logger
->set(l_bluestore_blobs
, num_blobs
);
6055 logger
->set(l_bluestore_buffers
, num_buffers
);
6056 logger
->set(l_bluestore_buffer_bytes
, num_buffer_bytes
);
6062 ObjectStore::CollectionHandle
BlueStore::open_collection(const coll_t
& cid
)
6064 return _get_collection(cid
);
6067 bool BlueStore::exists(const coll_t
& cid
, const ghobject_t
& oid
)
6069 CollectionHandle c
= _get_collection(cid
);
6072 return exists(c
, oid
);
6075 bool BlueStore::exists(CollectionHandle
&c_
, const ghobject_t
& oid
)
6077 Collection
*c
= static_cast<Collection
*>(c_
.get());
6078 dout(10) << __func__
<< " " << c
->cid
<< " " << oid
<< dendl
;
6085 RWLock::RLocker
l(c
->lock
);
6086 OnodeRef o
= c
->get_onode(oid
, false);
6087 if (!o
|| !o
->exists
)
6094 int BlueStore::stat(
6096 const ghobject_t
& oid
,
6100 CollectionHandle c
= _get_collection(cid
);
6103 return stat(c
, oid
, st
, allow_eio
);
6106 int BlueStore::stat(
6107 CollectionHandle
&c_
,
6108 const ghobject_t
& oid
,
6112 Collection
*c
= static_cast<Collection
*>(c_
.get());
6115 dout(10) << __func__
<< " " << c
->get_cid() << " " << oid
<< dendl
;
6118 RWLock::RLocker
l(c
->lock
);
6119 OnodeRef o
= c
->get_onode(oid
, false);
6120 if (!o
|| !o
->exists
)
6122 st
->st_size
= o
->onode
.size
;
6123 st
->st_blksize
= 4096;
6124 st
->st_blocks
= (st
->st_size
+ st
->st_blksize
- 1) / st
->st_blksize
;
6129 if (_debug_mdata_eio(oid
)) {
6131 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
6135 int BlueStore::set_collection_opts(
6137 const pool_opts_t
& opts
)
6139 CollectionHandle ch
= _get_collection(cid
);
6142 Collection
*c
= static_cast<Collection
*>(ch
.get());
6143 dout(15) << __func__
<< " " << cid
<< " options " << opts
<< dendl
;
6146 RWLock::WLocker
l(c
->lock
);
6147 c
->pool_opts
= opts
;
6151 int BlueStore::read(
6153 const ghobject_t
& oid
,
6160 CollectionHandle c
= _get_collection(cid
);
6163 return read(c
, oid
, offset
, length
, bl
, op_flags
, allow_eio
);
6166 int BlueStore::read(
6167 CollectionHandle
&c_
,
6168 const ghobject_t
& oid
,
6175 utime_t start
= ceph_clock_now();
6176 Collection
*c
= static_cast<Collection
*>(c_
.get());
6177 const coll_t
&cid
= c
->get_cid();
6178 dout(15) << __func__
<< " " << cid
<< " " << oid
6179 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
6187 RWLock::RLocker
l(c
->lock
);
6188 utime_t start1
= ceph_clock_now();
6189 OnodeRef o
= c
->get_onode(oid
, false);
6190 logger
->tinc(l_bluestore_read_onode_meta_lat
, ceph_clock_now() - start1
);
6191 if (!o
|| !o
->exists
) {
6196 if (offset
== length
&& offset
== 0)
6197 length
= o
->onode
.size
;
6199 r
= _do_read(c
, o
, offset
, length
, bl
, op_flags
);
6203 assert(allow_eio
|| r
!= -EIO
);
6204 if (r
== 0 && _debug_data_eio(oid
)) {
6206 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
6208 dout(10) << __func__
<< " " << cid
<< " " << oid
6209 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
6210 << " = " << r
<< dendl
;
6211 logger
->tinc(l_bluestore_read_lat
, ceph_clock_now() - start
);
6215 // --------------------------------------------------------
6216 // intermediate data structures used while reading
6218 uint64_t logical_offset
;
6219 uint64_t blob_xoffset
; //region offset within the blob
6223 // used later in read process
6227 region_t(uint64_t offset
, uint64_t b_offs
, uint64_t len
)
6228 : logical_offset(offset
),
6229 blob_xoffset(b_offs
),
6231 region_t(const region_t
& from
)
6232 : logical_offset(from
.logical_offset
),
6233 blob_xoffset(from
.blob_xoffset
),
6234 length(from
.length
){}
6236 friend ostream
& operator<<(ostream
& out
, const region_t
& r
) {
6237 return out
<< "0x" << std::hex
<< r
.logical_offset
<< ":"
6238 << r
.blob_xoffset
<< "~" << r
.length
<< std::dec
;
6242 typedef list
<region_t
> regions2read_t
;
6243 typedef map
<BlueStore::BlobRef
, regions2read_t
> blobs2read_t
;
6245 int BlueStore::_do_read(
6256 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
6257 << " size 0x" << o
->onode
.size
<< " (" << std::dec
6258 << o
->onode
.size
<< ")" << dendl
;
6261 if (offset
>= o
->onode
.size
) {
6265 // generally, don't buffer anything, unless the client explicitly requests
6267 bool buffered
= false;
6268 if (op_flags
& CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) {
6269 dout(20) << __func__
<< " will do buffered read" << dendl
;
6271 } else if (cct
->_conf
->bluestore_default_buffered_read
&&
6272 (op_flags
& (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
|
6273 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
)) == 0) {
6274 dout(20) << __func__
<< " defaulting to buffered read" << dendl
;
6278 if (offset
+ length
> o
->onode
.size
) {
6279 length
= o
->onode
.size
- offset
;
6282 utime_t start
= ceph_clock_now();
6283 o
->extent_map
.fault_range(db
, offset
, length
);
6284 logger
->tinc(l_bluestore_read_onode_meta_lat
, ceph_clock_now() - start
);
6287 ready_regions_t ready_regions
;
6289 // build blob-wise list to of stuff read (that isn't cached)
6290 blobs2read_t blobs2read
;
6291 unsigned left
= length
;
6292 uint64_t pos
= offset
;
6293 unsigned num_regions
= 0;
6294 auto lp
= o
->extent_map
.seek_lextent(offset
);
6295 while (left
> 0 && lp
!= o
->extent_map
.extent_map
.end()) {
6296 if (pos
< lp
->logical_offset
) {
6297 unsigned hole
= lp
->logical_offset
- pos
;
6301 dout(30) << __func__
<< " hole 0x" << std::hex
<< pos
<< "~" << hole
6302 << std::dec
<< dendl
;
6306 BlobRef bptr
= lp
->blob
;
6307 unsigned l_off
= pos
- lp
->logical_offset
;
6308 unsigned b_off
= l_off
+ lp
->blob_offset
;
6309 unsigned b_len
= std::min(left
, lp
->length
- l_off
);
6311 ready_regions_t cache_res
;
6312 interval_set
<uint32_t> cache_interval
;
6313 bptr
->shared_blob
->bc
.read(
6314 bptr
->shared_blob
->get_cache(), b_off
, b_len
, cache_res
, cache_interval
);
6315 dout(20) << __func__
<< " blob " << *bptr
<< std::hex
6316 << " need 0x" << b_off
<< "~" << b_len
6317 << " cache has 0x" << cache_interval
6318 << std::dec
<< dendl
;
6320 auto pc
= cache_res
.begin();
6323 if (pc
!= cache_res
.end() &&
6324 pc
->first
== b_off
) {
6325 l
= pc
->second
.length();
6326 ready_regions
[pos
].claim(pc
->second
);
6327 dout(30) << __func__
<< " use cache 0x" << std::hex
<< pos
<< ": 0x"
6328 << b_off
<< "~" << l
<< std::dec
<< dendl
;
6332 if (pc
!= cache_res
.end()) {
6333 assert(pc
->first
> b_off
);
6334 l
= pc
->first
- b_off
;
6336 dout(30) << __func__
<< " will read 0x" << std::hex
<< pos
<< ": 0x"
6337 << b_off
<< "~" << l
<< std::dec
<< dendl
;
6338 blobs2read
[bptr
].emplace_back(region_t(pos
, b_off
, l
));
6349 // read raw blob data. use aio if we have >1 blobs to read.
6350 start
= ceph_clock_now(); // for the sake of simplicity
6351 // measure the whole block below.
6352 // The error isn't that much...
6353 vector
<bufferlist
> compressed_blob_bls
;
6354 IOContext
ioc(cct
, NULL
);
6355 for (auto& p
: blobs2read
) {
6356 BlobRef bptr
= p
.first
;
6357 dout(20) << __func__
<< " blob " << *bptr
<< std::hex
6358 << " need " << p
.second
<< std::dec
<< dendl
;
6359 if (bptr
->get_blob().is_compressed()) {
6360 // read the whole thing
6361 if (compressed_blob_bls
.empty()) {
6362 // ensure we avoid any reallocation on subsequent blobs
6363 compressed_blob_bls
.reserve(blobs2read
.size());
6365 compressed_blob_bls
.push_back(bufferlist());
6366 bufferlist
& bl
= compressed_blob_bls
.back();
6367 r
= bptr
->get_blob().map(
6368 0, bptr
->get_blob().get_ondisk_length(),
6369 [&](uint64_t offset
, uint64_t length
) {
6371 // use aio if there are more regions to read than those in this blob
6372 if (num_regions
> p
.second
.size()) {
6373 r
= bdev
->aio_read(offset
, length
, &bl
, &ioc
);
6375 r
= bdev
->read(offset
, length
, &bl
, &ioc
, false);
6384 for (auto& reg
: p
.second
) {
6385 // determine how much of the blob to read
6386 uint64_t chunk_size
= bptr
->get_blob().get_chunk_size(block_size
);
6387 reg
.r_off
= reg
.blob_xoffset
;
6388 uint64_t r_len
= reg
.length
;
6389 reg
.front
= reg
.r_off
% chunk_size
;
6391 reg
.r_off
-= reg
.front
;
6394 unsigned tail
= r_len
% chunk_size
;
6396 r_len
+= chunk_size
- tail
;
6398 dout(20) << __func__
<< " region 0x" << std::hex
6399 << reg
.logical_offset
6400 << ": 0x" << reg
.blob_xoffset
<< "~" << reg
.length
6401 << " reading 0x" << reg
.r_off
<< "~" << r_len
<< std::dec
6405 r
= bptr
->get_blob().map(
6407 [&](uint64_t offset
, uint64_t length
) {
6409 // use aio if there is more than one region to read
6410 if (num_regions
> 1) {
6411 r
= bdev
->aio_read(offset
, length
, ®
.bl
, &ioc
);
6413 r
= bdev
->read(offset
, length
, ®
.bl
, &ioc
, false);
6420 assert(reg
.bl
.length() == r_len
);
6424 if (ioc
.has_pending_aios()) {
6425 bdev
->aio_submit(&ioc
);
6426 dout(20) << __func__
<< " waiting for aio" << dendl
;
6429 logger
->tinc(l_bluestore_read_wait_aio_lat
, ceph_clock_now() - start
);
6431 // enumerate and decompress desired blobs
6432 auto p
= compressed_blob_bls
.begin();
6433 blobs2read_t::iterator b2r_it
= blobs2read
.begin();
6434 while (b2r_it
!= blobs2read
.end()) {
6435 BlobRef bptr
= b2r_it
->first
;
6436 dout(20) << __func__
<< " blob " << *bptr
<< std::hex
6437 << " need 0x" << b2r_it
->second
<< std::dec
<< dendl
;
6438 if (bptr
->get_blob().is_compressed()) {
6439 assert(p
!= compressed_blob_bls
.end());
6440 bufferlist
& compressed_bl
= *p
++;
6441 if (_verify_csum(o
, &bptr
->get_blob(), 0, compressed_bl
,
6442 b2r_it
->second
.front().logical_offset
) < 0) {
6446 r
= _decompress(compressed_bl
, &raw_bl
);
6450 bptr
->shared_blob
->bc
.did_read(bptr
->shared_blob
->get_cache(), 0,
6453 for (auto& i
: b2r_it
->second
) {
6454 ready_regions
[i
.logical_offset
].substr_of(
6455 raw_bl
, i
.blob_xoffset
, i
.length
);
6458 for (auto& reg
: b2r_it
->second
) {
6459 if (_verify_csum(o
, &bptr
->get_blob(), reg
.r_off
, reg
.bl
,
6460 reg
.logical_offset
) < 0) {
6464 bptr
->shared_blob
->bc
.did_read(bptr
->shared_blob
->get_cache(),
6468 // prune and keep result
6469 ready_regions
[reg
.logical_offset
].substr_of(
6470 reg
.bl
, reg
.front
, reg
.length
);
6476 // generate a resulting buffer
6477 auto pr
= ready_regions
.begin();
6478 auto pr_end
= ready_regions
.end();
6480 while (pos
< length
) {
6481 if (pr
!= pr_end
&& pr
->first
== pos
+ offset
) {
6482 dout(30) << __func__
<< " assemble 0x" << std::hex
<< pos
6483 << ": data from 0x" << pr
->first
<< "~" << pr
->second
.length()
6484 << std::dec
<< dendl
;
6485 pos
+= pr
->second
.length();
6486 bl
.claim_append(pr
->second
);
6489 uint64_t l
= length
- pos
;
6491 assert(pr
->first
> pos
+ offset
);
6492 l
= pr
->first
- (pos
+ offset
);
6494 dout(30) << __func__
<< " assemble 0x" << std::hex
<< pos
6495 << ": zeros for 0x" << (pos
+ offset
) << "~" << l
6496 << std::dec
<< dendl
;
6501 assert(bl
.length() == length
);
6502 assert(pos
== length
);
6503 assert(pr
== pr_end
);
6508 int BlueStore::_verify_csum(OnodeRef
& o
,
6509 const bluestore_blob_t
* blob
, uint64_t blob_xoffset
,
6510 const bufferlist
& bl
,
6511 uint64_t logical_offset
) const
6515 utime_t start
= ceph_clock_now();
6516 int r
= blob
->verify_csum(blob_xoffset
, bl
, &bad
, &bad_csum
);
6522 blob
->get_csum_chunk_size(),
6523 [&](uint64_t offset
, uint64_t length
) {
6524 pex
.emplace_back(bluestore_pextent_t(offset
, length
));
6527 derr
<< __func__
<< " bad "
6528 << Checksummer::get_csum_type_string(blob
->csum_type
)
6529 << "/0x" << std::hex
<< blob
->get_csum_chunk_size()
6530 << " checksum at blob offset 0x" << bad
6531 << ", got 0x" << bad_csum
<< ", expected 0x"
6532 << blob
->get_csum_item(bad
/ blob
->get_csum_chunk_size()) << std::dec
6533 << ", device location " << pex
6534 << ", logical extent 0x" << std::hex
6535 << (logical_offset
+ bad
- blob_xoffset
) << "~"
6536 << blob
->get_csum_chunk_size() << std::dec
6537 << ", object " << o
->oid
6540 derr
<< __func__
<< " failed with exit code: " << cpp_strerror(r
) << dendl
;
6543 logger
->tinc(l_bluestore_csum_lat
, ceph_clock_now() - start
);
6547 int BlueStore::_decompress(bufferlist
& source
, bufferlist
* result
)
6550 utime_t start
= ceph_clock_now();
6551 bufferlist::iterator i
= source
.begin();
6552 bluestore_compression_header_t chdr
;
6554 int alg
= int(chdr
.type
);
6555 CompressorRef cp
= compressor
;
6556 if (!cp
|| (int)cp
->get_type() != alg
) {
6557 cp
= Compressor::create(cct
, alg
);
6561 // if compressor isn't available - error, because cannot return
6562 // decompressed data?
6563 derr
<< __func__
<< " can't load decompressor " << alg
<< dendl
;
6566 r
= cp
->decompress(i
, chdr
.length
, *result
);
6568 derr
<< __func__
<< " decompression failed with exit code " << r
<< dendl
;
6572 logger
->tinc(l_bluestore_decompress_lat
, ceph_clock_now() - start
);
6576 // this stores fiemap into interval_set, other variations
6577 // use it internally
6578 int BlueStore::_fiemap(
6579 CollectionHandle
&c_
,
6580 const ghobject_t
& oid
,
6583 interval_set
<uint64_t>& destset
)
6585 Collection
*c
= static_cast<Collection
*>(c_
.get());
6589 RWLock::RLocker
l(c
->lock
);
6591 OnodeRef o
= c
->get_onode(oid
, false);
6592 if (!o
|| !o
->exists
) {
6597 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
6598 << " size 0x" << o
->onode
.size
<< std::dec
<< dendl
;
6600 boost::intrusive::set
<Extent
>::iterator ep
, eend
;
6601 if (offset
>= o
->onode
.size
)
6604 if (offset
+ length
> o
->onode
.size
) {
6605 length
= o
->onode
.size
- offset
;
6608 o
->extent_map
.fault_range(db
, offset
, length
);
6609 eend
= o
->extent_map
.extent_map
.end();
6610 ep
= o
->extent_map
.seek_lextent(offset
);
6611 while (length
> 0) {
6612 dout(20) << __func__
<< " offset " << offset
<< dendl
;
6613 if (ep
!= eend
&& ep
->logical_offset
+ ep
->length
<= offset
) {
6618 uint64_t x_len
= length
;
6619 if (ep
!= eend
&& ep
->logical_offset
<= offset
) {
6620 uint64_t x_off
= offset
- ep
->logical_offset
;
6621 x_len
= MIN(x_len
, ep
->length
- x_off
);
6622 dout(30) << __func__
<< " lextent 0x" << std::hex
<< offset
<< "~"
6623 << x_len
<< std::dec
<< " blob " << ep
->blob
<< dendl
;
6624 destset
.insert(offset
, x_len
);
6627 if (x_off
+ x_len
== ep
->length
)
6632 ep
->logical_offset
> offset
&&
6633 ep
->logical_offset
- offset
< x_len
) {
6634 x_len
= ep
->logical_offset
- offset
;
6642 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
6643 << " size = 0x(" << destset
<< ")" << std::dec
<< dendl
;
6647 int BlueStore::fiemap(
6649 const ghobject_t
& oid
,
6654 CollectionHandle c
= _get_collection(cid
);
6657 return fiemap(c
, oid
, offset
, len
, bl
);
6660 int BlueStore::fiemap(
6661 CollectionHandle
&c_
,
6662 const ghobject_t
& oid
,
6667 interval_set
<uint64_t> m
;
6668 int r
= _fiemap(c_
, oid
, offset
, length
, m
);
6675 int BlueStore::fiemap(
6677 const ghobject_t
& oid
,
6680 map
<uint64_t, uint64_t>& destmap
)
6682 CollectionHandle c
= _get_collection(cid
);
6685 return fiemap(c
, oid
, offset
, len
, destmap
);
6688 int BlueStore::fiemap(
6689 CollectionHandle
&c_
,
6690 const ghobject_t
& oid
,
6693 map
<uint64_t, uint64_t>& destmap
)
6695 interval_set
<uint64_t> m
;
6696 int r
= _fiemap(c_
, oid
, offset
, length
, m
);
6698 m
.move_into(destmap
);
6703 int BlueStore::getattr(
6705 const ghobject_t
& oid
,
6709 CollectionHandle c
= _get_collection(cid
);
6712 return getattr(c
, oid
, name
, value
);
6715 int BlueStore::getattr(
6716 CollectionHandle
&c_
,
6717 const ghobject_t
& oid
,
6721 Collection
*c
= static_cast<Collection
*>(c_
.get());
6722 dout(15) << __func__
<< " " << c
->cid
<< " " << oid
<< " " << name
<< dendl
;
6728 RWLock::RLocker
l(c
->lock
);
6729 mempool::bluestore_cache_other::string
k(name
);
6731 OnodeRef o
= c
->get_onode(oid
, false);
6732 if (!o
|| !o
->exists
) {
6737 if (!o
->onode
.attrs
.count(k
)) {
6741 value
= o
->onode
.attrs
[k
];
6745 if (r
== 0 && _debug_mdata_eio(oid
)) {
6747 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
6749 dout(10) << __func__
<< " " << c
->cid
<< " " << oid
<< " " << name
6750 << " = " << r
<< dendl
;
6755 int BlueStore::getattrs(
6757 const ghobject_t
& oid
,
6758 map
<string
,bufferptr
>& aset
)
6760 CollectionHandle c
= _get_collection(cid
);
6763 return getattrs(c
, oid
, aset
);
6766 int BlueStore::getattrs(
6767 CollectionHandle
&c_
,
6768 const ghobject_t
& oid
,
6769 map
<string
,bufferptr
>& aset
)
6771 Collection
*c
= static_cast<Collection
*>(c_
.get());
6772 dout(15) << __func__
<< " " << c
->cid
<< " " << oid
<< dendl
;
6778 RWLock::RLocker
l(c
->lock
);
6780 OnodeRef o
= c
->get_onode(oid
, false);
6781 if (!o
|| !o
->exists
) {
6785 for (auto& i
: o
->onode
.attrs
) {
6786 aset
.emplace(i
.first
.c_str(), i
.second
);
6792 if (r
== 0 && _debug_mdata_eio(oid
)) {
6794 derr
<< __func__
<< " " << c
->cid
<< " " << oid
<< " INJECT EIO" << dendl
;
6796 dout(10) << __func__
<< " " << c
->cid
<< " " << oid
6797 << " = " << r
<< dendl
;
6801 int BlueStore::list_collections(vector
<coll_t
>& ls
)
6803 RWLock::RLocker
l(coll_lock
);
6804 for (ceph::unordered_map
<coll_t
, CollectionRef
>::iterator p
= coll_map
.begin();
6805 p
!= coll_map
.end();
6807 ls
.push_back(p
->first
);
6811 bool BlueStore::collection_exists(const coll_t
& c
)
6813 RWLock::RLocker
l(coll_lock
);
6814 return coll_map
.count(c
);
6817 int BlueStore::collection_empty(const coll_t
& cid
, bool *empty
)
6819 dout(15) << __func__
<< " " << cid
<< dendl
;
6820 vector
<ghobject_t
> ls
;
6822 int r
= collection_list(cid
, ghobject_t(), ghobject_t::get_max(), 1,
6825 derr
<< __func__
<< " collection_list returned: " << cpp_strerror(r
)
6829 *empty
= ls
.empty();
6830 dout(10) << __func__
<< " " << cid
<< " = " << (int)(*empty
) << dendl
;
6834 int BlueStore::collection_bits(const coll_t
& cid
)
6836 dout(15) << __func__
<< " " << cid
<< dendl
;
6837 CollectionRef c
= _get_collection(cid
);
6840 RWLock::RLocker
l(c
->lock
);
6841 dout(10) << __func__
<< " " << cid
<< " = " << c
->cnode
.bits
<< dendl
;
6842 return c
->cnode
.bits
;
6845 int BlueStore::collection_list(
6846 const coll_t
& cid
, const ghobject_t
& start
, const ghobject_t
& end
, int max
,
6847 vector
<ghobject_t
> *ls
, ghobject_t
*pnext
)
6849 CollectionHandle c
= _get_collection(cid
);
6852 return collection_list(c
, start
, end
, max
, ls
, pnext
);
6855 int BlueStore::collection_list(
6856 CollectionHandle
&c_
, const ghobject_t
& start
, const ghobject_t
& end
, int max
,
6857 vector
<ghobject_t
> *ls
, ghobject_t
*pnext
)
6859 Collection
*c
= static_cast<Collection
*>(c_
.get());
6860 dout(15) << __func__
<< " " << c
->cid
6861 << " start " << start
<< " end " << end
<< " max " << max
<< dendl
;
6864 RWLock::RLocker
l(c
->lock
);
6865 r
= _collection_list(c
, start
, end
, max
, ls
, pnext
);
6868 dout(10) << __func__
<< " " << c
->cid
6869 << " start " << start
<< " end " << end
<< " max " << max
6870 << " = " << r
<< ", ls.size() = " << ls
->size()
6871 << ", next = " << (pnext
? *pnext
: ghobject_t()) << dendl
;
6875 int BlueStore::_collection_list(
6876 Collection
*c
, const ghobject_t
& start
, const ghobject_t
& end
, int max
,
6877 vector
<ghobject_t
> *ls
, ghobject_t
*pnext
)
6884 ghobject_t static_next
;
6885 KeyValueDB::Iterator it
;
6886 string temp_start_key
, temp_end_key
;
6887 string start_key
, end_key
;
6888 bool set_next
= false;
6893 pnext
= &static_next
;
6895 if (start
== ghobject_t::get_max() ||
6896 start
.hobj
.is_max()) {
6899 get_coll_key_range(c
->cid
, c
->cnode
.bits
, &temp_start_key
, &temp_end_key
,
6900 &start_key
, &end_key
);
6901 dout(20) << __func__
6902 << " range " << pretty_binary_string(temp_start_key
)
6903 << " to " << pretty_binary_string(temp_end_key
)
6904 << " and " << pretty_binary_string(start_key
)
6905 << " to " << pretty_binary_string(end_key
)
6906 << " start " << start
<< dendl
;
6907 it
= db
->get_iterator(PREFIX_OBJ
);
6908 if (start
== ghobject_t() ||
6909 start
.hobj
== hobject_t() ||
6910 start
== c
->cid
.get_min_hobj()) {
6911 it
->upper_bound(temp_start_key
);
6915 get_object_key(cct
, start
, &k
);
6916 if (start
.hobj
.is_temp()) {
6918 assert(k
>= temp_start_key
&& k
< temp_end_key
);
6921 assert(k
>= start_key
&& k
< end_key
);
6923 dout(20) << " start from " << pretty_binary_string(k
)
6924 << " temp=" << (int)temp
<< dendl
;
6927 if (end
.hobj
.is_max()) {
6928 pend
= temp
? temp_end_key
: end_key
;
6930 get_object_key(cct
, end
, &end_key
);
6931 if (end
.hobj
.is_temp()) {
6937 pend
= temp
? temp_end_key
: end_key
;
6940 dout(20) << __func__
<< " pend " << pretty_binary_string(pend
) << dendl
;
6942 if (!it
->valid() || it
->key() >= pend
) {
6944 dout(20) << __func__
<< " iterator not valid (end of db?)" << dendl
;
6946 dout(20) << __func__
<< " key " << pretty_binary_string(it
->key())
6947 << " >= " << end
<< dendl
;
6949 if (end
.hobj
.is_temp()) {
6952 dout(30) << __func__
<< " switch to non-temp namespace" << dendl
;
6954 it
->upper_bound(start_key
);
6956 dout(30) << __func__
<< " pend " << pretty_binary_string(pend
) << dendl
;
6961 dout(30) << __func__
<< " key " << pretty_binary_string(it
->key()) << dendl
;
6962 if (is_extent_shard_key(it
->key())) {
6967 int r
= get_key_object(it
->key(), &oid
);
6969 dout(20) << __func__
<< " oid " << oid
<< " end " << end
<< dendl
;
6970 if (ls
->size() >= (unsigned)max
) {
6971 dout(20) << __func__
<< " reached max " << max
<< dendl
;
6981 *pnext
= ghobject_t::get_max();
6987 int BlueStore::omap_get(
6988 const coll_t
& cid
, ///< [in] Collection containing oid
6989 const ghobject_t
&oid
, ///< [in] Object containing omap
6990 bufferlist
*header
, ///< [out] omap header
6991 map
<string
, bufferlist
> *out
/// < [out] Key to value map
6994 CollectionHandle c
= _get_collection(cid
);
6997 return omap_get(c
, oid
, header
, out
);
7000 int BlueStore::omap_get(
7001 CollectionHandle
&c_
, ///< [in] Collection containing oid
7002 const ghobject_t
&oid
, ///< [in] Object containing omap
7003 bufferlist
*header
, ///< [out] omap header
7004 map
<string
, bufferlist
> *out
/// < [out] Key to value map
7007 Collection
*c
= static_cast<Collection
*>(c_
.get());
7008 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
7011 RWLock::RLocker
l(c
->lock
);
7013 OnodeRef o
= c
->get_onode(oid
, false);
7014 if (!o
|| !o
->exists
) {
7018 if (!o
->onode
.has_omap())
7022 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_OMAP
);
7024 get_omap_header(o
->onode
.nid
, &head
);
7025 get_omap_tail(o
->onode
.nid
, &tail
);
7026 it
->lower_bound(head
);
7027 while (it
->valid()) {
7028 if (it
->key() == head
) {
7029 dout(30) << __func__
<< " got header" << dendl
;
7030 *header
= it
->value();
7031 } else if (it
->key() >= tail
) {
7032 dout(30) << __func__
<< " reached tail" << dendl
;
7036 decode_omap_key(it
->key(), &user_key
);
7037 dout(30) << __func__
<< " got " << pretty_binary_string(it
->key())
7038 << " -> " << user_key
<< dendl
;
7039 (*out
)[user_key
] = it
->value();
7045 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
7050 int BlueStore::omap_get_header(
7051 const coll_t
& cid
, ///< [in] Collection containing oid
7052 const ghobject_t
&oid
, ///< [in] Object containing omap
7053 bufferlist
*header
, ///< [out] omap header
7054 bool allow_eio
///< [in] don't assert on eio
7057 CollectionHandle c
= _get_collection(cid
);
7060 return omap_get_header(c
, oid
, header
, allow_eio
);
7063 int BlueStore::omap_get_header(
7064 CollectionHandle
&c_
, ///< [in] Collection containing oid
7065 const ghobject_t
&oid
, ///< [in] Object containing omap
7066 bufferlist
*header
, ///< [out] omap header
7067 bool allow_eio
///< [in] don't assert on eio
7070 Collection
*c
= static_cast<Collection
*>(c_
.get());
7071 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
7074 RWLock::RLocker
l(c
->lock
);
7076 OnodeRef o
= c
->get_onode(oid
, false);
7077 if (!o
|| !o
->exists
) {
7081 if (!o
->onode
.has_omap())
7086 get_omap_header(o
->onode
.nid
, &head
);
7087 if (db
->get(PREFIX_OMAP
, head
, header
) >= 0) {
7088 dout(30) << __func__
<< " got header" << dendl
;
7090 dout(30) << __func__
<< " no header" << dendl
;
7094 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
7099 int BlueStore::omap_get_keys(
7100 const coll_t
& cid
, ///< [in] Collection containing oid
7101 const ghobject_t
&oid
, ///< [in] Object containing omap
7102 set
<string
> *keys
///< [out] Keys defined on oid
7105 CollectionHandle c
= _get_collection(cid
);
7108 return omap_get_keys(c
, oid
, keys
);
7111 int BlueStore::omap_get_keys(
7112 CollectionHandle
&c_
, ///< [in] Collection containing oid
7113 const ghobject_t
&oid
, ///< [in] Object containing omap
7114 set
<string
> *keys
///< [out] Keys defined on oid
7117 Collection
*c
= static_cast<Collection
*>(c_
.get());
7118 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
7121 RWLock::RLocker
l(c
->lock
);
7123 OnodeRef o
= c
->get_onode(oid
, false);
7124 if (!o
|| !o
->exists
) {
7128 if (!o
->onode
.has_omap())
7132 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_OMAP
);
7134 get_omap_key(o
->onode
.nid
, string(), &head
);
7135 get_omap_tail(o
->onode
.nid
, &tail
);
7136 it
->lower_bound(head
);
7137 while (it
->valid()) {
7138 if (it
->key() >= tail
) {
7139 dout(30) << __func__
<< " reached tail" << dendl
;
7143 decode_omap_key(it
->key(), &user_key
);
7144 dout(30) << __func__
<< " got " << pretty_binary_string(it
->key())
7145 << " -> " << user_key
<< dendl
;
7146 keys
->insert(user_key
);
7151 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
7156 int BlueStore::omap_get_values(
7157 const coll_t
& cid
, ///< [in] Collection containing oid
7158 const ghobject_t
&oid
, ///< [in] Object containing omap
7159 const set
<string
> &keys
, ///< [in] Keys to get
7160 map
<string
, bufferlist
> *out
///< [out] Returned keys and values
7163 CollectionHandle c
= _get_collection(cid
);
7166 return omap_get_values(c
, oid
, keys
, out
);
7169 int BlueStore::omap_get_values(
7170 CollectionHandle
&c_
, ///< [in] Collection containing oid
7171 const ghobject_t
&oid
, ///< [in] Object containing omap
7172 const set
<string
> &keys
, ///< [in] Keys to get
7173 map
<string
, bufferlist
> *out
///< [out] Returned keys and values
7176 Collection
*c
= static_cast<Collection
*>(c_
.get());
7177 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
7180 RWLock::RLocker
l(c
->lock
);
7183 OnodeRef o
= c
->get_onode(oid
, false);
7184 if (!o
|| !o
->exists
) {
7188 if (!o
->onode
.has_omap())
7191 _key_encode_u64(o
->onode
.nid
, &final_key
);
7192 final_key
.push_back('.');
7193 for (set
<string
>::const_iterator p
= keys
.begin(); p
!= keys
.end(); ++p
) {
7194 final_key
.resize(9); // keep prefix
7197 if (db
->get(PREFIX_OMAP
, final_key
, &val
) >= 0) {
7198 dout(30) << __func__
<< " got " << pretty_binary_string(final_key
)
7199 << " -> " << *p
<< dendl
;
7200 out
->insert(make_pair(*p
, val
));
7204 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
7209 int BlueStore::omap_check_keys(
7210 const coll_t
& cid
, ///< [in] Collection containing oid
7211 const ghobject_t
&oid
, ///< [in] Object containing omap
7212 const set
<string
> &keys
, ///< [in] Keys to check
7213 set
<string
> *out
///< [out] Subset of keys defined on oid
7216 CollectionHandle c
= _get_collection(cid
);
7219 return omap_check_keys(c
, oid
, keys
, out
);
7222 int BlueStore::omap_check_keys(
7223 CollectionHandle
&c_
, ///< [in] Collection containing oid
7224 const ghobject_t
&oid
, ///< [in] Object containing omap
7225 const set
<string
> &keys
, ///< [in] Keys to check
7226 set
<string
> *out
///< [out] Subset of keys defined on oid
7229 Collection
*c
= static_cast<Collection
*>(c_
.get());
7230 dout(15) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< dendl
;
7233 RWLock::RLocker
l(c
->lock
);
7236 OnodeRef o
= c
->get_onode(oid
, false);
7237 if (!o
|| !o
->exists
) {
7241 if (!o
->onode
.has_omap())
7244 _key_encode_u64(o
->onode
.nid
, &final_key
);
7245 final_key
.push_back('.');
7246 for (set
<string
>::const_iterator p
= keys
.begin(); p
!= keys
.end(); ++p
) {
7247 final_key
.resize(9); // keep prefix
7250 if (db
->get(PREFIX_OMAP
, final_key
, &val
) >= 0) {
7251 dout(30) << __func__
<< " have " << pretty_binary_string(final_key
)
7252 << " -> " << *p
<< dendl
;
7255 dout(30) << __func__
<< " miss " << pretty_binary_string(final_key
)
7256 << " -> " << *p
<< dendl
;
7260 dout(10) << __func__
<< " " << c
->get_cid() << " oid " << oid
<< " = " << r
7265 ObjectMap::ObjectMapIterator
BlueStore::get_omap_iterator(
7266 const coll_t
& cid
, ///< [in] collection
7267 const ghobject_t
&oid
///< [in] object
7270 CollectionHandle c
= _get_collection(cid
);
7272 dout(10) << __func__
<< " " << cid
<< "doesn't exist" <<dendl
;
7273 return ObjectMap::ObjectMapIterator();
7275 return get_omap_iterator(c
, oid
);
7278 ObjectMap::ObjectMapIterator
BlueStore::get_omap_iterator(
7279 CollectionHandle
&c_
, ///< [in] collection
7280 const ghobject_t
&oid
///< [in] object
7283 Collection
*c
= static_cast<Collection
*>(c_
.get());
7284 dout(10) << __func__
<< " " << c
->get_cid() << " " << oid
<< dendl
;
7286 return ObjectMap::ObjectMapIterator();
7288 RWLock::RLocker
l(c
->lock
);
7289 OnodeRef o
= c
->get_onode(oid
, false);
7290 if (!o
|| !o
->exists
) {
7291 dout(10) << __func__
<< " " << oid
<< "doesn't exist" <<dendl
;
7292 return ObjectMap::ObjectMapIterator();
7295 dout(10) << __func__
<< " has_omap = " << (int)o
->onode
.has_omap() <<dendl
;
7296 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_OMAP
);
7297 return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c
, o
, it
));
7300 // -----------------
7303 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction
& t
)
7305 dout(10) << __func__
<< " ondisk_format " << ondisk_format
7306 << " min_compat_ondisk_format " << min_compat_ondisk_format
7308 assert(ondisk_format
== latest_ondisk_format
);
7311 ::encode(ondisk_format
, bl
);
7312 t
->set(PREFIX_SUPER
, "ondisk_format", bl
);
7316 ::encode(min_compat_ondisk_format
, bl
);
7317 t
->set(PREFIX_SUPER
, "min_compat_ondisk_format", bl
);
7321 int BlueStore::_open_super_meta()
7327 db
->get(PREFIX_SUPER
, "nid_max", &bl
);
7328 bufferlist::iterator p
= bl
.begin();
7333 } catch (buffer::error
& e
) {
7334 derr
<< __func__
<< " unable to read nid_max" << dendl
;
7337 dout(10) << __func__
<< " old nid_max " << nid_max
<< dendl
;
7338 nid_last
= nid_max
.load();
7345 db
->get(PREFIX_SUPER
, "blobid_max", &bl
);
7346 bufferlist::iterator p
= bl
.begin();
7351 } catch (buffer::error
& e
) {
7352 derr
<< __func__
<< " unable to read blobid_max" << dendl
;
7355 dout(10) << __func__
<< " old blobid_max " << blobid_max
<< dendl
;
7356 blobid_last
= blobid_max
.load();
7362 db
->get(PREFIX_SUPER
, "freelist_type", &bl
);
7364 freelist_type
= std::string(bl
.c_str(), bl
.length());
7365 dout(10) << __func__
<< " freelist_type " << freelist_type
<< dendl
;
7367 assert("Not Support extent freelist manager" == 0);
7372 if (cct
->_conf
->bluestore_bluefs
) {
7373 bluefs_extents
.clear();
7375 db
->get(PREFIX_SUPER
, "bluefs_extents", &bl
);
7376 bufferlist::iterator p
= bl
.begin();
7378 ::decode(bluefs_extents
, p
);
7380 catch (buffer::error
& e
) {
7381 derr
<< __func__
<< " unable to read bluefs_extents" << dendl
;
7384 dout(10) << __func__
<< " bluefs_extents 0x" << std::hex
<< bluefs_extents
7385 << std::dec
<< dendl
;
7389 int32_t compat_ondisk_format
= 0;
7392 int r
= db
->get(PREFIX_SUPER
, "ondisk_format", &bl
);
7394 // base case: kraken bluestore is v1 and readable by v1
7395 dout(20) << __func__
<< " missing ondisk_format; assuming kraken"
7398 compat_ondisk_format
= 1;
7400 auto p
= bl
.begin();
7402 ::decode(ondisk_format
, p
);
7403 } catch (buffer::error
& e
) {
7404 derr
<< __func__
<< " unable to read ondisk_format" << dendl
;
7409 r
= db
->get(PREFIX_SUPER
, "min_compat_ondisk_format", &bl
);
7411 auto p
= bl
.begin();
7413 ::decode(compat_ondisk_format
, p
);
7414 } catch (buffer::error
& e
) {
7415 derr
<< __func__
<< " unable to read compat_ondisk_format" << dendl
;
7420 dout(10) << __func__
<< " ondisk_format " << ondisk_format
7421 << " compat_ondisk_format " << compat_ondisk_format
7425 if (latest_ondisk_format
< compat_ondisk_format
) {
7426 derr
<< __func__
<< " compat_ondisk_format is "
7427 << compat_ondisk_format
<< " but we only understand version "
7428 << latest_ondisk_format
<< dendl
;
7431 if (ondisk_format
< latest_ondisk_format
) {
7432 int r
= _upgrade_super();
7440 db
->get(PREFIX_SUPER
, "min_alloc_size", &bl
);
7441 auto p
= bl
.begin();
7445 min_alloc_size
= val
;
7446 } catch (buffer::error
& e
) {
7447 derr
<< __func__
<< " unable to read min_alloc_size" << dendl
;
7450 dout(10) << __func__
<< " min_alloc_size 0x" << std::hex
<< min_alloc_size
7451 << std::dec
<< dendl
;
7455 _set_throttle_params();
7464 int BlueStore::_upgrade_super()
7466 dout(1) << __func__
<< " from " << ondisk_format
<< ", latest "
7467 << latest_ondisk_format
<< dendl
;
7468 assert(ondisk_format
> 0);
7469 assert(ondisk_format
< latest_ondisk_format
);
7471 if (ondisk_format
== 1) {
7473 // - super: added ondisk_format
7474 // - super: added min_readable_ondisk_format
7475 // - super: added min_compat_ondisk_format
7476 // - super: added min_alloc_size
7477 // - super: removed min_min_alloc_size
7478 KeyValueDB::Transaction t
= db
->get_transaction();
7481 db
->get(PREFIX_SUPER
, "min_min_alloc_size", &bl
);
7482 auto p
= bl
.begin();
7486 min_alloc_size
= val
;
7487 } catch (buffer::error
& e
) {
7488 derr
<< __func__
<< " failed to read min_min_alloc_size" << dendl
;
7491 t
->set(PREFIX_SUPER
, "min_alloc_size", bl
);
7492 t
->rmkey(PREFIX_SUPER
, "min_min_alloc_size");
7495 _prepare_ondisk_format_super(t
);
7496 int r
= db
->submit_transaction_sync(t
);
7501 dout(1) << __func__
<< " done" << dendl
;
7505 void BlueStore::_assign_nid(TransContext
*txc
, OnodeRef o
)
7509 uint64_t nid
= ++nid_last
;
7510 dout(20) << __func__
<< " " << nid
<< dendl
;
7512 txc
->last_nid
= nid
;
7515 uint64_t BlueStore::_assign_blobid(TransContext
*txc
)
7517 uint64_t bid
= ++blobid_last
;
7518 dout(20) << __func__
<< " " << bid
<< dendl
;
7519 txc
->last_blobid
= bid
;
7523 void BlueStore::get_db_statistics(Formatter
*f
)
7525 db
->get_statistics(f
);
7528 BlueStore::TransContext
*BlueStore::_txc_create(OpSequencer
*osr
)
7530 TransContext
*txc
= new TransContext(cct
, osr
);
7531 txc
->t
= db
->get_transaction();
7532 osr
->queue_new(txc
);
7533 dout(20) << __func__
<< " osr " << osr
<< " = " << txc
7534 << " seq " << txc
->seq
<< dendl
;
7538 void BlueStore::_txc_calc_cost(TransContext
*txc
)
7540 // this is about the simplest model for transaction cost you can
7541 // imagine. there is some fixed overhead cost by saying there is a
7542 // minimum of one "io". and then we have some cost per "io" that is
7543 // a configurable (with different hdd and ssd defaults), and add
7544 // that to the bytes value.
7545 int ios
= 1; // one "io" for the kv commit
7546 for (auto& p
: txc
->ioc
.pending_aios
) {
7547 ios
+= p
.iov
.size();
7549 auto cost
= throttle_cost_per_io
.load();
7550 txc
->cost
= ios
* cost
+ txc
->bytes
;
7551 dout(10) << __func__
<< " " << txc
<< " cost " << txc
->cost
<< " ("
7552 << ios
<< " ios * " << cost
<< " + " << txc
->bytes
7553 << " bytes)" << dendl
;
7556 void BlueStore::_txc_update_store_statfs(TransContext
*txc
)
7558 if (txc
->statfs_delta
.is_empty())
7561 logger
->inc(l_bluestore_allocated
, txc
->statfs_delta
.allocated());
7562 logger
->inc(l_bluestore_stored
, txc
->statfs_delta
.stored());
7563 logger
->inc(l_bluestore_compressed
, txc
->statfs_delta
.compressed());
7564 logger
->inc(l_bluestore_compressed_allocated
, txc
->statfs_delta
.compressed_allocated());
7565 logger
->inc(l_bluestore_compressed_original
, txc
->statfs_delta
.compressed_original());
7568 std::lock_guard
<std::mutex
> l(vstatfs_lock
);
7569 vstatfs
+= txc
->statfs_delta
;
7573 txc
->statfs_delta
.encode(bl
);
7575 txc
->t
->merge(PREFIX_STAT
, "bluestore_statfs", bl
);
7576 txc
->statfs_delta
.reset();
7579 void BlueStore::_txc_state_proc(TransContext
*txc
)
7582 dout(10) << __func__
<< " txc " << txc
7583 << " " << txc
->get_state_name() << dendl
;
7584 switch (txc
->state
) {
7585 case TransContext::STATE_PREPARE
:
7586 txc
->log_state_latency(logger
, l_bluestore_state_prepare_lat
);
7587 if (txc
->ioc
.has_pending_aios()) {
7588 txc
->state
= TransContext::STATE_AIO_WAIT
;
7589 txc
->had_ios
= true;
7590 _txc_aio_submit(txc
);
7595 case TransContext::STATE_AIO_WAIT
:
7596 txc
->log_state_latency(logger
, l_bluestore_state_aio_wait_lat
);
7597 _txc_finish_io(txc
); // may trigger blocked txc's too
7600 case TransContext::STATE_IO_DONE
:
7601 //assert(txc->osr->qlock.is_locked()); // see _txc_finish_io
7603 ++txc
->osr
->txc_with_unstable_io
;
7605 txc
->log_state_latency(logger
, l_bluestore_state_io_done_lat
);
7606 txc
->state
= TransContext::STATE_KV_QUEUED
;
7607 if (cct
->_conf
->bluestore_sync_submit_transaction
) {
7608 if (txc
->last_nid
>= nid_max
||
7609 txc
->last_blobid
>= blobid_max
) {
7610 dout(20) << __func__
7611 << " last_{nid,blobid} exceeds max, submit via kv thread"
7613 } else if (txc
->osr
->kv_committing_serially
) {
7614 dout(20) << __func__
<< " prior txc submitted via kv thread, us too"
7616 // note: this is starvation-prone. once we have a txc in a busy
7617 // sequencer that is committing serially it is possible to keep
7618 // submitting new transactions fast enough that we get stuck doing
7619 // so. the alternative is to block here... fixme?
7620 } else if (txc
->osr
->txc_with_unstable_io
) {
7621 dout(20) << __func__
<< " prior txc(s) with unstable ios "
7622 << txc
->osr
->txc_with_unstable_io
.load() << dendl
;
7623 } else if (cct
->_conf
->bluestore_debug_randomize_serial_transaction
&&
7624 rand() % cct
->_conf
->bluestore_debug_randomize_serial_transaction
7626 dout(20) << __func__
<< " DEBUG randomly forcing submit via kv thread"
7629 txc
->state
= TransContext::STATE_KV_SUBMITTED
;
7630 int r
= cct
->_conf
->bluestore_debug_omit_kv_commit
? 0 : db
->submit_transaction(txc
->t
);
7632 _txc_applied_kv(txc
);
7636 std::lock_guard
<std::mutex
> l(kv_lock
);
7637 kv_queue
.push_back(txc
);
7638 kv_cond
.notify_one();
7639 if (txc
->state
!= TransContext::STATE_KV_SUBMITTED
) {
7640 kv_queue_unsubmitted
.push_back(txc
);
7641 ++txc
->osr
->kv_committing_serially
;
7645 kv_throttle_costs
+= txc
->cost
;
7648 case TransContext::STATE_KV_SUBMITTED
:
7649 txc
->log_state_latency(logger
, l_bluestore_state_kv_committing_lat
);
7650 txc
->state
= TransContext::STATE_KV_DONE
;
7651 _txc_committed_kv(txc
);
7654 case TransContext::STATE_KV_DONE
:
7655 txc
->log_state_latency(logger
, l_bluestore_state_kv_done_lat
);
7656 if (txc
->deferred_txn
) {
7657 txc
->state
= TransContext::STATE_DEFERRED_QUEUED
;
7658 _deferred_queue(txc
);
7661 txc
->state
= TransContext::STATE_FINISHING
;
7664 case TransContext::STATE_DEFERRED_CLEANUP
:
7665 txc
->log_state_latency(logger
, l_bluestore_state_deferred_cleanup_lat
);
7666 txc
->state
= TransContext::STATE_FINISHING
;
7669 case TransContext::STATE_FINISHING
:
7670 txc
->log_state_latency(logger
, l_bluestore_state_finishing_lat
);
7675 derr
<< __func__
<< " unexpected txc " << txc
7676 << " state " << txc
->get_state_name() << dendl
;
7677 assert(0 == "unexpected txc state");
7683 void BlueStore::_txc_finish_io(TransContext
*txc
)
7685 dout(20) << __func__
<< " " << txc
<< dendl
;
7688 * we need to preserve the order of kv transactions,
7689 * even though aio will complete in any order.
7692 OpSequencer
*osr
= txc
->osr
.get();
7693 std::lock_guard
<std::mutex
> l(osr
->qlock
);
7694 txc
->state
= TransContext::STATE_IO_DONE
;
7696 // release aio contexts (including pinned buffers).
7697 txc
->ioc
.running_aios
.clear();
7699 OpSequencer::q_list_t::iterator p
= osr
->q
.iterator_to(*txc
);
7700 while (p
!= osr
->q
.begin()) {
7702 if (p
->state
< TransContext::STATE_IO_DONE
) {
7703 dout(20) << __func__
<< " " << txc
<< " blocked by " << &*p
<< " "
7704 << p
->get_state_name() << dendl
;
7707 if (p
->state
> TransContext::STATE_IO_DONE
) {
7713 _txc_state_proc(&*p
++);
7714 } while (p
!= osr
->q
.end() &&
7715 p
->state
== TransContext::STATE_IO_DONE
);
7717 if (osr
->kv_submitted_waiters
&&
7718 osr
->_is_all_kv_submitted()) {
7719 osr
->qcond
.notify_all();
7723 void BlueStore::_txc_write_nodes(TransContext
*txc
, KeyValueDB::Transaction t
)
7725 dout(20) << __func__
<< " txc " << txc
7726 << " onodes " << txc
->onodes
7727 << " shared_blobs " << txc
->shared_blobs
7731 for (auto o
: txc
->onodes
) {
7732 // finalize extent_map shards
7733 o
->extent_map
.update(t
, false);
7734 if (o
->extent_map
.needs_reshard()) {
7735 o
->extent_map
.reshard(db
, t
);
7736 o
->extent_map
.update(t
, true);
7737 if (o
->extent_map
.needs_reshard()) {
7738 dout(20) << __func__
<< " warning: still wants reshard, check options?"
7740 o
->extent_map
.clear_needs_reshard();
7742 logger
->inc(l_bluestore_onode_reshard
);
7747 denc(o
->onode
, bound
);
7748 o
->extent_map
.bound_encode_spanning_blobs(bound
);
7749 if (o
->onode
.extent_map_shards
.empty()) {
7750 denc(o
->extent_map
.inline_bl
, bound
);
7755 unsigned onode_part
, blob_part
, extent_part
;
7757 auto p
= bl
.get_contiguous_appender(bound
, true);
7759 onode_part
= p
.get_logical_offset();
7760 o
->extent_map
.encode_spanning_blobs(p
);
7761 blob_part
= p
.get_logical_offset() - onode_part
;
7762 if (o
->onode
.extent_map_shards
.empty()) {
7763 denc(o
->extent_map
.inline_bl
, p
);
7765 extent_part
= p
.get_logical_offset() - onode_part
- blob_part
;
7768 dout(20) << " onode " << o
->oid
<< " is " << bl
.length()
7769 << " (" << onode_part
<< " bytes onode + "
7770 << blob_part
<< " bytes spanning blobs + "
7771 << extent_part
<< " bytes inline extents)"
7773 t
->set(PREFIX_OBJ
, o
->key
.c_str(), o
->key
.size(), bl
);
7774 o
->flushing_count
++;
7777 // objects we modified but didn't affect the onode
7778 auto p
= txc
->modified_objects
.begin();
7779 while (p
!= txc
->modified_objects
.end()) {
7780 if (txc
->onodes
.count(*p
) == 0) {
7781 (*p
)->flushing_count
++;
7784 // remove dups with onodes list to avoid problems in _txc_finish
7785 p
= txc
->modified_objects
.erase(p
);
7789 // finalize shared_blobs
7790 for (auto sb
: txc
->shared_blobs
) {
7792 auto sbid
= sb
->get_sbid();
7793 get_shared_blob_key(sbid
, &key
);
7794 if (sb
->persistent
->empty()) {
7795 dout(20) << " shared_blob 0x" << std::hex
<< sbid
<< std::dec
7796 << " is empty" << dendl
;
7797 t
->rmkey(PREFIX_SHARED_BLOB
, key
);
7800 ::encode(*(sb
->persistent
), bl
);
7801 dout(20) << " shared_blob 0x" << std::hex
<< sbid
<< std::dec
7802 << " is " << bl
.length() << " " << *sb
<< dendl
;
7803 t
->set(PREFIX_SHARED_BLOB
, key
, bl
);
7808 void BlueStore::BSPerfTracker::update_from_perfcounters(
7809 PerfCounters
&logger
)
7811 os_commit_latency
.consume_next(
7813 l_bluestore_commit_lat
));
7814 os_apply_latency
.consume_next(
7816 l_bluestore_commit_lat
));
7819 void BlueStore::_txc_finalize_kv(TransContext
*txc
, KeyValueDB::Transaction t
)
7821 dout(20) << __func__
<< " txc " << txc
<< std::hex
7822 << " allocated 0x" << txc
->allocated
7823 << " released 0x" << txc
->released
7824 << std::dec
<< dendl
;
7826 // We have to handle the case where we allocate *and* deallocate the
7827 // same region in this transaction. The freelist doesn't like that.
7828 // (Actually, the only thing that cares is the BitmapFreelistManager
7829 // debug check. But that's important.)
7830 interval_set
<uint64_t> tmp_allocated
, tmp_released
;
7831 interval_set
<uint64_t> *pallocated
= &txc
->allocated
;
7832 interval_set
<uint64_t> *preleased
= &txc
->released
;
7833 if (!txc
->allocated
.empty() && !txc
->released
.empty()) {
7834 interval_set
<uint64_t> overlap
;
7835 overlap
.intersection_of(txc
->allocated
, txc
->released
);
7836 if (!overlap
.empty()) {
7837 tmp_allocated
= txc
->allocated
;
7838 tmp_allocated
.subtract(overlap
);
7839 tmp_released
= txc
->released
;
7840 tmp_released
.subtract(overlap
);
7841 dout(20) << __func__
<< " overlap 0x" << std::hex
<< overlap
7842 << ", new allocated 0x" << tmp_allocated
7843 << " released 0x" << tmp_released
<< std::dec
7845 pallocated
= &tmp_allocated
;
7846 preleased
= &tmp_released
;
7850 // update freelist with non-overlap sets
7851 for (interval_set
<uint64_t>::iterator p
= pallocated
->begin();
7852 p
!= pallocated
->end();
7854 fm
->allocate(p
.get_start(), p
.get_len(), t
);
7856 for (interval_set
<uint64_t>::iterator p
= preleased
->begin();
7857 p
!= preleased
->end();
7859 dout(20) << __func__
<< " release 0x" << std::hex
<< p
.get_start()
7860 << "~" << p
.get_len() << std::dec
<< dendl
;
7861 fm
->release(p
.get_start(), p
.get_len(), t
);
7864 _txc_update_store_statfs(txc
);
7867 void BlueStore::_txc_applied_kv(TransContext
*txc
)
7869 for (auto ls
: { &txc
->onodes
, &txc
->modified_objects
}) {
7870 for (auto& o
: *ls
) {
7871 dout(20) << __func__
<< " onode " << o
<< " had " << o
->flushing_count
7873 if (--o
->flushing_count
== 0) {
7874 std::lock_guard
<std::mutex
> l(o
->flush_lock
);
7875 o
->flush_cond
.notify_all();
7881 void BlueStore::_txc_committed_kv(TransContext
*txc
)
7883 dout(20) << __func__
<< " txc " << txc
<< dendl
;
7885 // warning: we're calling onreadable_sync inside the sequencer lock
7886 if (txc
->onreadable_sync
) {
7887 txc
->onreadable_sync
->complete(0);
7888 txc
->onreadable_sync
= NULL
;
7890 unsigned n
= txc
->osr
->parent
->shard_hint
.hash_to_shard(m_finisher_num
);
7891 if (txc
->oncommit
) {
7892 logger
->tinc(l_bluestore_commit_lat
, ceph_clock_now() - txc
->start
);
7893 finishers
[n
]->queue(txc
->oncommit
);
7894 txc
->oncommit
= NULL
;
7896 if (txc
->onreadable
) {
7897 finishers
[n
]->queue(txc
->onreadable
);
7898 txc
->onreadable
= NULL
;
7901 if (!txc
->oncommits
.empty()) {
7902 finishers
[n
]->queue(txc
->oncommits
);
7906 void BlueStore::_txc_finish(TransContext
*txc
)
7908 dout(20) << __func__
<< " " << txc
<< " onodes " << txc
->onodes
<< dendl
;
7909 assert(txc
->state
== TransContext::STATE_FINISHING
);
7911 for (auto& sb
: txc
->shared_blobs_written
) {
7912 sb
->bc
.finish_write(sb
->get_cache(), txc
->seq
);
7914 txc
->shared_blobs_written
.clear();
7916 while (!txc
->removed_collections
.empty()) {
7917 _queue_reap_collection(txc
->removed_collections
.front());
7918 txc
->removed_collections
.pop_front();
7921 OpSequencerRef osr
= txc
->osr
;
7924 bool submit_deferred
= false;
7925 OpSequencer::q_list_t releasing_txc
;
7927 std::lock_guard
<std::mutex
> l(osr
->qlock
);
7928 txc
->state
= TransContext::STATE_DONE
;
7929 bool notify
= false;
7930 while (!osr
->q
.empty()) {
7931 TransContext
*txc
= &osr
->q
.front();
7932 dout(20) << __func__
<< " txc " << txc
<< " " << txc
->get_state_name()
7934 if (txc
->state
!= TransContext::STATE_DONE
) {
7935 if (txc
->state
== TransContext::STATE_PREPARE
&&
7936 deferred_aggressive
) {
7937 // for _osr_drain_preceding()
7940 if (txc
->state
== TransContext::STATE_DEFERRED_QUEUED
&&
7941 osr
->q
.size() > g_conf
->bluestore_max_deferred_txc
) {
7942 submit_deferred
= true;
7947 if (!c
&& txc
->first_collection
) {
7948 c
= txc
->first_collection
;
7951 releasing_txc
.push_back(*txc
);
7955 osr
->qcond
.notify_all();
7957 if (osr
->q
.empty()) {
7958 dout(20) << __func__
<< " osr " << osr
<< " q now empty" << dendl
;
7962 while (!releasing_txc
.empty()) {
7963 // release to allocator only after all preceding txc's have also
7964 // finished any deferred writes that potentially land in these
7966 auto txc
= &releasing_txc
.front();
7967 _txc_release_alloc(txc
);
7968 releasing_txc
.pop_front();
7969 txc
->log_state_latency(logger
, l_bluestore_state_done_lat
);
7973 if (submit_deferred
) {
7974 // we're pinning memory; flush! we could be more fine-grained here but
7975 // i'm not sure it's worth the bother.
7976 deferred_try_submit();
7979 if (empty
&& osr
->zombie
) {
7980 dout(10) << __func__
<< " reaping empty zombie osr " << osr
<< dendl
;
7985 void BlueStore::_txc_release_alloc(TransContext
*txc
)
7987 // update allocator with full released set
7988 if (!cct
->_conf
->bluestore_debug_no_reuse_blocks
) {
7989 dout(10) << __func__
<< " " << txc
<< " " << txc
->released
<< dendl
;
7990 for (interval_set
<uint64_t>::iterator p
= txc
->released
.begin();
7991 p
!= txc
->released
.end();
7993 alloc
->release(p
.get_start(), p
.get_len());
7997 txc
->allocated
.clear();
7998 txc
->released
.clear();
8001 void BlueStore::_osr_drain_preceding(TransContext
*txc
)
8003 OpSequencer
*osr
= txc
->osr
.get();
8004 dout(10) << __func__
<< " " << txc
<< " osr " << osr
<< dendl
;
8005 ++deferred_aggressive
; // FIXME: maybe osr-local aggressive flag?
8007 // submit anything pending
8008 std::lock_guard
<std::mutex
> l(deferred_lock
);
8009 if (osr
->deferred_pending
) {
8010 _deferred_submit(osr
);
8014 // wake up any previously finished deferred events
8015 std::lock_guard
<std::mutex
> l(kv_lock
);
8016 kv_cond
.notify_one();
8018 osr
->drain_preceding(txc
);
8019 --deferred_aggressive
;
8020 dout(10) << __func__
<< " " << osr
<< " done" << dendl
;
8023 void BlueStore::_osr_drain_all()
8025 dout(10) << __func__
<< dendl
;
8027 set
<OpSequencerRef
> s
;
8029 std::lock_guard
<std::mutex
> l(osr_lock
);
8032 dout(20) << __func__
<< " osr_set " << s
<< dendl
;
8034 ++deferred_aggressive
;
8036 // submit anything pending
8037 std::lock_guard
<std::mutex
> l(deferred_lock
);
8038 _deferred_try_submit();
8041 // wake up any previously finished deferred events
8042 std::lock_guard
<std::mutex
> l(kv_lock
);
8043 kv_cond
.notify_one();
8046 std::lock_guard
<std::mutex
> l(kv_finalize_lock
);
8047 kv_finalize_cond
.notify_one();
8049 for (auto osr
: s
) {
8050 dout(20) << __func__
<< " drain " << osr
<< dendl
;
8053 --deferred_aggressive
;
8055 dout(10) << __func__
<< " done" << dendl
;
8058 void BlueStore::_osr_unregister_all()
8060 set
<OpSequencerRef
> s
;
8062 std::lock_guard
<std::mutex
> l(osr_lock
);
8065 dout(10) << __func__
<< " " << s
<< dendl
;
8066 for (auto osr
: s
) {
8070 // break link from Sequencer to us so that this OpSequencer
8071 // instance can die with this mount/umount cycle. note that
8072 // we assume umount() will not race against ~Sequencer.
8073 assert(osr
->parent
);
8074 osr
->parent
->p
.reset();
8077 // nobody should be creating sequencers during umount either.
8079 std::lock_guard
<std::mutex
> l(osr_lock
);
8080 assert(osr_set
.empty());
8084 void BlueStore::_kv_start()
8086 dout(10) << __func__
<< dendl
;
8088 if (cct
->_conf
->bluestore_shard_finishers
) {
8089 if (cct
->_conf
->osd_op_num_shards
) {
8090 m_finisher_num
= cct
->_conf
->osd_op_num_shards
;
8093 if (bdev
->is_rotational()) {
8094 m_finisher_num
= cct
->_conf
->osd_op_num_shards_hdd
;
8096 m_finisher_num
= cct
->_conf
->osd_op_num_shards_ssd
;
8101 assert(m_finisher_num
!= 0);
8103 for (int i
= 0; i
< m_finisher_num
; ++i
) {
8105 oss
<< "finisher-" << i
;
8106 Finisher
*f
= new Finisher(cct
, oss
.str(), "finisher");
8107 finishers
.push_back(f
);
8110 for (auto f
: finishers
) {
8113 kv_sync_thread
.create("bstore_kv_sync");
8114 kv_finalize_thread
.create("bstore_kv_final");
8117 void BlueStore::_kv_stop()
8119 dout(10) << __func__
<< dendl
;
8121 std::unique_lock
<std::mutex
> l(kv_lock
);
8122 while (!kv_sync_started
) {
8126 kv_cond
.notify_all();
8129 std::unique_lock
<std::mutex
> l(kv_finalize_lock
);
8130 while (!kv_finalize_started
) {
8131 kv_finalize_cond
.wait(l
);
8133 kv_finalize_stop
= true;
8134 kv_finalize_cond
.notify_all();
8136 kv_sync_thread
.join();
8137 kv_finalize_thread
.join();
8139 std::lock_guard
<std::mutex
> l(kv_lock
);
8143 std::lock_guard
<std::mutex
> l(kv_finalize_lock
);
8144 kv_finalize_stop
= false;
8146 dout(10) << __func__
<< " stopping finishers" << dendl
;
8147 for (auto f
: finishers
) {
8148 f
->wait_for_empty();
8151 dout(10) << __func__
<< " stopped" << dendl
;
8154 void BlueStore::_kv_sync_thread()
8156 dout(10) << __func__
<< " start" << dendl
;
8157 std::unique_lock
<std::mutex
> l(kv_lock
);
8158 assert(!kv_sync_started
);
8159 kv_sync_started
= true;
8160 kv_cond
.notify_all();
8162 assert(kv_committing
.empty());
8163 if (kv_queue
.empty() &&
8164 ((deferred_done_queue
.empty() && deferred_stable_queue
.empty()) ||
8165 !deferred_aggressive
)) {
8168 dout(20) << __func__
<< " sleep" << dendl
;
8170 dout(20) << __func__
<< " wake" << dendl
;
8172 deque
<TransContext
*> kv_submitting
;
8173 deque
<DeferredBatch
*> deferred_done
, deferred_stable
;
8174 uint64_t aios
= 0, costs
= 0;
8176 dout(20) << __func__
<< " committing " << kv_queue
.size()
8177 << " submitting " << kv_queue_unsubmitted
.size()
8178 << " deferred done " << deferred_done_queue
.size()
8179 << " stable " << deferred_stable_queue
.size()
8181 kv_committing
.swap(kv_queue
);
8182 kv_submitting
.swap(kv_queue_unsubmitted
);
8183 deferred_done
.swap(deferred_done_queue
);
8184 deferred_stable
.swap(deferred_stable_queue
);
8186 costs
= kv_throttle_costs
;
8188 kv_throttle_costs
= 0;
8189 utime_t start
= ceph_clock_now();
8192 dout(30) << __func__
<< " committing " << kv_committing
<< dendl
;
8193 dout(30) << __func__
<< " submitting " << kv_submitting
<< dendl
;
8194 dout(30) << __func__
<< " deferred_done " << deferred_done
<< dendl
;
8195 dout(30) << __func__
<< " deferred_stable " << deferred_stable
<< dendl
;
8197 bool force_flush
= false;
8198 // if bluefs is sharing the same device as data (only), then we
8199 // can rely on the bluefs commit to flush the device and make
8200 // deferred aios stable. that means that if we do have done deferred
8201 // txcs AND we are not on a single device, we need to force a flush.
8202 if (bluefs_single_shared_device
&& bluefs
) {
8205 } else if (kv_committing
.empty() && kv_submitting
.empty() &&
8206 deferred_stable
.empty()) {
8207 force_flush
= true; // there's nothing else to commit!
8208 } else if (deferred_aggressive
) {
8215 dout(20) << __func__
<< " num_aios=" << aios
8216 << " force_flush=" << (int)force_flush
8217 << ", flushing, deferred done->stable" << dendl
;
8218 // flush/barrier on block device
8221 // if we flush then deferred done are now deferred stable
8222 deferred_stable
.insert(deferred_stable
.end(), deferred_done
.begin(),
8223 deferred_done
.end());
8224 deferred_done
.clear();
8226 utime_t after_flush
= ceph_clock_now();
8228 // we will use one final transaction to force a sync
8229 KeyValueDB::Transaction synct
= db
->get_transaction();
8231 // increase {nid,blobid}_max? note that this covers both the
8232 // case where we are approaching the max and the case we passed
8233 // it. in either case, we increase the max in the earlier txn
8235 uint64_t new_nid_max
= 0, new_blobid_max
= 0;
8236 if (nid_last
+ cct
->_conf
->bluestore_nid_prealloc
/2 > nid_max
) {
8237 KeyValueDB::Transaction t
=
8238 kv_submitting
.empty() ? synct
: kv_submitting
.front()->t
;
8239 new_nid_max
= nid_last
+ cct
->_conf
->bluestore_nid_prealloc
;
8241 ::encode(new_nid_max
, bl
);
8242 t
->set(PREFIX_SUPER
, "nid_max", bl
);
8243 dout(10) << __func__
<< " new_nid_max " << new_nid_max
<< dendl
;
8245 if (blobid_last
+ cct
->_conf
->bluestore_blobid_prealloc
/2 > blobid_max
) {
8246 KeyValueDB::Transaction t
=
8247 kv_submitting
.empty() ? synct
: kv_submitting
.front()->t
;
8248 new_blobid_max
= blobid_last
+ cct
->_conf
->bluestore_blobid_prealloc
;
8250 ::encode(new_blobid_max
, bl
);
8251 t
->set(PREFIX_SUPER
, "blobid_max", bl
);
8252 dout(10) << __func__
<< " new_blobid_max " << new_blobid_max
<< dendl
;
8254 for (auto txc
: kv_submitting
) {
8255 assert(txc
->state
== TransContext::STATE_KV_QUEUED
);
8256 txc
->log_state_latency(logger
, l_bluestore_state_kv_queued_lat
);
8257 int r
= cct
->_conf
->bluestore_debug_omit_kv_commit
? 0 : db
->submit_transaction(txc
->t
);
8259 _txc_applied_kv(txc
);
8260 --txc
->osr
->kv_committing_serially
;
8261 txc
->state
= TransContext::STATE_KV_SUBMITTED
;
8262 if (txc
->osr
->kv_submitted_waiters
) {
8263 std::lock_guard
<std::mutex
> l(txc
->osr
->qlock
);
8264 if (txc
->osr
->_is_all_kv_submitted()) {
8265 txc
->osr
->qcond
.notify_all();
8269 for (auto txc
: kv_committing
) {
8271 --txc
->osr
->txc_with_unstable_io
;
8273 txc
->log_state_latency(logger
, l_bluestore_state_kv_queued_lat
);
8276 // release throttle *before* we commit. this allows new ops
8277 // to be prepared and enter pipeline while we are waiting on
8278 // the kv commit sync/flush. then hopefully on the next
8279 // iteration there will already be ops awake. otherwise, we
8280 // end up going to sleep, and then wake up when the very first
8281 // transaction is ready for commit.
8282 throttle_bytes
.put(costs
);
8284 PExtentVector bluefs_gift_extents
;
8286 after_flush
- bluefs_last_balance
>
8287 cct
->_conf
->bluestore_bluefs_balance_interval
) {
8288 bluefs_last_balance
= after_flush
;
8289 int r
= _balance_bluefs_freespace(&bluefs_gift_extents
);
8292 for (auto& p
: bluefs_gift_extents
) {
8293 bluefs_extents
.insert(p
.offset
, p
.length
);
8296 ::encode(bluefs_extents
, bl
);
8297 dout(10) << __func__
<< " bluefs_extents now 0x" << std::hex
8298 << bluefs_extents
<< std::dec
<< dendl
;
8299 synct
->set(PREFIX_SUPER
, "bluefs_extents", bl
);
8303 // cleanup sync deferred keys
8304 for (auto b
: deferred_stable
) {
8305 for (auto& txc
: b
->txcs
) {
8306 bluestore_deferred_transaction_t
& wt
= *txc
.deferred_txn
;
8307 if (!wt
.released
.empty()) {
8308 // kraken replay compat only
8309 txc
.released
= wt
.released
;
8310 dout(10) << __func__
<< " deferred txn has released "
8312 << " (we just upgraded from kraken) on " << &txc
<< dendl
;
8313 _txc_finalize_kv(&txc
, synct
);
8315 // cleanup the deferred
8317 get_deferred_key(wt
.seq
, &key
);
8318 synct
->rm_single_key(PREFIX_DEFERRED
, key
);
8322 // submit synct synchronously (block and wait for it to commit)
8323 int r
= cct
->_conf
->bluestore_debug_omit_kv_commit
? 0 : db
->submit_transaction_sync(synct
);
8327 nid_max
= new_nid_max
;
8328 dout(10) << __func__
<< " nid_max now " << nid_max
<< dendl
;
8330 if (new_blobid_max
) {
8331 blobid_max
= new_blobid_max
;
8332 dout(10) << __func__
<< " blobid_max now " << blobid_max
<< dendl
;
8335 utime_t finish
= ceph_clock_now();
8336 utime_t dur_flush
= after_flush
- start
;
8337 utime_t dur_kv
= finish
- after_flush
;
8338 utime_t dur
= finish
- start
;
8339 dout(20) << __func__
<< " committed " << kv_committing
.size()
8340 << " cleaned " << deferred_stable
.size()
8342 << " (" << dur_flush
<< " flush + " << dur_kv
<< " kv commit)"
8345 logger
->tinc(l_bluestore_kv_flush_lat
, dur_flush
);
8346 logger
->tinc(l_bluestore_kv_commit_lat
, dur_kv
);
8347 logger
->tinc(l_bluestore_kv_lat
, dur
);
8351 if (!bluefs_gift_extents
.empty()) {
8352 _commit_bluefs_freespace(bluefs_gift_extents
);
8354 for (auto p
= bluefs_extents_reclaiming
.begin();
8355 p
!= bluefs_extents_reclaiming
.end();
8357 dout(20) << __func__
<< " releasing old bluefs 0x" << std::hex
8358 << p
.get_start() << "~" << p
.get_len() << std::dec
8360 alloc
->release(p
.get_start(), p
.get_len());
8362 bluefs_extents_reclaiming
.clear();
8366 std::unique_lock
<std::mutex
> m(kv_finalize_lock
);
8367 if (kv_committing_to_finalize
.empty()) {
8368 kv_committing_to_finalize
.swap(kv_committing
);
8370 kv_committing_to_finalize
.insert(
8371 kv_committing_to_finalize
.end(),
8372 kv_committing
.begin(),
8373 kv_committing
.end());
8374 kv_committing
.clear();
8376 if (deferred_stable_to_finalize
.empty()) {
8377 deferred_stable_to_finalize
.swap(deferred_stable
);
8379 deferred_stable_to_finalize
.insert(
8380 deferred_stable_to_finalize
.end(),
8381 deferred_stable
.begin(),
8382 deferred_stable
.end());
8383 deferred_stable
.clear();
8385 kv_finalize_cond
.notify_one();
8389 // previously deferred "done" are now "stable" by virtue of this
8391 deferred_stable_queue
.swap(deferred_done
);
8394 dout(10) << __func__
<< " finish" << dendl
;
8395 kv_sync_started
= false;
8398 void BlueStore::_kv_finalize_thread()
8400 deque
<TransContext
*> kv_committed
;
8401 deque
<DeferredBatch
*> deferred_stable
;
8402 dout(10) << __func__
<< " start" << dendl
;
8403 std::unique_lock
<std::mutex
> l(kv_finalize_lock
);
8404 assert(!kv_finalize_started
);
8405 kv_finalize_started
= true;
8406 kv_finalize_cond
.notify_all();
8408 assert(kv_committed
.empty());
8409 assert(deferred_stable
.empty());
8410 if (kv_committing_to_finalize
.empty() &&
8411 deferred_stable_to_finalize
.empty()) {
8412 if (kv_finalize_stop
)
8414 dout(20) << __func__
<< " sleep" << dendl
;
8415 kv_finalize_cond
.wait(l
);
8416 dout(20) << __func__
<< " wake" << dendl
;
8418 kv_committed
.swap(kv_committing_to_finalize
);
8419 deferred_stable
.swap(deferred_stable_to_finalize
);
8421 dout(20) << __func__
<< " kv_committed " << kv_committed
<< dendl
;
8422 dout(20) << __func__
<< " deferred_stable " << deferred_stable
<< dendl
;
8424 while (!kv_committed
.empty()) {
8425 TransContext
*txc
= kv_committed
.front();
8426 assert(txc
->state
== TransContext::STATE_KV_SUBMITTED
);
8427 _txc_state_proc(txc
);
8428 kv_committed
.pop_front();
8431 for (auto b
: deferred_stable
) {
8432 auto p
= b
->txcs
.begin();
8433 while (p
!= b
->txcs
.end()) {
8434 TransContext
*txc
= &*p
;
8435 p
= b
->txcs
.erase(p
); // unlink here because
8436 _txc_state_proc(txc
); // this may destroy txc
8440 deferred_stable
.clear();
8442 if (!deferred_aggressive
) {
8443 std::lock_guard
<std::mutex
> l(deferred_lock
);
8444 if (deferred_queue_size
>= deferred_batch_ops
.load() ||
8445 throttle_deferred_bytes
.past_midpoint()) {
8446 _deferred_try_submit();
8450 // this is as good a place as any ...
8451 _reap_collections();
8456 dout(10) << __func__
<< " finish" << dendl
;
8457 kv_finalize_started
= false;
8460 bluestore_deferred_op_t
*BlueStore::_get_deferred_op(
8461 TransContext
*txc
, OnodeRef o
)
8463 if (!txc
->deferred_txn
) {
8464 txc
->deferred_txn
= new bluestore_deferred_transaction_t
;
8466 txc
->deferred_txn
->ops
.push_back(bluestore_deferred_op_t());
8467 return &txc
->deferred_txn
->ops
.back();
8470 void BlueStore::_deferred_queue(TransContext
*txc
)
8472 dout(20) << __func__
<< " txc " << txc
<< " osr " << txc
->osr
<< dendl
;
8473 std::lock_guard
<std::mutex
> l(deferred_lock
);
8474 if (!txc
->osr
->deferred_pending
&&
8475 !txc
->osr
->deferred_running
) {
8476 deferred_queue
.push_back(*txc
->osr
);
8478 if (!txc
->osr
->deferred_pending
) {
8479 txc
->osr
->deferred_pending
= new DeferredBatch(cct
, txc
->osr
.get());
8481 ++deferred_queue_size
;
8482 txc
->osr
->deferred_pending
->txcs
.push_back(*txc
);
8483 bluestore_deferred_transaction_t
& wt
= *txc
->deferred_txn
;
8484 for (auto opi
= wt
.ops
.begin(); opi
!= wt
.ops
.end(); ++opi
) {
8485 const auto& op
= *opi
;
8486 assert(op
.op
== bluestore_deferred_op_t::OP_WRITE
);
8487 bufferlist::const_iterator p
= op
.data
.begin();
8488 for (auto e
: op
.extents
) {
8489 txc
->osr
->deferred_pending
->prepare_write(
8490 cct
, wt
.seq
, e
.offset
, e
.length
, p
);
8493 if (deferred_aggressive
&&
8494 !txc
->osr
->deferred_running
) {
8495 _deferred_submit(txc
->osr
.get());
8499 void BlueStore::_deferred_try_submit()
8501 dout(20) << __func__
<< " " << deferred_queue
.size() << " osrs, "
8502 << deferred_queue_size
<< " txcs" << dendl
;
8503 for (auto& osr
: deferred_queue
) {
8504 if (!osr
.deferred_running
) {
8505 _deferred_submit(&osr
);
8510 void BlueStore::_deferred_submit(OpSequencer
*osr
)
8512 dout(10) << __func__
<< " osr " << osr
8513 << " " << osr
->deferred_pending
->iomap
.size() << " ios pending "
8515 assert(osr
->deferred_pending
);
8516 assert(!osr
->deferred_running
);
8518 auto b
= osr
->deferred_pending
;
8519 deferred_queue_size
-= b
->seq_bytes
.size();
8520 assert(deferred_queue_size
>= 0);
8522 osr
->deferred_running
= osr
->deferred_pending
;
8523 osr
->deferred_pending
= nullptr;
8525 uint64_t start
= 0, pos
= 0;
8527 auto i
= b
->iomap
.begin();
8529 if (i
== b
->iomap
.end() || i
->first
!= pos
) {
8531 dout(20) << __func__
<< " write 0x" << std::hex
8532 << start
<< "~" << bl
.length()
8533 << " crc " << bl
.crc32c(-1) << std::dec
<< dendl
;
8534 if (!g_conf
->bluestore_debug_omit_block_device_write
) {
8535 logger
->inc(l_bluestore_deferred_write_ops
);
8536 logger
->inc(l_bluestore_deferred_write_bytes
, bl
.length());
8537 int r
= bdev
->aio_write(start
, bl
, &b
->ioc
, false);
8541 if (i
== b
->iomap
.end()) {
8548 dout(20) << __func__
<< " seq " << i
->second
.seq
<< " 0x"
8549 << std::hex
<< pos
<< "~" << i
->second
.bl
.length() << std::dec
8554 pos
+= i
->second
.bl
.length();
8555 bl
.claim_append(i
->second
.bl
);
8558 bdev
->aio_submit(&b
->ioc
);
8561 void BlueStore::_deferred_aio_finish(OpSequencer
*osr
)
8563 dout(10) << __func__
<< " osr " << osr
<< dendl
;
8564 assert(osr
->deferred_running
);
8565 DeferredBatch
*b
= osr
->deferred_running
;
8568 std::lock_guard
<std::mutex
> l(deferred_lock
);
8569 assert(osr
->deferred_running
== b
);
8570 osr
->deferred_running
= nullptr;
8571 if (!osr
->deferred_pending
) {
8572 auto q
= deferred_queue
.iterator_to(*osr
);
8573 deferred_queue
.erase(q
);
8574 } else if (deferred_aggressive
) {
8575 _deferred_submit(osr
);
8581 std::lock_guard
<std::mutex
> l2(osr
->qlock
);
8582 for (auto& i
: b
->txcs
) {
8583 TransContext
*txc
= &i
;
8584 txc
->state
= TransContext::STATE_DEFERRED_CLEANUP
;
8587 osr
->qcond
.notify_all();
8588 throttle_deferred_bytes
.put(costs
);
8589 std::lock_guard
<std::mutex
> l(kv_lock
);
8590 deferred_done_queue
.emplace_back(b
);
8593 // in the normal case, do not bother waking up the kv thread; it will
8594 // catch us on the next commit anyway.
8595 if (deferred_aggressive
) {
8596 std::lock_guard
<std::mutex
> l(kv_lock
);
8597 kv_cond
.notify_one();
8601 int BlueStore::_deferred_replay()
8603 dout(10) << __func__
<< " start" << dendl
;
8604 OpSequencerRef osr
= new OpSequencer(cct
, this);
8607 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_DEFERRED
);
8608 for (it
->lower_bound(string()); it
->valid(); it
->next(), ++count
) {
8609 dout(20) << __func__
<< " replay " << pretty_binary_string(it
->key())
8611 bluestore_deferred_transaction_t
*deferred_txn
=
8612 new bluestore_deferred_transaction_t
;
8613 bufferlist bl
= it
->value();
8614 bufferlist::iterator p
= bl
.begin();
8616 ::decode(*deferred_txn
, p
);
8617 } catch (buffer::error
& e
) {
8618 derr
<< __func__
<< " failed to decode deferred txn "
8619 << pretty_binary_string(it
->key()) << dendl
;
8620 delete deferred_txn
;
8624 TransContext
*txc
= _txc_create(osr
.get());
8625 txc
->deferred_txn
= deferred_txn
;
8626 txc
->state
= TransContext::STATE_KV_DONE
;
8627 _txc_state_proc(txc
);
8630 dout(20) << __func__
<< " draining osr" << dendl
;
8633 dout(10) << __func__
<< " completed " << count
<< " events" << dendl
;
8637 // ---------------------------
8640 int BlueStore::queue_transactions(
8642 vector
<Transaction
>& tls
,
8644 ThreadPool::TPHandle
*handle
)
8647 Context
*onreadable
;
8649 Context
*onreadable_sync
;
8650 ObjectStore::Transaction::collect_contexts(
8651 tls
, &onreadable
, &ondisk
, &onreadable_sync
);
8653 if (cct
->_conf
->objectstore_blackhole
) {
8654 dout(0) << __func__
<< " objectstore_blackhole = TRUE, dropping transaction"
8658 delete onreadable_sync
;
8661 utime_t start
= ceph_clock_now();
8662 // set up the sequencer
8666 osr
= static_cast<OpSequencer
*>(posr
->p
.get());
8667 dout(10) << __func__
<< " existing " << osr
<< " " << *osr
<< dendl
;
8669 osr
= new OpSequencer(cct
, this);
8672 dout(10) << __func__
<< " new " << osr
<< " " << *osr
<< dendl
;
8676 TransContext
*txc
= _txc_create(osr
);
8677 txc
->onreadable
= onreadable
;
8678 txc
->onreadable_sync
= onreadable_sync
;
8679 txc
->oncommit
= ondisk
;
8681 for (vector
<Transaction
>::iterator p
= tls
.begin(); p
!= tls
.end(); ++p
) {
8683 txc
->bytes
+= (*p
).get_num_bytes();
8684 _txc_add_transaction(txc
, &(*p
));
8686 _txc_calc_cost(txc
);
8688 _txc_write_nodes(txc
, txc
->t
);
8690 // journal deferred items
8691 if (txc
->deferred_txn
) {
8692 txc
->deferred_txn
->seq
= ++deferred_seq
;
8694 ::encode(*txc
->deferred_txn
, bl
);
8696 get_deferred_key(txc
->deferred_txn
->seq
, &key
);
8697 txc
->t
->set(PREFIX_DEFERRED
, key
, bl
);
8700 _txc_finalize_kv(txc
, txc
->t
);
8702 handle
->suspend_tp_timeout();
8704 utime_t tstart
= ceph_clock_now();
8705 throttle_bytes
.get(txc
->cost
);
8706 if (txc
->deferred_txn
) {
8707 // ensure we do not block here because of deferred writes
8708 if (!throttle_deferred_bytes
.get_or_fail(txc
->cost
)) {
8709 deferred_try_submit();
8710 throttle_deferred_bytes
.get(txc
->cost
);
8713 utime_t tend
= ceph_clock_now();
8716 handle
->reset_tp_timeout();
8718 logger
->inc(l_bluestore_txc
);
8721 _txc_state_proc(txc
);
8723 logger
->tinc(l_bluestore_submit_lat
, ceph_clock_now() - start
);
8724 logger
->tinc(l_bluestore_throttle_lat
, tend
- tstart
);
8728 void BlueStore::_txc_aio_submit(TransContext
*txc
)
8730 dout(10) << __func__
<< " txc " << txc
<< dendl
;
8731 bdev
->aio_submit(&txc
->ioc
);
8734 void BlueStore::_txc_add_transaction(TransContext
*txc
, Transaction
*t
)
8736 Transaction::iterator i
= t
->begin();
8738 _dump_transaction(t
);
8740 vector
<CollectionRef
> cvec(i
.colls
.size());
8742 for (vector
<coll_t
>::iterator p
= i
.colls
.begin(); p
!= i
.colls
.end();
8744 cvec
[j
] = _get_collection(*p
);
8746 // note first collection we reference
8747 if (!txc
->first_collection
)
8748 txc
->first_collection
= cvec
[j
];
8750 vector
<OnodeRef
> ovec(i
.objects
.size());
8752 for (int pos
= 0; i
.have_op(); ++pos
) {
8753 Transaction::Op
*op
= i
.decode_op();
8757 if (op
->op
== Transaction::OP_NOP
)
8760 // collection operations
8761 CollectionRef
&c
= cvec
[op
->cid
];
8763 case Transaction::OP_RMCOLL
:
8765 const coll_t
&cid
= i
.get_cid(op
->cid
);
8766 r
= _remove_collection(txc
, cid
, &c
);
8772 case Transaction::OP_MKCOLL
:
8775 const coll_t
&cid
= i
.get_cid(op
->cid
);
8776 r
= _create_collection(txc
, cid
, op
->split_bits
, &c
);
8782 case Transaction::OP_SPLIT_COLLECTION
:
8783 assert(0 == "deprecated");
8786 case Transaction::OP_SPLIT_COLLECTION2
:
8788 uint32_t bits
= op
->split_bits
;
8789 uint32_t rem
= op
->split_rem
;
8790 r
= _split_collection(txc
, c
, cvec
[op
->dest_cid
], bits
, rem
);
8796 case Transaction::OP_COLL_HINT
:
8798 uint32_t type
= op
->hint_type
;
8801 bufferlist::iterator hiter
= hint
.begin();
8802 if (type
== Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS
) {
8805 ::decode(pg_num
, hiter
);
8806 ::decode(num_objs
, hiter
);
8807 dout(10) << __func__
<< " collection hint objects is a no-op, "
8808 << " pg_num " << pg_num
<< " num_objects " << num_objs
8812 dout(10) << __func__
<< " unknown collection hint " << type
<< dendl
;
8818 case Transaction::OP_COLL_SETATTR
:
8822 case Transaction::OP_COLL_RMATTR
:
8826 case Transaction::OP_COLL_RENAME
:
8827 assert(0 == "not implemented");
8831 derr
<< __func__
<< " error " << cpp_strerror(r
)
8832 << " not handled on operation " << op
->op
8833 << " (op " << pos
<< ", counting from 0)" << dendl
;
8834 _dump_transaction(t
, 0);
8835 assert(0 == "unexpected error");
8838 // these operations implicity create the object
8839 bool create
= false;
8840 if (op
->op
== Transaction::OP_TOUCH
||
8841 op
->op
== Transaction::OP_WRITE
||
8842 op
->op
== Transaction::OP_ZERO
) {
8846 // object operations
8847 RWLock::WLocker
l(c
->lock
);
8848 OnodeRef
&o
= ovec
[op
->oid
];
8850 ghobject_t oid
= i
.get_oid(op
->oid
);
8851 o
= c
->get_onode(oid
, create
);
8853 if (!create
&& (!o
|| !o
->exists
)) {
8854 dout(10) << __func__
<< " op " << op
->op
<< " got ENOENT on "
8855 << i
.get_oid(op
->oid
) << dendl
;
8861 case Transaction::OP_TOUCH
:
8862 r
= _touch(txc
, c
, o
);
8865 case Transaction::OP_WRITE
:
8867 uint64_t off
= op
->off
;
8868 uint64_t len
= op
->len
;
8869 uint32_t fadvise_flags
= i
.get_fadvise_flags();
8872 r
= _write(txc
, c
, o
, off
, len
, bl
, fadvise_flags
);
8876 case Transaction::OP_ZERO
:
8878 uint64_t off
= op
->off
;
8879 uint64_t len
= op
->len
;
8880 r
= _zero(txc
, c
, o
, off
, len
);
8884 case Transaction::OP_TRIMCACHE
:
8886 // deprecated, no-op
8890 case Transaction::OP_TRUNCATE
:
8892 uint64_t off
= op
->off
;
8893 _truncate(txc
, c
, o
, off
);
8897 case Transaction::OP_REMOVE
:
8899 r
= _remove(txc
, c
, o
);
8903 case Transaction::OP_SETATTR
:
8905 string name
= i
.decode_string();
8908 r
= _setattr(txc
, c
, o
, name
, bp
);
8912 case Transaction::OP_SETATTRS
:
8914 map
<string
, bufferptr
> aset
;
8915 i
.decode_attrset(aset
);
8916 r
= _setattrs(txc
, c
, o
, aset
);
8920 case Transaction::OP_RMATTR
:
8922 string name
= i
.decode_string();
8923 r
= _rmattr(txc
, c
, o
, name
);
8927 case Transaction::OP_RMATTRS
:
8929 r
= _rmattrs(txc
, c
, o
);
8933 case Transaction::OP_CLONE
:
8935 OnodeRef
& no
= ovec
[op
->dest_oid
];
8937 const ghobject_t
& noid
= i
.get_oid(op
->dest_oid
);
8938 no
= c
->get_onode(noid
, true);
8940 r
= _clone(txc
, c
, o
, no
);
8944 case Transaction::OP_CLONERANGE
:
8945 assert(0 == "deprecated");
8948 case Transaction::OP_CLONERANGE2
:
8950 OnodeRef
& no
= ovec
[op
->dest_oid
];
8952 const ghobject_t
& noid
= i
.get_oid(op
->dest_oid
);
8953 no
= c
->get_onode(noid
, true);
8955 uint64_t srcoff
= op
->off
;
8956 uint64_t len
= op
->len
;
8957 uint64_t dstoff
= op
->dest_off
;
8958 r
= _clone_range(txc
, c
, o
, no
, srcoff
, len
, dstoff
);
8962 case Transaction::OP_COLL_ADD
:
8963 assert(0 == "not implemented");
8966 case Transaction::OP_COLL_REMOVE
:
8967 assert(0 == "not implemented");
8970 case Transaction::OP_COLL_MOVE
:
8971 assert(0 == "deprecated");
8974 case Transaction::OP_COLL_MOVE_RENAME
:
8975 case Transaction::OP_TRY_RENAME
:
8977 assert(op
->cid
== op
->dest_cid
);
8978 const ghobject_t
& noid
= i
.get_oid(op
->dest_oid
);
8979 OnodeRef
& no
= ovec
[op
->dest_oid
];
8981 no
= c
->get_onode(noid
, false);
8983 r
= _rename(txc
, c
, o
, no
, noid
);
8987 case Transaction::OP_OMAP_CLEAR
:
8989 r
= _omap_clear(txc
, c
, o
);
8992 case Transaction::OP_OMAP_SETKEYS
:
8995 i
.decode_attrset_bl(&aset_bl
);
8996 r
= _omap_setkeys(txc
, c
, o
, aset_bl
);
8999 case Transaction::OP_OMAP_RMKEYS
:
9002 i
.decode_keyset_bl(&keys_bl
);
9003 r
= _omap_rmkeys(txc
, c
, o
, keys_bl
);
9006 case Transaction::OP_OMAP_RMKEYRANGE
:
9009 first
= i
.decode_string();
9010 last
= i
.decode_string();
9011 r
= _omap_rmkey_range(txc
, c
, o
, first
, last
);
9014 case Transaction::OP_OMAP_SETHEADER
:
9018 r
= _omap_setheader(txc
, c
, o
, bl
);
9022 case Transaction::OP_SETALLOCHINT
:
9024 r
= _set_alloc_hint(txc
, c
, o
,
9025 op
->expected_object_size
,
9026 op
->expected_write_size
,
9027 op
->alloc_hint_flags
);
9032 derr
<< __func__
<< "bad op " << op
->op
<< dendl
;
9040 if (r
== -ENOENT
&& !(op
->op
== Transaction::OP_CLONERANGE
||
9041 op
->op
== Transaction::OP_CLONE
||
9042 op
->op
== Transaction::OP_CLONERANGE2
||
9043 op
->op
== Transaction::OP_COLL_ADD
||
9044 op
->op
== Transaction::OP_SETATTR
||
9045 op
->op
== Transaction::OP_SETATTRS
||
9046 op
->op
== Transaction::OP_RMATTR
||
9047 op
->op
== Transaction::OP_OMAP_SETKEYS
||
9048 op
->op
== Transaction::OP_OMAP_RMKEYS
||
9049 op
->op
== Transaction::OP_OMAP_RMKEYRANGE
||
9050 op
->op
== Transaction::OP_OMAP_SETHEADER
))
9051 // -ENOENT is usually okay
9057 const char *msg
= "unexpected error code";
9059 if (r
== -ENOENT
&& (op
->op
== Transaction::OP_CLONERANGE
||
9060 op
->op
== Transaction::OP_CLONE
||
9061 op
->op
== Transaction::OP_CLONERANGE2
))
9062 msg
= "ENOENT on clone suggests osd bug";
9065 // For now, if we hit _any_ ENOSPC, crash, before we do any damage
9066 // by partially applying transactions.
9067 msg
= "ENOSPC from bluestore, misconfigured cluster";
9069 if (r
== -ENOTEMPTY
) {
9070 msg
= "ENOTEMPTY suggests garbage data in osd data dir";
9073 derr
<< __func__
<< " error " << cpp_strerror(r
)
9074 << " not handled on operation " << op
->op
9075 << " (op " << pos
<< ", counting from 0)"
9077 derr
<< msg
<< dendl
;
9078 _dump_transaction(t
, 0);
9079 assert(0 == "unexpected error");
9087 // -----------------
9090 int BlueStore::_touch(TransContext
*txc
,
9094 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
9097 _assign_nid(txc
, o
);
9098 txc
->write_onode(o
);
9099 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
9103 void BlueStore::_dump_onode(OnodeRef o
, int log_level
)
9105 if (!cct
->_conf
->subsys
.should_gather(ceph_subsys_bluestore
, log_level
))
9107 dout(log_level
) << __func__
<< " " << o
<< " " << o
->oid
9108 << " nid " << o
->onode
.nid
9109 << " size 0x" << std::hex
<< o
->onode
.size
9110 << " (" << std::dec
<< o
->onode
.size
<< ")"
9111 << " expected_object_size " << o
->onode
.expected_object_size
9112 << " expected_write_size " << o
->onode
.expected_write_size
9113 << " in " << o
->onode
.extent_map_shards
.size() << " shards"
9114 << ", " << o
->extent_map
.spanning_blob_map
.size()
9115 << " spanning blobs"
9117 for (auto p
= o
->onode
.attrs
.begin();
9118 p
!= o
->onode
.attrs
.end();
9120 dout(log_level
) << __func__
<< " attr " << p
->first
9121 << " len " << p
->second
.length() << dendl
;
9123 _dump_extent_map(o
->extent_map
, log_level
);
9126 void BlueStore::_dump_extent_map(ExtentMap
&em
, int log_level
)
9129 for (auto& s
: em
.shards
) {
9130 dout(log_level
) << __func__
<< " shard " << *s
.shard_info
9131 << (s
.loaded
? " (loaded)" : "")
9132 << (s
.dirty
? " (dirty)" : "")
9135 for (auto& e
: em
.extent_map
) {
9136 dout(log_level
) << __func__
<< " " << e
<< dendl
;
9137 assert(e
.logical_offset
>= pos
);
9138 pos
= e
.logical_offset
+ e
.length
;
9139 const bluestore_blob_t
& blob
= e
.blob
->get_blob();
9140 if (blob
.has_csum()) {
9142 unsigned n
= blob
.get_csum_count();
9143 for (unsigned i
= 0; i
< n
; ++i
)
9144 v
.push_back(blob
.get_csum_item(i
));
9145 dout(log_level
) << __func__
<< " csum: " << std::hex
<< v
<< std::dec
9148 std::lock_guard
<std::recursive_mutex
> l(e
.blob
->shared_blob
->get_cache()->lock
);
9149 for (auto& i
: e
.blob
->shared_blob
->bc
.buffer_map
) {
9150 dout(log_level
) << __func__
<< " 0x" << std::hex
<< i
.first
9151 << "~" << i
.second
->length
<< std::dec
9152 << " " << *i
.second
<< dendl
;
9157 void BlueStore::_dump_transaction(Transaction
*t
, int log_level
)
9159 dout(log_level
) << " transaction dump:\n";
9160 JSONFormatter
f(true);
9161 f
.open_object_section("transaction");
9168 void BlueStore::_pad_zeros(
9169 bufferlist
*bl
, uint64_t *offset
,
9170 uint64_t chunk_size
)
9172 auto length
= bl
->length();
9173 dout(30) << __func__
<< " 0x" << std::hex
<< *offset
<< "~" << length
9174 << " chunk_size 0x" << chunk_size
<< std::dec
<< dendl
;
9175 dout(40) << "before:\n";
9176 bl
->hexdump(*_dout
);
9179 size_t front_pad
= *offset
% chunk_size
;
9180 size_t back_pad
= 0;
9181 size_t pad_count
= 0;
9183 size_t front_copy
= MIN(chunk_size
- front_pad
, length
);
9184 bufferptr z
= buffer::create_page_aligned(chunk_size
);
9185 memset(z
.c_str(), 0, front_pad
);
9186 pad_count
+= front_pad
;
9187 memcpy(z
.c_str() + front_pad
, bl
->get_contiguous(0, front_copy
), front_copy
);
9188 if (front_copy
+ front_pad
< chunk_size
) {
9189 back_pad
= chunk_size
- (length
+ front_pad
);
9190 memset(z
.c_str() + front_pad
+ length
, 0, back_pad
);
9191 pad_count
+= back_pad
;
9195 t
.substr_of(old
, front_copy
, length
- front_copy
);
9197 bl
->claim_append(t
);
9198 *offset
-= front_pad
;
9199 length
+= front_pad
+ back_pad
;
9203 uint64_t end
= *offset
+ length
;
9204 unsigned back_copy
= end
% chunk_size
;
9206 assert(back_pad
== 0);
9207 back_pad
= chunk_size
- back_copy
;
9208 assert(back_copy
<= length
);
9209 bufferptr
tail(chunk_size
);
9210 memcpy(tail
.c_str(), bl
->get_contiguous(length
- back_copy
, back_copy
),
9212 memset(tail
.c_str() + back_copy
, 0, back_pad
);
9215 bl
->substr_of(old
, 0, length
- back_copy
);
9218 pad_count
+= back_pad
;
9220 dout(20) << __func__
<< " pad 0x" << std::hex
<< front_pad
<< " + 0x"
9221 << back_pad
<< " on front/back, now 0x" << *offset
<< "~"
9222 << length
<< std::dec
<< dendl
;
9223 dout(40) << "after:\n";
9224 bl
->hexdump(*_dout
);
9227 logger
->inc(l_bluestore_write_pad_bytes
, pad_count
);
9228 assert(bl
->length() == length
);
9231 void BlueStore::_do_write_small(
9235 uint64_t offset
, uint64_t length
,
9236 bufferlist::iterator
& blp
,
9239 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
9240 << std::dec
<< dendl
;
9241 assert(length
< min_alloc_size
);
9242 uint64_t end_offs
= offset
+ length
;
9244 logger
->inc(l_bluestore_write_small
);
9245 logger
->inc(l_bluestore_write_small_bytes
, length
);
9248 blp
.copy(length
, bl
);
9250 // Look for an existing mutable blob we can use.
9251 auto begin
= o
->extent_map
.extent_map
.begin();
9252 auto end
= o
->extent_map
.extent_map
.end();
9253 auto ep
= o
->extent_map
.seek_lextent(offset
);
9256 if (ep
->blob_end() <= offset
) {
9261 if (prev_ep
!= begin
) {
9264 prev_ep
= end
; // to avoid this extent check as it's a duplicate
9267 auto max_bsize
= MAX(wctx
->target_blob_size
, min_alloc_size
);
9268 auto min_off
= offset
>= max_bsize
? offset
- max_bsize
: 0;
9269 uint32_t alloc_len
= min_alloc_size
;
9270 auto offset0
= P2ALIGN(offset
, alloc_len
);
9274 // search suitable extent in both forward and reverse direction in
9275 // [offset - target_max_blob_size, offset + target_max_blob_size] range
9276 // then check if blob can be reused via try_reuse_blob func or apply
9277 // direct/deferred write (the latter for extents including or higher
9278 // than 'offset' only).
9282 if (ep
!= end
&& ep
->logical_offset
< offset
+ max_bsize
) {
9283 BlobRef b
= ep
->blob
;
9284 auto bstart
= ep
->blob_start();
9285 dout(20) << __func__
<< " considering " << *b
9286 << " bstart 0x" << std::hex
<< bstart
<< std::dec
<< dendl
;
9287 if (bstart
>= end_offs
) {
9288 dout(20) << __func__
<< " ignoring distant " << *b
<< dendl
;
9289 } else if (!b
->get_blob().is_mutable()) {
9290 dout(20) << __func__
<< " ignoring immutable " << *b
<< dendl
;
9291 } else if (ep
->logical_offset
% min_alloc_size
!=
9292 ep
->blob_offset
% min_alloc_size
) {
9293 dout(20) << __func__
<< " ignoring offset-skewed " << *b
<< dendl
;
9295 uint64_t chunk_size
= b
->get_blob().get_chunk_size(block_size
);
9296 // can we pad our head/tail out with zeros?
9297 uint64_t head_pad
, tail_pad
;
9298 head_pad
= P2PHASE(offset
, chunk_size
);
9299 tail_pad
= P2NPHASE(end_offs
, chunk_size
);
9300 if (head_pad
|| tail_pad
) {
9301 o
->extent_map
.fault_range(db
, offset
- head_pad
,
9302 end_offs
- offset
+ head_pad
+ tail_pad
);
9305 o
->extent_map
.has_any_lextents(offset
- head_pad
, chunk_size
)) {
9308 if (tail_pad
&& o
->extent_map
.has_any_lextents(end_offs
, tail_pad
)) {
9312 uint64_t b_off
= offset
- head_pad
- bstart
;
9313 uint64_t b_len
= length
+ head_pad
+ tail_pad
;
9315 // direct write into unused blocks of an existing mutable blob?
9316 if ((b_off
% chunk_size
== 0 && b_len
% chunk_size
== 0) &&
9317 b
->get_blob().get_ondisk_length() >= b_off
+ b_len
&&
9318 b
->get_blob().is_unused(b_off
, b_len
) &&
9319 b
->get_blob().is_allocated(b_off
, b_len
)) {
9321 _apply_padding(head_pad
, tail_pad
, bl
, padded
);
9323 dout(20) << __func__
<< " write to unused 0x" << std::hex
9324 << b_off
<< "~" << b_len
9325 << " pad 0x" << head_pad
<< " + 0x" << tail_pad
9326 << std::dec
<< " of mutable " << *b
<< dendl
;
9327 _buffer_cache_write(txc
, b
, b_off
, padded
,
9328 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
9330 if (!g_conf
->bluestore_debug_omit_block_device_write
) {
9331 if (b_len
<= prefer_deferred_size
) {
9332 dout(20) << __func__
<< " deferring small 0x" << std::hex
9333 << b_len
<< std::dec
<< " unused write via deferred" << dendl
;
9334 bluestore_deferred_op_t
*op
= _get_deferred_op(txc
, o
);
9335 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
9338 [&](uint64_t offset
, uint64_t length
) {
9339 op
->extents
.emplace_back(bluestore_pextent_t(offset
, length
));
9344 b
->get_blob().map_bl(
9346 [&](uint64_t offset
, bufferlist
& t
) {
9347 bdev
->aio_write(offset
, t
,
9348 &txc
->ioc
, wctx
->buffered
);
9352 b
->dirty_blob().calc_csum(b_off
, padded
);
9353 dout(20) << __func__
<< " lex old " << *ep
<< dendl
;
9354 Extent
*le
= o
->extent_map
.set_lextent(c
, offset
, b_off
+ head_pad
, length
,
9356 &wctx
->old_extents
);
9357 b
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
9358 txc
->statfs_delta
.stored() += le
->length
;
9359 dout(20) << __func__
<< " lex " << *le
<< dendl
;
9360 logger
->inc(l_bluestore_write_small_unused
);
9363 // read some data to fill out the chunk?
9364 uint64_t head_read
= P2PHASE(b_off
, chunk_size
);
9365 uint64_t tail_read
= P2NPHASE(b_off
+ b_len
, chunk_size
);
9366 if ((head_read
|| tail_read
) &&
9367 (b
->get_blob().get_ondisk_length() >= b_off
+ b_len
+ tail_read
) &&
9368 head_read
+ tail_read
< min_alloc_size
) {
9370 b_len
+= head_read
+ tail_read
;
9373 head_read
= tail_read
= 0;
9376 // chunk-aligned deferred overwrite?
9377 if (b
->get_blob().get_ondisk_length() >= b_off
+ b_len
&&
9378 b_off
% chunk_size
== 0 &&
9379 b_len
% chunk_size
== 0 &&
9380 b
->get_blob().is_allocated(b_off
, b_len
)) {
9383 _apply_padding(head_pad
, tail_pad
, bl
, padded
);
9385 dout(20) << __func__
<< " reading head 0x" << std::hex
<< head_read
9386 << " and tail 0x" << tail_read
<< std::dec
<< dendl
;
9389 int r
= _do_read(c
.get(), o
, offset
- head_pad
- head_read
, head_read
,
9391 assert(r
>= 0 && r
<= (int)head_read
);
9392 size_t zlen
= head_read
- r
;
9394 head_bl
.append_zero(zlen
);
9395 logger
->inc(l_bluestore_write_pad_bytes
, zlen
);
9397 head_bl
.claim_append(padded
);
9398 padded
.swap(head_bl
);
9399 logger
->inc(l_bluestore_write_penalty_read_ops
);
9403 int r
= _do_read(c
.get(), o
, offset
+ length
+ tail_pad
, tail_read
,
9405 assert(r
>= 0 && r
<= (int)tail_read
);
9406 size_t zlen
= tail_read
- r
;
9408 tail_bl
.append_zero(zlen
);
9409 logger
->inc(l_bluestore_write_pad_bytes
, zlen
);
9411 padded
.claim_append(tail_bl
);
9412 logger
->inc(l_bluestore_write_penalty_read_ops
);
9414 logger
->inc(l_bluestore_write_small_pre_read
);
9416 bluestore_deferred_op_t
*op
= _get_deferred_op(txc
, o
);
9417 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
9418 _buffer_cache_write(txc
, b
, b_off
, padded
,
9419 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
9421 int r
= b
->get_blob().map(
9423 [&](uint64_t offset
, uint64_t length
) {
9424 op
->extents
.emplace_back(bluestore_pextent_t(offset
, length
));
9428 if (b
->get_blob().csum_type
) {
9429 b
->dirty_blob().calc_csum(b_off
, padded
);
9431 op
->data
.claim(padded
);
9432 dout(20) << __func__
<< " deferred write 0x" << std::hex
<< b_off
<< "~"
9433 << b_len
<< std::dec
<< " of mutable " << *b
9434 << " at " << op
->extents
<< dendl
;
9435 Extent
*le
= o
->extent_map
.set_lextent(c
, offset
, offset
- bstart
, length
,
9436 b
, &wctx
->old_extents
);
9437 b
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
9438 txc
->statfs_delta
.stored() += le
->length
;
9439 dout(20) << __func__
<< " lex " << *le
<< dendl
;
9440 logger
->inc(l_bluestore_write_small_deferred
);
9444 if (b
->try_reuse_blob(min_alloc_size
,
9448 assert(alloc_len
== min_alloc_size
); // expecting data always
9449 // fit into reused blob
9450 // Need to check for pending writes desiring to
9451 // reuse the same pextent. The rationale is that during GC two chunks
9452 // from garbage blobs(compressed?) can share logical space within the same
9453 // AU. That's in turn might be caused by unaligned len in clone_range2.
9454 // Hence the second write will fail in an attempt to reuse blob at
9455 // do_alloc_write().
9456 if (!wctx
->has_conflict(b
,
9458 offset0
+ alloc_len
,
9461 // we can't reuse pad_head/pad_tail since they might be truncated
9462 // due to existent extents
9463 uint64_t b_off
= offset
- bstart
;
9464 uint64_t b_off0
= b_off
;
9465 _pad_zeros(&bl
, &b_off0
, chunk_size
);
9467 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
9468 << " (" << b_off0
<< "~" << bl
.length() << ")"
9469 << " (" << b_off
<< "~" << length
<< ")"
9470 << std::dec
<< dendl
;
9472 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
9473 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
,
9475 logger
->inc(l_bluestore_write_small_unused
);
9482 } // if (ep != end && ep->logical_offset < offset + max_bsize)
9484 // check extent for reuse in reverse order
9485 if (prev_ep
!= end
&& prev_ep
->logical_offset
>= min_off
) {
9486 BlobRef b
= prev_ep
->blob
;
9487 auto bstart
= prev_ep
->blob_start();
9488 dout(20) << __func__
<< " considering " << *b
9489 << " bstart 0x" << std::hex
<< bstart
<< std::dec
<< dendl
;
9490 if (b
->try_reuse_blob(min_alloc_size
,
9494 assert(alloc_len
== min_alloc_size
); // expecting data always
9495 // fit into reused blob
9496 // Need to check for pending writes desiring to
9497 // reuse the same pextent. The rationale is that during GC two chunks
9498 // from garbage blobs(compressed?) can share logical space within the same
9499 // AU. That's in turn might be caused by unaligned len in clone_range2.
9500 // Hence the second write will fail in an attempt to reuse blob at
9501 // do_alloc_write().
9502 if (!wctx
->has_conflict(b
,
9504 offset0
+ alloc_len
,
9507 uint64_t chunk_size
= b
->get_blob().get_chunk_size(block_size
);
9508 uint64_t b_off
= offset
- bstart
;
9509 uint64_t b_off0
= b_off
;
9510 _pad_zeros(&bl
, &b_off0
, chunk_size
);
9512 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
9513 << " (" << b_off0
<< "~" << bl
.length() << ")"
9514 << " (" << b_off
<< "~" << length
<< ")"
9515 << std::dec
<< dendl
;
9517 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
9518 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
,
9520 logger
->inc(l_bluestore_write_small_unused
);
9524 if (prev_ep
!= begin
) {
9528 prev_ep
= end
; // to avoid useless first extent re-check
9530 } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
9531 } while (any_change
);
9535 BlobRef b
= c
->new_blob();
9536 uint64_t b_off
= P2PHASE(offset
, alloc_len
);
9537 uint64_t b_off0
= b_off
;
9538 _pad_zeros(&bl
, &b_off0
, block_size
);
9539 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
9540 wctx
->write(offset
, b
, alloc_len
, b_off0
, bl
, b_off
, length
, true, true);
9541 logger
->inc(l_bluestore_write_small_new
);
9546 void BlueStore::_do_write_big(
9550 uint64_t offset
, uint64_t length
,
9551 bufferlist::iterator
& blp
,
9554 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
9555 << " target_blob_size 0x" << wctx
->target_blob_size
<< std::dec
9556 << " compress " << (int)wctx
->compress
9558 logger
->inc(l_bluestore_write_big
);
9559 logger
->inc(l_bluestore_write_big_bytes
, length
);
9560 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
->old_extents
);
9561 auto max_bsize
= MAX(wctx
->target_blob_size
, min_alloc_size
);
9562 while (length
> 0) {
9563 bool new_blob
= false;
9564 uint32_t l
= MIN(max_bsize
, length
);
9568 //attempting to reuse existing blob
9569 if (!wctx
->compress
) {
9570 // look for an existing mutable blob we can reuse
9571 auto begin
= o
->extent_map
.extent_map
.begin();
9572 auto end
= o
->extent_map
.extent_map
.end();
9573 auto ep
= o
->extent_map
.seek_lextent(offset
);
9575 if (prev_ep
!= begin
) {
9578 prev_ep
= end
; // to avoid this extent check as it's a duplicate
9580 auto min_off
= offset
>= max_bsize
? offset
- max_bsize
: 0;
9581 // search suitable extent in both forward and reverse direction in
9582 // [offset - target_max_blob_size, offset + target_max_blob_size] range
9583 // then check if blob can be reused via try_reuse_blob func.
9587 if (ep
!= end
&& ep
->logical_offset
< offset
+ max_bsize
) {
9588 if (offset
>= ep
->blob_start() &&
9589 ep
->blob
->try_reuse_blob(min_alloc_size
, max_bsize
,
9590 offset
- ep
->blob_start(),
9593 b_off
= offset
- ep
->blob_start();
9594 prev_ep
= end
; // to avoid check below
9595 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
9596 << " (" << b_off
<< "~" << l
<< ")" << std::dec
<< dendl
;
9603 if (prev_ep
!= end
&& prev_ep
->logical_offset
>= min_off
) {
9604 if (prev_ep
->blob
->try_reuse_blob(min_alloc_size
, max_bsize
,
9605 offset
- prev_ep
->blob_start(),
9608 b_off
= offset
- prev_ep
->blob_start();
9609 dout(20) << __func__
<< " reuse blob " << *b
<< std::hex
9610 << " (" << b_off
<< "~" << l
<< ")" << std::dec
<< dendl
;
9611 } else if (prev_ep
!= begin
) {
9615 prev_ep
= end
; // to avoid useless first extent re-check
9618 } while (b
== nullptr && any_change
);
9628 wctx
->write(offset
, b
, l
, b_off
, t
, b_off
, l
, false, new_blob
);
9631 logger
->inc(l_bluestore_write_big_blobs
);
9635 int BlueStore::_do_alloc_write(
9641 dout(20) << __func__
<< " txc " << txc
9642 << " " << wctx
->writes
.size() << " blobs"
9646 auto max_bsize
= MAX(wctx
->target_blob_size
, min_alloc_size
);
9647 for (auto &wi
: wctx
->writes
) {
9648 need
+= wi
.blob_length
;
9650 int r
= alloc
->reserve(need
);
9652 derr
<< __func__
<< " failed to reserve 0x" << std::hex
<< need
<< std::dec
9660 if (wctx
->compress
) {
9662 "compression_algorithm",
9666 if (coll
->pool_opts
.get(pool_opts_t::COMPRESSION_ALGORITHM
, &val
)) {
9667 CompressorRef cp
= compressor
;
9668 if (!cp
|| cp
->get_type_name() != val
) {
9669 cp
= Compressor::create(cct
, val
);
9671 return boost::optional
<CompressorRef
>(cp
);
9673 return boost::optional
<CompressorRef
>();
9677 crr
= select_option(
9678 "compression_required_ratio",
9679 cct
->_conf
->bluestore_compression_required_ratio
,
9682 if(coll
->pool_opts
.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO
, &val
)) {
9683 return boost::optional
<double>(val
);
9685 return boost::optional
<double>();
9691 int csum
= csum_type
.load();
9692 csum
= select_option(
9697 if(coll
->pool_opts
.get(pool_opts_t::CSUM_TYPE
, &val
)) {
9698 return boost::optional
<int>(val
);
9700 return boost::optional
<int>();
9704 for (auto& wi
: wctx
->writes
) {
9706 bluestore_blob_t
& dblob
= b
->dirty_blob();
9707 uint64_t b_off
= wi
.b_off
;
9708 bufferlist
*l
= &wi
.bl
;
9709 uint64_t final_length
= wi
.blob_length
;
9710 uint64_t csum_length
= wi
.blob_length
;
9711 unsigned csum_order
= block_size_order
;
9712 bufferlist compressed_bl
;
9713 bool compressed
= false;
9714 if(c
&& wi
.blob_length
> min_alloc_size
) {
9716 utime_t start
= ceph_clock_now();
9720 assert(wi
.blob_length
== l
->length());
9721 bluestore_compression_header_t chdr
;
9722 chdr
.type
= c
->get_type();
9723 // FIXME: memory alignment here is bad
9726 r
= c
->compress(*l
, t
);
9729 chdr
.length
= t
.length();
9730 ::encode(chdr
, compressed_bl
);
9731 compressed_bl
.claim_append(t
);
9732 uint64_t rawlen
= compressed_bl
.length();
9733 uint64_t newlen
= P2ROUNDUP(rawlen
, min_alloc_size
);
9734 uint64_t want_len_raw
= final_length
* crr
;
9735 uint64_t want_len
= P2ROUNDUP(want_len_raw
, min_alloc_size
);
9736 if (newlen
<= want_len
&& newlen
< final_length
) {
9737 // Cool. We compressed at least as much as we were hoping to.
9738 // pad out to min_alloc_size
9739 compressed_bl
.append_zero(newlen
- rawlen
);
9740 logger
->inc(l_bluestore_write_pad_bytes
, newlen
- rawlen
);
9741 dout(20) << __func__
<< std::hex
<< " compressed 0x" << wi
.blob_length
9742 << " -> 0x" << rawlen
<< " => 0x" << newlen
9743 << " with " << c
->get_type()
9744 << std::dec
<< dendl
;
9745 txc
->statfs_delta
.compressed() += rawlen
;
9746 txc
->statfs_delta
.compressed_original() += l
->length();
9747 txc
->statfs_delta
.compressed_allocated() += newlen
;
9749 final_length
= newlen
;
9750 csum_length
= newlen
;
9751 csum_order
= ctz(newlen
);
9752 dblob
.set_compressed(wi
.blob_length
, rawlen
);
9754 logger
->inc(l_bluestore_compress_success_count
);
9756 dout(20) << __func__
<< std::hex
<< " 0x" << l
->length()
9757 << " compressed to 0x" << rawlen
<< " -> 0x" << newlen
9758 << " with " << c
->get_type()
9759 << ", which is more than required 0x" << want_len_raw
9760 << " -> 0x" << want_len
9761 << ", leaving uncompressed"
9762 << std::dec
<< dendl
;
9763 logger
->inc(l_bluestore_compress_rejected_count
);
9765 logger
->tinc(l_bluestore_compress_lat
,
9766 ceph_clock_now() - start
);
9768 if (!compressed
&& wi
.new_blob
) {
9769 // initialize newly created blob only
9770 assert(dblob
.is_mutable());
9771 if (l
->length() != wi
.blob_length
) {
9772 // hrm, maybe we could do better here, but let's not bother.
9773 dout(20) << __func__
<< " forcing csum_order to block_size_order "
9774 << block_size_order
<< dendl
;
9775 csum_order
= block_size_order
;
9777 csum_order
= std::min(wctx
->csum_order
, ctz(l
->length()));
9779 // try to align blob with max_blob_size to improve
9780 // its reuse ratio, e.g. in case of reverse write
9781 uint32_t suggested_boff
=
9782 (wi
.logical_offset
- (wi
.b_off0
- wi
.b_off
)) % max_bsize
;
9783 if ((suggested_boff
% (1 << csum_order
)) == 0 &&
9784 suggested_boff
+ final_length
<= max_bsize
&&
9785 suggested_boff
> b_off
) {
9786 dout(20) << __func__
<< " forcing blob_offset to "
9787 << std::hex
<< suggested_boff
<< std::dec
<< dendl
;
9788 assert(suggested_boff
>= b_off
);
9789 csum_length
+= suggested_boff
- b_off
;
9790 b_off
= suggested_boff
;
9794 AllocExtentVector extents
;
9795 extents
.reserve(4); // 4 should be (more than) enough for most allocations
9796 int64_t got
= alloc
->allocate(final_length
, min_alloc_size
,
9797 max_alloc_size
.load(),
9799 assert(got
== (int64_t)final_length
);
9801 txc
->statfs_delta
.allocated() += got
;
9802 for (auto& p
: extents
) {
9803 bluestore_pextent_t e
= bluestore_pextent_t(p
);
9804 txc
->allocated
.insert(e
.offset
, e
.length
);
9807 dblob
.allocated(P2ALIGN(b_off
, min_alloc_size
), final_length
, extents
);
9809 dout(20) << __func__
<< " blob " << *b
9810 << " csum_type " << Checksummer::get_csum_type_string(csum
)
9811 << " csum_order " << csum_order
9812 << " csum_length 0x" << std::hex
<< csum_length
<< std::dec
9815 if (csum
!= Checksummer::CSUM_NONE
) {
9816 if (!dblob
.has_csum()) {
9817 dblob
.init_csum(csum
, csum_order
, csum_length
);
9819 dblob
.calc_csum(b_off
, *l
);
9821 if (wi
.mark_unused
) {
9822 auto b_end
= b_off
+ wi
.bl
.length();
9824 dblob
.add_unused(0, b_off
);
9826 if (b_end
< wi
.blob_length
) {
9827 dblob
.add_unused(b_end
, wi
.blob_length
- b_end
);
9831 Extent
*le
= o
->extent_map
.set_lextent(coll
, wi
.logical_offset
,
9832 b_off
+ (wi
.b_off0
- wi
.b_off
),
9836 wi
.b
->dirty_blob().mark_used(le
->blob_offset
, le
->length
);
9837 txc
->statfs_delta
.stored() += le
->length
;
9838 dout(20) << __func__
<< " lex " << *le
<< dendl
;
9839 _buffer_cache_write(txc
, wi
.b
, b_off
, wi
.bl
,
9840 wctx
->buffered
? 0 : Buffer::FLAG_NOCACHE
);
9843 if (!g_conf
->bluestore_debug_omit_block_device_write
) {
9844 if (l
->length() <= prefer_deferred_size
.load()) {
9845 dout(20) << __func__
<< " deferring small 0x" << std::hex
9846 << l
->length() << std::dec
<< " write via deferred" << dendl
;
9847 bluestore_deferred_op_t
*op
= _get_deferred_op(txc
, o
);
9848 op
->op
= bluestore_deferred_op_t::OP_WRITE
;
9849 int r
= b
->get_blob().map(
9851 [&](uint64_t offset
, uint64_t length
) {
9852 op
->extents
.emplace_back(bluestore_pextent_t(offset
, length
));
9858 b
->get_blob().map_bl(
9860 [&](uint64_t offset
, bufferlist
& t
) {
9861 bdev
->aio_write(offset
, t
, &txc
->ioc
, false);
9867 alloc
->unreserve(need
);
9872 void BlueStore::_wctx_finish(
9877 set
<SharedBlob
*> *maybe_unshared_blobs
)
9879 auto oep
= wctx
->old_extents
.begin();
9880 while (oep
!= wctx
->old_extents
.end()) {
9882 oep
= wctx
->old_extents
.erase(oep
);
9883 dout(20) << __func__
<< " lex_old " << lo
.e
<< dendl
;
9884 BlobRef b
= lo
.e
.blob
;
9885 const bluestore_blob_t
& blob
= b
->get_blob();
9886 if (blob
.is_compressed()) {
9887 if (lo
.blob_empty
) {
9888 txc
->statfs_delta
.compressed() -= blob
.get_compressed_payload_length();
9890 txc
->statfs_delta
.compressed_original() -= lo
.e
.length
;
9893 txc
->statfs_delta
.stored() -= lo
.e
.length
;
9895 dout(20) << __func__
<< " blob release " << r
<< dendl
;
9896 if (blob
.is_shared()) {
9897 PExtentVector final
;
9898 c
->load_shared_blob(b
->shared_blob
);
9900 b
->shared_blob
->put_ref(
9901 e
.offset
, e
.length
, &final
,
9902 b
->is_referenced() ? nullptr : maybe_unshared_blobs
);
9904 dout(20) << __func__
<< " shared_blob release " << final
9905 << " from " << *b
->shared_blob
<< dendl
;
9906 txc
->write_shared_blob(b
->shared_blob
);
9911 // we can't invalidate our logical extents as we drop them because
9912 // other lextents (either in our onode or others) may still
9913 // reference them. but we can throw out anything that is no
9914 // longer allocated. Note that this will leave behind edge bits
9915 // that are no longer referenced but not deallocated (until they
9916 // age out of the cache naturally).
9917 b
->discard_unallocated(c
.get());
9919 dout(20) << __func__
<< " release " << e
<< dendl
;
9920 txc
->released
.insert(e
.offset
, e
.length
);
9921 txc
->statfs_delta
.allocated() -= e
.length
;
9922 if (blob
.is_compressed()) {
9923 txc
->statfs_delta
.compressed_allocated() -= e
.length
;
9927 if (b
->is_spanning() && !b
->is_referenced()) {
9928 dout(20) << __func__
<< " spanning_blob_map removing empty " << *b
9930 o
->extent_map
.spanning_blob_map
.erase(b
->id
);
9935 void BlueStore::_do_write_data(
9944 uint64_t end
= offset
+ length
;
9945 bufferlist::iterator p
= bl
.begin();
9947 if (offset
/ min_alloc_size
== (end
- 1) / min_alloc_size
&&
9948 (length
!= min_alloc_size
)) {
9949 // we fall within the same block
9950 _do_write_small(txc
, c
, o
, offset
, length
, p
, wctx
);
9952 uint64_t head_offset
, head_length
;
9953 uint64_t middle_offset
, middle_length
;
9954 uint64_t tail_offset
, tail_length
;
9956 head_offset
= offset
;
9957 head_length
= P2NPHASE(offset
, min_alloc_size
);
9959 tail_offset
= P2ALIGN(end
, min_alloc_size
);
9960 tail_length
= P2PHASE(end
, min_alloc_size
);
9962 middle_offset
= head_offset
+ head_length
;
9963 middle_length
= length
- head_length
- tail_length
;
9966 _do_write_small(txc
, c
, o
, head_offset
, head_length
, p
, wctx
);
9969 if (middle_length
) {
9970 _do_write_big(txc
, c
, o
, middle_offset
, middle_length
, p
, wctx
);
9974 _do_write_small(txc
, c
, o
, tail_offset
, tail_length
, p
, wctx
);
9979 void BlueStore::_choose_write_options(
9982 uint32_t fadvise_flags
,
9985 if (fadvise_flags
& CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) {
9986 dout(20) << __func__
<< " will do buffered write" << dendl
;
9987 wctx
->buffered
= true;
9988 } else if (cct
->_conf
->bluestore_default_buffered_write
&&
9989 (fadvise_flags
& (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
|
9990 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
)) == 0) {
9991 dout(20) << __func__
<< " defaulting to buffered write" << dendl
;
9992 wctx
->buffered
= true;
9995 // apply basic csum block size
9996 wctx
->csum_order
= block_size_order
;
9998 // compression parameters
9999 unsigned alloc_hints
= o
->onode
.alloc_hint_flags
;
10000 auto cm
= select_option(
10001 "compression_mode",
10005 if(c
->pool_opts
.get(pool_opts_t::COMPRESSION_MODE
, &val
)) {
10006 return boost::optional
<Compressor::CompressionMode
>(
10007 Compressor::get_comp_mode_type(val
));
10009 return boost::optional
<Compressor::CompressionMode
>();
10013 wctx
->compress
= (cm
!= Compressor::COMP_NONE
) &&
10014 ((cm
== Compressor::COMP_FORCE
) ||
10015 (cm
== Compressor::COMP_AGGRESSIVE
&&
10016 (alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE
) == 0) ||
10017 (cm
== Compressor::COMP_PASSIVE
&&
10018 (alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE
)));
10020 if ((alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ
) &&
10021 (alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ
) == 0 &&
10022 (alloc_hints
& (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE
|
10023 CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY
)) &&
10024 (alloc_hints
& CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE
) == 0) {
10026 dout(20) << __func__
<< " will prefer large blob and csum sizes" << dendl
;
10028 auto order
= min_alloc_size_order
.load();
10029 if (o
->onode
.expected_write_size
) {
10030 wctx
->csum_order
= std::max(order
,
10031 (uint8_t)ctz(o
->onode
.expected_write_size
));
10033 wctx
->csum_order
= order
;
10036 if (wctx
->compress
) {
10037 wctx
->target_blob_size
= select_option(
10038 "compression_max_blob_size",
10039 comp_max_blob_size
.load(),
10042 if(c
->pool_opts
.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE
, &val
)) {
10043 return boost::optional
<uint64_t>((uint64_t)val
);
10045 return boost::optional
<uint64_t>();
10050 if (wctx
->compress
) {
10051 wctx
->target_blob_size
= select_option(
10052 "compression_min_blob_size",
10053 comp_min_blob_size
.load(),
10056 if(c
->pool_opts
.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE
, &val
)) {
10057 return boost::optional
<uint64_t>((uint64_t)val
);
10059 return boost::optional
<uint64_t>();
10065 uint64_t max_bsize
= max_blob_size
.load();
10066 if (wctx
->target_blob_size
== 0 || wctx
->target_blob_size
> max_bsize
) {
10067 wctx
->target_blob_size
= max_bsize
;
10070 // set the min blob size floor at 2x the min_alloc_size, or else we
10071 // won't be able to allocate a smaller extent for the compressed
10073 if (wctx
->compress
&&
10074 wctx
->target_blob_size
< min_alloc_size
* 2) {
10075 wctx
->target_blob_size
= min_alloc_size
* 2;
10078 dout(20) << __func__
<< " prefer csum_order " << wctx
->csum_order
10079 << " target_blob_size 0x" << std::hex
<< wctx
->target_blob_size
10080 << std::dec
<< dendl
;
10083 int BlueStore::_do_gc(
10087 const GarbageCollector
& gc
,
10088 const WriteContext
& wctx
,
10089 uint64_t *dirty_start
,
10090 uint64_t *dirty_end
)
10092 auto& extents_to_collect
= gc
.get_extents_to_collect();
10094 WriteContext wctx_gc
;
10095 wctx_gc
.fork(wctx
); // make a clone for garbage collection
10097 for (auto it
= extents_to_collect
.begin();
10098 it
!= extents_to_collect
.end();
10101 int r
= _do_read(c
.get(), o
, it
->offset
, it
->length
, bl
, 0);
10102 assert(r
== (int)it
->length
);
10104 o
->extent_map
.fault_range(db
, it
->offset
, it
->length
);
10105 _do_write_data(txc
, c
, o
, it
->offset
, it
->length
, bl
, &wctx_gc
);
10106 logger
->inc(l_bluestore_gc_merged
, it
->length
);
10108 if (*dirty_start
> it
->offset
) {
10109 *dirty_start
= it
->offset
;
10112 if (*dirty_end
< it
->offset
+ it
->length
) {
10113 *dirty_end
= it
->offset
+ it
->length
;
10117 dout(30) << __func__
<< " alloc write" << dendl
;
10118 int r
= _do_alloc_write(txc
, c
, o
, &wctx_gc
);
10120 derr
<< __func__
<< " _do_alloc_write failed with " << cpp_strerror(r
)
10125 _wctx_finish(txc
, c
, o
, &wctx_gc
);
10129 int BlueStore::_do_write(
10136 uint32_t fadvise_flags
)
10140 dout(20) << __func__
10142 << " 0x" << std::hex
<< offset
<< "~" << length
10143 << " - have 0x" << o
->onode
.size
10144 << " (" << std::dec
<< o
->onode
.size
<< ")"
10146 << " fadvise_flags 0x" << std::hex
<< fadvise_flags
<< std::dec
10154 uint64_t end
= offset
+ length
;
10156 GarbageCollector
gc(c
->store
->cct
);
10158 auto dirty_start
= offset
;
10159 auto dirty_end
= end
;
10162 _choose_write_options(c
, o
, fadvise_flags
, &wctx
);
10163 o
->extent_map
.fault_range(db
, offset
, length
);
10164 _do_write_data(txc
, c
, o
, offset
, length
, bl
, &wctx
);
10165 r
= _do_alloc_write(txc
, c
, o
, &wctx
);
10167 derr
<< __func__
<< " _do_alloc_write failed with " << cpp_strerror(r
)
10172 // NB: _wctx_finish() will empty old_extents
10173 // so we must do gc estimation before that
10174 benefit
= gc
.estimate(offset
,
10180 _wctx_finish(txc
, c
, o
, &wctx
);
10181 if (end
> o
->onode
.size
) {
10182 dout(20) << __func__
<< " extending size to 0x" << std::hex
<< end
10183 << std::dec
<< dendl
;
10184 o
->onode
.size
= end
;
10187 if (benefit
>= g_conf
->bluestore_gc_enable_total_threshold
) {
10188 if (!gc
.get_extents_to_collect().empty()) {
10189 dout(20) << __func__
<< " perform garbage collection, "
10190 << "expected benefit = " << benefit
<< " AUs" << dendl
;
10191 r
= _do_gc(txc
, c
, o
, gc
, wctx
, &dirty_start
, &dirty_end
);
10193 derr
<< __func__
<< " _do_gc failed with " << cpp_strerror(r
)
10200 o
->extent_map
.compress_extent_map(dirty_start
, dirty_end
- dirty_start
);
10201 o
->extent_map
.dirty_range(dirty_start
, dirty_end
- dirty_start
);
10209 int BlueStore::_write(TransContext
*txc
,
10212 uint64_t offset
, size_t length
,
10214 uint32_t fadvise_flags
)
10216 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10217 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
10220 _assign_nid(txc
, o
);
10221 int r
= _do_write(txc
, c
, o
, offset
, length
, bl
, fadvise_flags
);
10222 txc
->write_onode(o
);
10224 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
10225 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
10226 << " = " << r
<< dendl
;
10230 int BlueStore::_zero(TransContext
*txc
,
10233 uint64_t offset
, size_t length
)
10235 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10236 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
10239 _assign_nid(txc
, o
);
10240 int r
= _do_zero(txc
, c
, o
, offset
, length
);
10241 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
10242 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
10243 << " = " << r
<< dendl
;
10247 int BlueStore::_do_zero(TransContext
*txc
,
10250 uint64_t offset
, size_t length
)
10252 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10253 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
10260 o
->extent_map
.fault_range(db
, offset
, length
);
10261 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
.old_extents
);
10262 o
->extent_map
.dirty_range(offset
, length
);
10263 _wctx_finish(txc
, c
, o
, &wctx
);
10265 if (offset
+ length
> o
->onode
.size
) {
10266 o
->onode
.size
= offset
+ length
;
10267 dout(20) << __func__
<< " extending size to " << offset
+ length
10270 txc
->write_onode(o
);
10272 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
10273 << " 0x" << std::hex
<< offset
<< "~" << length
<< std::dec
10274 << " = " << r
<< dendl
;
10278 void BlueStore::_do_truncate(
10279 TransContext
*txc
, CollectionRef
& c
, OnodeRef o
, uint64_t offset
,
10280 set
<SharedBlob
*> *maybe_unshared_blobs
)
10282 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10283 << " 0x" << std::hex
<< offset
<< std::dec
<< dendl
;
10285 _dump_onode(o
, 30);
10287 if (offset
== o
->onode
.size
)
10290 if (offset
< o
->onode
.size
) {
10292 uint64_t length
= o
->onode
.size
- offset
;
10293 o
->extent_map
.fault_range(db
, offset
, length
);
10294 o
->extent_map
.punch_hole(c
, offset
, length
, &wctx
.old_extents
);
10295 o
->extent_map
.dirty_range(offset
, length
);
10296 _wctx_finish(txc
, c
, o
, &wctx
, maybe_unshared_blobs
);
10298 // if we have shards past EOF, ask for a reshard
10299 if (!o
->onode
.extent_map_shards
.empty() &&
10300 o
->onode
.extent_map_shards
.back().offset
>= offset
) {
10301 dout(10) << __func__
<< " request reshard past EOF" << dendl
;
10303 o
->extent_map
.request_reshard(offset
- 1, offset
+ length
);
10305 o
->extent_map
.request_reshard(0, length
);
10310 o
->onode
.size
= offset
;
10312 txc
->write_onode(o
);
10315 void BlueStore::_truncate(TransContext
*txc
,
10320 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10321 << " 0x" << std::hex
<< offset
<< std::dec
10323 _do_truncate(txc
, c
, o
, offset
);
10326 int BlueStore::_do_remove(
10331 set
<SharedBlob
*> maybe_unshared_blobs
;
10332 _do_truncate(txc
, c
, o
, 0, &maybe_unshared_blobs
);
10333 if (o
->onode
.has_omap()) {
10335 _do_omap_clear(txc
, o
->onode
.nid
);
10339 for (auto &s
: o
->extent_map
.shards
) {
10340 dout(20) << __func__
<< " removing shard 0x" << std::hex
10341 << s
.shard_info
->offset
<< std::dec
<< dendl
;
10342 generate_extent_shard_key_and_apply(o
->key
, s
.shard_info
->offset
, &key
,
10343 [&](const string
& final_key
) {
10344 txc
->t
->rmkey(PREFIX_OBJ
, final_key
);
10348 txc
->t
->rmkey(PREFIX_OBJ
, o
->key
.c_str(), o
->key
.size());
10350 o
->extent_map
.clear();
10351 o
->onode
= bluestore_onode_t();
10352 _debug_obj_on_delete(o
->oid
);
10354 if (!o
->oid
.is_no_gen() &&
10355 !maybe_unshared_blobs
.empty()) {
10356 // see if we can unshare blobs still referenced by the head
10357 dout(10) << __func__
<< " gen and maybe_unshared_blobs "
10358 << maybe_unshared_blobs
<< dendl
;
10359 ghobject_t nogen
= o
->oid
;
10360 nogen
.generation
= ghobject_t::NO_GEN
;
10361 OnodeRef h
= c
->onode_map
.lookup(nogen
);
10362 if (h
&& h
->exists
) {
10363 dout(20) << __func__
<< " checking for unshareable blobs on " << h
10364 << " " << h
->oid
<< dendl
;
10365 map
<SharedBlob
*,bluestore_extent_ref_map_t
> expect
;
10366 for (auto& e
: h
->extent_map
.extent_map
) {
10367 const bluestore_blob_t
& b
= e
.blob
->get_blob();
10368 SharedBlob
*sb
= e
.blob
->shared_blob
.get();
10369 if (b
.is_shared() &&
10371 maybe_unshared_blobs
.count(sb
)) {
10372 b
.map(e
.blob_offset
, e
.length
, [&](uint64_t off
, uint64_t len
) {
10373 expect
[sb
].get(off
, len
);
10378 vector
<SharedBlob
*> unshared_blobs
;
10379 unshared_blobs
.reserve(maybe_unshared_blobs
.size());
10380 for (auto& p
: expect
) {
10381 dout(20) << " ? " << *p
.first
<< " vs " << p
.second
<< dendl
;
10382 if (p
.first
->persistent
->ref_map
== p
.second
) {
10383 SharedBlob
*sb
= p
.first
;
10384 dout(20) << __func__
<< " unsharing " << *sb
<< dendl
;
10385 unshared_blobs
.push_back(sb
);
10386 txc
->unshare_blob(sb
);
10387 uint64_t sbid
= c
->make_blob_unshared(sb
);
10389 get_shared_blob_key(sbid
, &key
);
10390 txc
->t
->rmkey(PREFIX_SHARED_BLOB
, key
);
10394 if (!unshared_blobs
.empty()) {
10395 uint32_t b_start
= OBJECT_MAX_SIZE
;
10396 uint32_t b_end
= 0;
10397 for (auto& e
: h
->extent_map
.extent_map
) {
10398 const bluestore_blob_t
& b
= e
.blob
->get_blob();
10399 SharedBlob
*sb
= e
.blob
->shared_blob
.get();
10400 if (b
.is_shared() &&
10401 std::find(unshared_blobs
.begin(), unshared_blobs
.end(),
10402 sb
) != unshared_blobs
.end()) {
10403 dout(20) << __func__
<< " unsharing " << e
<< dendl
;
10404 bluestore_blob_t
& blob
= e
.blob
->dirty_blob();
10405 blob
.clear_flag(bluestore_blob_t::FLAG_SHARED
);
10406 if (e
.logical_offset
< b_start
) {
10407 b_start
= e
.logical_offset
;
10409 if (e
.logical_end() > b_end
) {
10410 b_end
= e
.logical_end();
10415 h
->extent_map
.dirty_range(b_start
, b_end
- b_start
);
10416 txc
->write_onode(h
);
10423 int BlueStore::_remove(TransContext
*txc
,
10427 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
10428 int r
= _do_remove(txc
, c
, o
);
10429 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
10433 int BlueStore::_setattr(TransContext
*txc
,
10436 const string
& name
,
10439 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10440 << " " << name
<< " (" << val
.length() << " bytes)"
10443 if (val
.is_partial())
10444 o
->onode
.attrs
[name
.c_str()] = bufferptr(val
.c_str(), val
.length());
10446 o
->onode
.attrs
[name
.c_str()] = val
;
10447 txc
->write_onode(o
);
10448 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
10449 << " " << name
<< " (" << val
.length() << " bytes)"
10450 << " = " << r
<< dendl
;
10454 int BlueStore::_setattrs(TransContext
*txc
,
10457 const map
<string
,bufferptr
>& aset
)
10459 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10460 << " " << aset
.size() << " keys"
10463 for (map
<string
,bufferptr
>::const_iterator p
= aset
.begin();
10464 p
!= aset
.end(); ++p
) {
10465 if (p
->second
.is_partial())
10466 o
->onode
.attrs
[p
->first
.c_str()] =
10467 bufferptr(p
->second
.c_str(), p
->second
.length());
10469 o
->onode
.attrs
[p
->first
.c_str()] = p
->second
;
10471 txc
->write_onode(o
);
10472 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
10473 << " " << aset
.size() << " keys"
10474 << " = " << r
<< dendl
;
10479 int BlueStore::_rmattr(TransContext
*txc
,
10482 const string
& name
)
10484 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10485 << " " << name
<< dendl
;
10487 auto it
= o
->onode
.attrs
.find(name
.c_str());
10488 if (it
== o
->onode
.attrs
.end())
10491 o
->onode
.attrs
.erase(it
);
10492 txc
->write_onode(o
);
10495 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
10496 << " " << name
<< " = " << r
<< dendl
;
10500 int BlueStore::_rmattrs(TransContext
*txc
,
10504 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
10507 if (o
->onode
.attrs
.empty())
10510 o
->onode
.attrs
.clear();
10511 txc
->write_onode(o
);
10514 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
10518 void BlueStore::_do_omap_clear(TransContext
*txc
, uint64_t id
)
10520 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_OMAP
);
10521 string prefix
, tail
;
10522 get_omap_header(id
, &prefix
);
10523 get_omap_tail(id
, &tail
);
10524 it
->lower_bound(prefix
);
10525 while (it
->valid()) {
10526 if (it
->key() >= tail
) {
10527 dout(30) << __func__
<< " stop at " << pretty_binary_string(tail
)
10531 txc
->t
->rmkey(PREFIX_OMAP
, it
->key());
10532 dout(30) << __func__
<< " rm " << pretty_binary_string(it
->key()) << dendl
;
10537 int BlueStore::_omap_clear(TransContext
*txc
,
10541 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
10543 if (o
->onode
.has_omap()) {
10545 _do_omap_clear(txc
, o
->onode
.nid
);
10546 o
->onode
.clear_omap_flag();
10547 txc
->write_onode(o
);
10549 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
10553 int BlueStore::_omap_setkeys(TransContext
*txc
,
10558 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
10560 bufferlist::iterator p
= bl
.begin();
10562 if (!o
->onode
.has_omap()) {
10563 o
->onode
.set_omap_flag();
10564 txc
->write_onode(o
);
10566 txc
->note_modified_object(o
);
10569 _key_encode_u64(o
->onode
.nid
, &final_key
);
10570 final_key
.push_back('.');
10576 ::decode(value
, p
);
10577 final_key
.resize(9); // keep prefix
10579 dout(30) << __func__
<< " " << pretty_binary_string(final_key
)
10580 << " <- " << key
<< dendl
;
10581 txc
->t
->set(PREFIX_OMAP
, final_key
, value
);
10584 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
10588 int BlueStore::_omap_setheader(TransContext
*txc
,
10593 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
10596 if (!o
->onode
.has_omap()) {
10597 o
->onode
.set_omap_flag();
10598 txc
->write_onode(o
);
10600 txc
->note_modified_object(o
);
10602 get_omap_header(o
->onode
.nid
, &key
);
10603 txc
->t
->set(PREFIX_OMAP
, key
, bl
);
10605 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
10609 int BlueStore::_omap_rmkeys(TransContext
*txc
,
10614 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
10616 bufferlist::iterator p
= bl
.begin();
10620 if (!o
->onode
.has_omap()) {
10623 _key_encode_u64(o
->onode
.nid
, &final_key
);
10624 final_key
.push_back('.');
10629 final_key
.resize(9); // keep prefix
10631 dout(30) << __func__
<< " rm " << pretty_binary_string(final_key
)
10632 << " <- " << key
<< dendl
;
10633 txc
->t
->rmkey(PREFIX_OMAP
, final_key
);
10635 txc
->note_modified_object(o
);
10638 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
10642 int BlueStore::_omap_rmkey_range(TransContext
*txc
,
10645 const string
& first
, const string
& last
)
10647 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< dendl
;
10648 KeyValueDB::Iterator it
;
10649 string key_first
, key_last
;
10651 if (!o
->onode
.has_omap()) {
10655 it
= db
->get_iterator(PREFIX_OMAP
);
10656 get_omap_key(o
->onode
.nid
, first
, &key_first
);
10657 get_omap_key(o
->onode
.nid
, last
, &key_last
);
10658 it
->lower_bound(key_first
);
10659 while (it
->valid()) {
10660 if (it
->key() >= key_last
) {
10661 dout(30) << __func__
<< " stop at " << pretty_binary_string(key_last
)
10665 txc
->t
->rmkey(PREFIX_OMAP
, it
->key());
10666 dout(30) << __func__
<< " rm " << pretty_binary_string(it
->key()) << dendl
;
10669 txc
->note_modified_object(o
);
10672 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
<< " = " << r
<< dendl
;
10676 int BlueStore::_set_alloc_hint(
10680 uint64_t expected_object_size
,
10681 uint64_t expected_write_size
,
10684 dout(15) << __func__
<< " " << c
->cid
<< " " << o
->oid
10685 << " object_size " << expected_object_size
10686 << " write_size " << expected_write_size
10687 << " flags " << ceph_osd_alloc_hint_flag_string(flags
)
10690 o
->onode
.expected_object_size
= expected_object_size
;
10691 o
->onode
.expected_write_size
= expected_write_size
;
10692 o
->onode
.alloc_hint_flags
= flags
;
10693 txc
->write_onode(o
);
10694 dout(10) << __func__
<< " " << c
->cid
<< " " << o
->oid
10695 << " object_size " << expected_object_size
10696 << " write_size " << expected_write_size
10697 << " flags " << ceph_osd_alloc_hint_flag_string(flags
)
10698 << " = " << r
<< dendl
;
10702 int BlueStore::_clone(TransContext
*txc
,
10707 dout(15) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
10708 << newo
->oid
<< dendl
;
10710 if (oldo
->oid
.hobj
.get_hash() != newo
->oid
.hobj
.get_hash()) {
10711 derr
<< __func__
<< " mismatched hash on " << oldo
->oid
10712 << " and " << newo
->oid
<< dendl
;
10716 newo
->exists
= true;
10717 _assign_nid(txc
, newo
);
10721 _do_truncate(txc
, c
, newo
, 0);
10722 if (cct
->_conf
->bluestore_clone_cow
) {
10723 _do_clone_range(txc
, c
, oldo
, newo
, 0, oldo
->onode
.size
, 0);
10726 r
= _do_read(c
.get(), oldo
, 0, oldo
->onode
.size
, bl
, 0);
10729 r
= _do_write(txc
, c
, newo
, 0, oldo
->onode
.size
, bl
, 0);
10735 newo
->onode
.attrs
= oldo
->onode
.attrs
;
10738 if (newo
->onode
.has_omap()) {
10739 dout(20) << __func__
<< " clearing old omap data" << dendl
;
10741 _do_omap_clear(txc
, newo
->onode
.nid
);
10743 if (oldo
->onode
.has_omap()) {
10744 dout(20) << __func__
<< " copying omap data" << dendl
;
10745 if (!newo
->onode
.has_omap()) {
10746 newo
->onode
.set_omap_flag();
10748 KeyValueDB::Iterator it
= db
->get_iterator(PREFIX_OMAP
);
10750 get_omap_header(oldo
->onode
.nid
, &head
);
10751 get_omap_tail(oldo
->onode
.nid
, &tail
);
10752 it
->lower_bound(head
);
10753 while (it
->valid()) {
10754 if (it
->key() >= tail
) {
10755 dout(30) << __func__
<< " reached tail" << dendl
;
10758 dout(30) << __func__
<< " got header/data "
10759 << pretty_binary_string(it
->key()) << dendl
;
10761 rewrite_omap_key(newo
->onode
.nid
, it
->key(), &key
);
10762 txc
->t
->set(PREFIX_OMAP
, key
, it
->value());
10767 newo
->onode
.clear_omap_flag();
10770 txc
->write_onode(newo
);
10774 dout(10) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
10775 << newo
->oid
<< " = " << r
<< dendl
;
10779 int BlueStore::_do_clone_range(
10784 uint64_t srcoff
, uint64_t length
, uint64_t dstoff
)
10786 dout(15) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
10788 << " 0x" << std::hex
<< srcoff
<< "~" << length
<< " -> "
10789 << " 0x" << dstoff
<< "~" << length
<< std::dec
<< dendl
;
10790 oldo
->extent_map
.fault_range(db
, srcoff
, length
);
10791 newo
->extent_map
.fault_range(db
, dstoff
, length
);
10795 // hmm, this could go into an ExtentMap::dup() method.
10796 vector
<BlobRef
> id_to_blob(oldo
->extent_map
.extent_map
.size());
10797 for (auto &e
: oldo
->extent_map
.extent_map
) {
10798 e
.blob
->last_encoded_id
= -1;
10801 bool dirtied_oldo
= false;
10802 uint64_t end
= srcoff
+ length
;
10803 for (auto ep
= oldo
->extent_map
.seek_lextent(srcoff
);
10804 ep
!= oldo
->extent_map
.extent_map
.end();
10807 if (e
.logical_offset
>= end
) {
10810 dout(20) << __func__
<< " src " << e
<< dendl
;
10812 bool blob_duped
= true;
10813 if (e
.blob
->last_encoded_id
>= 0) {
10814 // blob is already duped
10815 cb
= id_to_blob
[e
.blob
->last_encoded_id
];
10816 blob_duped
= false;
10819 const bluestore_blob_t
& blob
= e
.blob
->get_blob();
10820 // make sure it is shared
10821 if (!blob
.is_shared()) {
10822 c
->make_blob_shared(_assign_blobid(txc
), e
.blob
);
10823 dirtied_oldo
= true; // fixme: overkill
10825 c
->load_shared_blob(e
.blob
->shared_blob
);
10828 e
.blob
->last_encoded_id
= n
;
10829 id_to_blob
[n
] = cb
;
10831 // bump the extent refs on the copied blob's extents
10832 for (auto p
: blob
.get_extents()) {
10833 if (p
.is_valid()) {
10834 e
.blob
->shared_blob
->get_ref(p
.offset
, p
.length
);
10837 txc
->write_shared_blob(e
.blob
->shared_blob
);
10838 dout(20) << __func__
<< " new " << *cb
<< dendl
;
10841 int skip_front
, skip_back
;
10842 if (e
.logical_offset
< srcoff
) {
10843 skip_front
= srcoff
- e
.logical_offset
;
10847 if (e
.logical_end() > end
) {
10848 skip_back
= e
.logical_end() - end
;
10852 Extent
*ne
= new Extent(e
.logical_offset
+ skip_front
+ dstoff
- srcoff
,
10853 e
.blob_offset
+ skip_front
,
10854 e
.length
- skip_front
- skip_back
, cb
);
10855 newo
->extent_map
.extent_map
.insert(*ne
);
10856 ne
->blob
->get_ref(c
.get(), ne
->blob_offset
, ne
->length
);
10857 // fixme: we may leave parts of new blob unreferenced that could
10858 // be freed (relative to the shared_blob).
10859 txc
->statfs_delta
.stored() += ne
->length
;
10860 if (e
.blob
->get_blob().is_compressed()) {
10861 txc
->statfs_delta
.compressed_original() += ne
->length
;
10863 txc
->statfs_delta
.compressed() +=
10864 cb
->get_blob().get_compressed_payload_length();
10867 dout(20) << __func__
<< " dst " << *ne
<< dendl
;
10870 if (dirtied_oldo
) {
10871 oldo
->extent_map
.dirty_range(srcoff
, length
); // overkill
10872 txc
->write_onode(oldo
);
10874 txc
->write_onode(newo
);
10876 if (dstoff
+ length
> newo
->onode
.size
) {
10877 newo
->onode
.size
= dstoff
+ length
;
10879 newo
->extent_map
.dirty_range(dstoff
, length
);
10885 int BlueStore::_clone_range(TransContext
*txc
,
10889 uint64_t srcoff
, uint64_t length
, uint64_t dstoff
)
10891 dout(15) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
10892 << newo
->oid
<< " from 0x" << std::hex
<< srcoff
<< "~" << length
10893 << " to offset 0x" << dstoff
<< std::dec
<< dendl
;
10896 if (srcoff
+ length
> oldo
->onode
.size
) {
10901 newo
->exists
= true;
10902 _assign_nid(txc
, newo
);
10905 if (cct
->_conf
->bluestore_clone_cow
) {
10906 _do_zero(txc
, c
, newo
, dstoff
, length
);
10907 _do_clone_range(txc
, c
, oldo
, newo
, srcoff
, length
, dstoff
);
10910 r
= _do_read(c
.get(), oldo
, srcoff
, length
, bl
, 0);
10913 r
= _do_write(txc
, c
, newo
, dstoff
, bl
.length(), bl
, 0);
10919 txc
->write_onode(newo
);
10923 dout(10) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
10924 << newo
->oid
<< " from 0x" << std::hex
<< srcoff
<< "~" << length
10925 << " to offset 0x" << dstoff
<< std::dec
10926 << " = " << r
<< dendl
;
10930 int BlueStore::_rename(TransContext
*txc
,
10934 const ghobject_t
& new_oid
)
10936 dout(15) << __func__
<< " " << c
->cid
<< " " << oldo
->oid
<< " -> "
10937 << new_oid
<< dendl
;
10939 ghobject_t old_oid
= oldo
->oid
;
10940 mempool::bluestore_cache_other::string new_okey
;
10943 if (newo
->exists
) {
10947 assert(txc
->onodes
.count(newo
) == 0);
10950 txc
->t
->rmkey(PREFIX_OBJ
, oldo
->key
.c_str(), oldo
->key
.size());
10954 oldo
->extent_map
.fault_range(db
, 0, oldo
->onode
.size
);
10955 get_object_key(cct
, new_oid
, &new_okey
);
10957 for (auto &s
: oldo
->extent_map
.shards
) {
10958 generate_extent_shard_key_and_apply(oldo
->key
, s
.shard_info
->offset
, &key
,
10959 [&](const string
& final_key
) {
10960 txc
->t
->rmkey(PREFIX_OBJ
, final_key
);
10968 txc
->write_onode(newo
);
10970 // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
10971 // Onode in the old slot
10972 c
->onode_map
.rename(oldo
, old_oid
, new_oid
, new_okey
);
10976 dout(10) << __func__
<< " " << c
->cid
<< " " << old_oid
<< " -> "
10977 << new_oid
<< " = " << r
<< dendl
;
10983 int BlueStore::_create_collection(
10989 dout(15) << __func__
<< " " << cid
<< " bits " << bits
<< dendl
;
10994 RWLock::WLocker
l(coll_lock
);
11002 cache_shards
[cid
.hash_to_shard(cache_shards
.size())],
11004 (*c
)->cnode
.bits
= bits
;
11005 coll_map
[cid
] = *c
;
11007 ::encode((*c
)->cnode
, bl
);
11008 txc
->t
->set(PREFIX_COLL
, stringify(cid
), bl
);
11012 dout(10) << __func__
<< " " << cid
<< " bits " << bits
<< " = " << r
<< dendl
;
11016 int BlueStore::_remove_collection(TransContext
*txc
, const coll_t
&cid
,
11019 dout(15) << __func__
<< " " << cid
<< dendl
;
11023 RWLock::WLocker
l(coll_lock
);
11028 size_t nonexistent_count
= 0;
11029 assert((*c
)->exists
);
11030 if ((*c
)->onode_map
.map_any([&](OnodeRef o
) {
11032 dout(10) << __func__
<< " " << o
->oid
<< " " << o
11033 << " exists in onode_map" << dendl
;
11036 ++nonexistent_count
;
11043 vector
<ghobject_t
> ls
;
11045 // Enumerate onodes in db, up to nonexistent_count + 1
11046 // then check if all of them are marked as non-existent.
11047 // Bypass the check if returned number is greater than nonexistent_count
11048 r
= _collection_list(c
->get(), ghobject_t(), ghobject_t::get_max(),
11049 nonexistent_count
+ 1, &ls
, &next
);
11051 bool exists
= false; //ls.size() > nonexistent_count;
11052 for (auto it
= ls
.begin(); !exists
&& it
< ls
.end(); ++it
) {
11053 dout(10) << __func__
<< " oid " << *it
<< dendl
;
11054 auto onode
= (*c
)->onode_map
.lookup(*it
);
11055 exists
= !onode
|| onode
->exists
;
11057 dout(10) << __func__
<< " " << *it
11058 << " exists in db" << dendl
;
11062 coll_map
.erase(cid
);
11063 txc
->removed_collections
.push_back(*c
);
11064 (*c
)->exists
= false;
11066 txc
->t
->rmkey(PREFIX_COLL
, stringify(cid
));
11069 dout(10) << __func__
<< " " << cid
11070 << " is non-empty" << dendl
;
11077 dout(10) << __func__
<< " " << cid
<< " = " << r
<< dendl
;
11081 int BlueStore::_split_collection(TransContext
*txc
,
11084 unsigned bits
, int rem
)
11086 dout(15) << __func__
<< " " << c
->cid
<< " to " << d
->cid
<< " "
11087 << " bits " << bits
<< dendl
;
11088 RWLock::WLocker
l(c
->lock
);
11089 RWLock::WLocker
l2(d
->lock
);
11092 // flush all previous deferred writes on this sequencer. this is a bit
11093 // heavyweight, but we need to make sure all deferred writes complete
11094 // before we split as the new collection's sequencer may need to order
11095 // this after those writes, and we don't bother with the complexity of
11096 // moving those TransContexts over to the new osr.
11097 _osr_drain_preceding(txc
);
11099 // move any cached items (onodes and referenced shared blobs) that will
11100 // belong to the child collection post-split. leave everything else behind.
11101 // this may include things that don't strictly belong to the now-smaller
11102 // parent split, but the OSD will always send us a split for every new
11105 spg_t pgid
, dest_pgid
;
11106 bool is_pg
= c
->cid
.is_pg(&pgid
);
11108 is_pg
= d
->cid
.is_pg(&dest_pgid
);
11111 // the destination should initially be empty.
11112 assert(d
->onode_map
.empty());
11113 assert(d
->shared_blob_set
.empty());
11114 assert(d
->cnode
.bits
== bits
);
11116 c
->split_cache(d
.get());
11118 // adjust bits. note that this will be redundant for all but the first
11119 // split call for this parent (first child).
11120 c
->cnode
.bits
= bits
;
11121 assert(d
->cnode
.bits
== bits
);
11125 ::encode(c
->cnode
, bl
);
11126 txc
->t
->set(PREFIX_COLL
, stringify(c
->cid
), bl
);
11128 dout(10) << __func__
<< " " << c
->cid
<< " to " << d
->cid
<< " "
11129 << " bits " << bits
<< " = " << r
<< dendl
;
11133 // DB key value Histogram
11134 #define KEY_SLAB 32
11135 #define VALUE_SLAB 64
11137 const string prefix_onode
= "o";
11138 const string prefix_onode_shard
= "x";
11139 const string prefix_other
= "Z";
11141 int BlueStore::DBHistogram::get_key_slab(size_t sz
)
11143 return (sz
/KEY_SLAB
);
11146 string
BlueStore::DBHistogram::get_key_slab_to_range(int slab
)
11148 int lower_bound
= slab
* KEY_SLAB
;
11149 int upper_bound
= (slab
+ 1) * KEY_SLAB
;
11150 string ret
= "[" + stringify(lower_bound
) + "," + stringify(upper_bound
) + ")";
11154 int BlueStore::DBHistogram::get_value_slab(size_t sz
)
11156 return (sz
/VALUE_SLAB
);
11159 string
BlueStore::DBHistogram::get_value_slab_to_range(int slab
)
11161 int lower_bound
= slab
* VALUE_SLAB
;
11162 int upper_bound
= (slab
+ 1) * VALUE_SLAB
;
11163 string ret
= "[" + stringify(lower_bound
) + "," + stringify(upper_bound
) + ")";
11167 void BlueStore::DBHistogram::update_hist_entry(map
<string
, map
<int, struct key_dist
> > &key_hist
,
11168 const string
&prefix
, size_t key_size
, size_t value_size
)
11170 uint32_t key_slab
= get_key_slab(key_size
);
11171 uint32_t value_slab
= get_value_slab(value_size
);
11172 key_hist
[prefix
][key_slab
].count
++;
11173 key_hist
[prefix
][key_slab
].max_len
= MAX(key_size
, key_hist
[prefix
][key_slab
].max_len
);
11174 key_hist
[prefix
][key_slab
].val_map
[value_slab
].count
++;
11175 key_hist
[prefix
][key_slab
].val_map
[value_slab
].max_len
=
11176 MAX(value_size
, key_hist
[prefix
][key_slab
].val_map
[value_slab
].max_len
);
11179 void BlueStore::DBHistogram::dump(Formatter
*f
)
11181 f
->open_object_section("rocksdb_value_distribution");
11182 for (auto i
: value_hist
) {
11183 f
->dump_unsigned(get_value_slab_to_range(i
.first
).data(), i
.second
);
11185 f
->close_section();
11187 f
->open_object_section("rocksdb_key_value_histogram");
11188 for (auto i
: key_hist
) {
11189 f
->dump_string("prefix", i
.first
);
11190 f
->open_object_section("key_hist");
11191 for ( auto k
: i
.second
) {
11192 f
->dump_unsigned(get_key_slab_to_range(k
.first
).data(), k
.second
.count
);
11193 f
->dump_unsigned("max_len", k
.second
.max_len
);
11194 f
->open_object_section("value_hist");
11195 for ( auto j
: k
.second
.val_map
) {
11196 f
->dump_unsigned(get_value_slab_to_range(j
.first
).data(), j
.second
.count
);
11197 f
->dump_unsigned("max_len", j
.second
.max_len
);
11199 f
->close_section();
11201 f
->close_section();
11203 f
->close_section();
11206 //Itrerates through the db and collects the stats
11207 void BlueStore::generate_db_histogram(Formatter
*f
)
11210 uint64_t num_onodes
= 0;
11211 uint64_t num_shards
= 0;
11212 uint64_t num_super
= 0;
11213 uint64_t num_coll
= 0;
11214 uint64_t num_omap
= 0;
11215 uint64_t num_deferred
= 0;
11216 uint64_t num_alloc
= 0;
11217 uint64_t num_stat
= 0;
11218 uint64_t num_others
= 0;
11219 uint64_t num_shared_shards
= 0;
11220 size_t max_key_size
=0, max_value_size
= 0;
11221 uint64_t total_key_size
= 0, total_value_size
= 0;
11222 size_t key_size
= 0, value_size
= 0;
11225 utime_t start
= ceph_clock_now();
11227 KeyValueDB::WholeSpaceIterator iter
= db
->get_iterator();
11228 iter
->seek_to_first();
11229 while (iter
->valid()) {
11230 dout(30) << __func__
<< " Key: " << iter
->key() << dendl
;
11231 key_size
= iter
->key_size();
11232 value_size
= iter
->value_size();
11233 hist
.value_hist
[hist
.get_value_slab(value_size
)]++;
11234 max_key_size
= MAX(max_key_size
, key_size
);
11235 max_value_size
= MAX(max_value_size
, value_size
);
11236 total_key_size
+= key_size
;
11237 total_value_size
+= value_size
;
11239 pair
<string
,string
> key(iter
->raw_key());
11241 if (key
.first
== PREFIX_SUPER
) {
11242 hist
.update_hist_entry(hist
.key_hist
, PREFIX_SUPER
, key_size
, value_size
);
11244 } else if (key
.first
== PREFIX_STAT
) {
11245 hist
.update_hist_entry(hist
.key_hist
, PREFIX_STAT
, key_size
, value_size
);
11247 } else if (key
.first
== PREFIX_COLL
) {
11248 hist
.update_hist_entry(hist
.key_hist
, PREFIX_COLL
, key_size
, value_size
);
11250 } else if (key
.first
== PREFIX_OBJ
) {
11251 if (key
.second
.back() == ONODE_KEY_SUFFIX
) {
11252 hist
.update_hist_entry(hist
.key_hist
, prefix_onode
, key_size
, value_size
);
11255 hist
.update_hist_entry(hist
.key_hist
, prefix_onode_shard
, key_size
, value_size
);
11258 } else if (key
.first
== PREFIX_OMAP
) {
11259 hist
.update_hist_entry(hist
.key_hist
, PREFIX_OMAP
, key_size
, value_size
);
11261 } else if (key
.first
== PREFIX_DEFERRED
) {
11262 hist
.update_hist_entry(hist
.key_hist
, PREFIX_DEFERRED
, key_size
, value_size
);
11264 } else if (key
.first
== PREFIX_ALLOC
|| key
.first
== "b" ) {
11265 hist
.update_hist_entry(hist
.key_hist
, PREFIX_ALLOC
, key_size
, value_size
);
11267 } else if (key
.first
== PREFIX_SHARED_BLOB
) {
11268 hist
.update_hist_entry(hist
.key_hist
, PREFIX_SHARED_BLOB
, key_size
, value_size
);
11269 num_shared_shards
++;
11271 hist
.update_hist_entry(hist
.key_hist
, prefix_other
, key_size
, value_size
);
11277 utime_t duration
= ceph_clock_now() - start
;
11278 f
->open_object_section("rocksdb_key_value_stats");
11279 f
->dump_unsigned("num_onodes", num_onodes
);
11280 f
->dump_unsigned("num_shards", num_shards
);
11281 f
->dump_unsigned("num_super", num_super
);
11282 f
->dump_unsigned("num_coll", num_coll
);
11283 f
->dump_unsigned("num_omap", num_omap
);
11284 f
->dump_unsigned("num_deferred", num_deferred
);
11285 f
->dump_unsigned("num_alloc", num_alloc
);
11286 f
->dump_unsigned("num_stat", num_stat
);
11287 f
->dump_unsigned("num_shared_shards", num_shared_shards
);
11288 f
->dump_unsigned("num_others", num_others
);
11289 f
->dump_unsigned("max_key_size", max_key_size
);
11290 f
->dump_unsigned("max_value_size", max_value_size
);
11291 f
->dump_unsigned("total_key_size", total_key_size
);
11292 f
->dump_unsigned("total_value_size", total_value_size
);
11293 f
->close_section();
11297 dout(20) << __func__
<< " finished in " << duration
<< " seconds" << dendl
;
11301 void BlueStore::_flush_cache()
11303 dout(10) << __func__
<< dendl
;
11304 for (auto i
: cache_shards
) {
11306 assert(i
->empty());
11308 for (auto& p
: coll_map
) {
11309 assert(p
.second
->onode_map
.empty());
11310 assert(p
.second
->shared_blob_set
.empty());
11315 // For external caller.
11316 // We use a best-effort policy instead, e.g.,
11317 // we don't care if there are still some pinned onodes/data in the cache
11318 // after this command is completed.
11319 void BlueStore::flush_cache()
11321 dout(10) << __func__
<< dendl
;
11322 for (auto i
: cache_shards
) {
11327 void BlueStore::_apply_padding(uint64_t head_pad
,
11330 bufferlist
& padded
)
11335 z
.append_zero(head_pad
);
11336 z
.claim_append(padded
);
11340 padded
.append_zero(tail_pad
);
11342 if (head_pad
|| tail_pad
) {
11343 dout(20) << __func__
<< " can pad head 0x" << std::hex
<< head_pad
11344 << " tail 0x" << tail_pad
<< std::dec
<< dendl
;
11345 logger
->inc(l_bluestore_write_pad_bytes
, head_pad
+ tail_pad
);
11349 // ===========================================