// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */

#include <sys/types.h>

#include "BlueStore.h"

#include "include/compat.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "common/errno.h"
#include "common/safe_io.h"
#include "Allocator.h"
#include "FreelistManager.h"
#include "BlueRocksEnv.h"
#include "auth/Crypto.h"
#include "common/EventTrace.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore

// bluestore_meta_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
                              bluestore_meta_onode);

// bluestore_meta_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
                              bluestore_meta_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
                              bluestore_meta_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
                              bluestore_meta_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
                              bluestore_meta_other);

const string PREFIX_SUPER = "S";       // field -> value
const string PREFIX_STAT = "T";        // field -> value(int64 array)
const string PREFIX_COLL = "C";        // collection name -> cnode_t
const string PREFIX_OBJ = "O";         // object name -> onode_t
const string PREFIX_OMAP = "M";        // u64 + keyname -> value
const string PREFIX_DEFERRED = "L";    // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";       // u64 offset -> u64 length (freelist)
const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t

// write a label in the first block.  always use this size.  note that
// bluefs makes a matching assumption about the location of its
// superblock (always the second block of the device).
#define BDEV_LABEL_BLOCK_SIZE  4096

// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
#define SUPER_RESERVED  8192

#define OBJECT_MAX_SIZE 0xffffffff // 32 bits

/*
 * extent map blob encoding
 *
 * we use the low bits of the blobid field to indicate some common scenarios
 * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
 */
#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
#define BLOBID_SHIFT_BITS 4
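
// Illustrative example (not from the original source): a local blob assigned
// last_encoded_id 3, whose extent continues the previous one and starts at
// blob_offset 0, would be encoded as
//   blobid = (3 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_CONTIGUOUS | BLOBID_FLAG_ZEROOFFSET
// i.e. 0x33; the decoder recovers the id as blobid >> BLOBID_SHIFT_BITS and
// the flags from the low 4 bits.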

/*
 * object name key structure
 *
 * encoded u8: shard + 2^7 (so that it sorts properly)
 * encoded u64: poolid + 2^63 (so that it sorts properly)
 * encoded u32: hash (bit reversed)
 *
 * escaped string: namespace
 *
 * escaped string: key or object name
 * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
 *         we are done.  otherwise, we are followed by the object name.
 * escaped string: object name (unless '=' above)
 *
 * encoded u64: snap
 * encoded u64: generation
 */
#define ONODE_KEY_SUFFIX 'o'
#define EXTENT_SHARD_KEY_SUFFIX 'x'

/*
 * string encoding in the key
 *
 * The key string needs to lexicographically sort the same way that
 * ghobject_t does.  We do this by escaping anything <= to '#' with #
 * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
 * hex digits.
 *
 * We use ! as a terminator for strings; this works because it is < #
 * and will get escaped if it is present in the string.
 */
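// Worked example (illustrative): append_escaped() turns "a#b" into
// "a#23b" followed by the '!' terminator, since '#' (0x23) is <= '#';
// likewise byte 0x7f (>= '~') becomes "~7f".  Escaped strings therefore
// keep the same lexicographic order as the originals.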
template<typename S>
static void append_escaped(const string &in, S *out)
{
  char hexbyte[8];
  for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
    if (*i <= '#') {
      snprintf(hexbyte, sizeof(hexbyte), "#%02x", (uint8_t)*i);
      out->append(hexbyte);
    } else if (*i >= '~') {
      snprintf(hexbyte, sizeof(hexbyte), "~%02x", (uint8_t)*i);
      out->append(hexbyte);
    } else {
      out->push_back(*i);
    }
  }
  out->push_back('!');
}

static int decode_escaped(const char *p, string *out)
{
  const char *orig_p = p;
  while (*p && *p != '!') {
    if (*p == '#' || *p == '~') {
      unsigned hex;
      int r = sscanf(++p, "%2x", &hex);
      if (r < 1)
        return -EINVAL;
      out->push_back((char)hex);
      p += 2;
    } else {
      out->push_back(*p++);
    }
  }
  return p - orig_p;
}

// some things we encode in binary (as le32 or le64); print the
// resulting key strings nicely
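// For example (illustrative): an input holding "abc" followed by the four
// bytes 00 01 02 03 prints as 'abc'0x00010203 -- printable runs are quoted,
// non-printable runs are hex-dumped, a whole u32 at a time when possible.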
template<typename S>
static string pretty_binary_string(const S& in)
{
  char buf[10];
  string out;
  out.reserve(in.length() * 3);
  enum { NONE, HEX, STRING } mode = NONE;
  unsigned from = 0, i;
  for (i=0; i < in.length(); ++i) {
    if ((in[i] < 32 || (unsigned char)in[i] > 126) ||
        (mode == HEX && in.length() - i >= 4 &&
         ((in[i] < 32 || (unsigned char)in[i] > 126) ||
          (in[i+1] < 32 || (unsigned char)in[i+1] > 126) ||
          (in[i+2] < 32 || (unsigned char)in[i+2] > 126) ||
          (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) {
      if (mode == STRING) {
        out.append(in.c_str() + from, i - from);
        out.push_back('\'');
      }
      if (mode != HEX) {
        out.append("0x");
        mode = HEX;
      }
      if (in.length() - i >= 4) {
        // print a whole u32 at once
        snprintf(buf, sizeof(buf), "%08x",
                 (uint32_t)(((unsigned char)in[i] << 24) |
                            ((unsigned char)in[i+1] << 16) |
                            ((unsigned char)in[i+2] << 8) |
                            ((unsigned char)in[i+3] << 0)));
        i += 3;
      } else {
        snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]);
      }
      out.append(buf);
    } else {
      if (mode != STRING) {
        out.push_back('\'');
        mode = STRING;
        from = i;
      }
    }
  }
  if (mode == STRING) {
    out.append(in.c_str() + from, i - from);
    out.push_back('\'');
  }
  return out;
}

template<typename T>
static void _key_encode_shard(shard_id_t shard, T *key)
{
  key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
}

static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
{
  pshard->id = (uint8_t)*key - (uint8_t)0x80;
  return key + 1;
}

static void get_coll_key_range(const coll_t& cid, int bits,
                               string *temp_start, string *temp_end,
                               string *start, string *end)
{
  temp_start->clear();
  temp_end->clear();
  start->clear();
  end->clear();

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    _key_encode_shard(pgid.shard, start);
    *temp_start = *start;

    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);

    *end = *start;
    *temp_end = *temp_start;

    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
    _key_encode_u32(reverse_hash, start);
    _key_encode_u32(reverse_hash, temp_start);

    uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
    if (end_hash > 0xffffffffull)
      end_hash = 0xffffffffull;

    _key_encode_u32(end_hash, end);
    _key_encode_u32(end_hash, temp_end);
  } else {
    _key_encode_shard(shard_id_t::NO_SHARD, start);
    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
    *end = *start;
    _key_encode_u32(0, start);
    _key_encode_u32(0xffffffff, end);

    // no separate temp section
    *temp_start = *end;
    *temp_end = *end;
  }
}
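
// Example (illustrative): for a pg with bits = 4 the hash window spans
// 1/16th of the 32-bit reversed-hash space, i.e. end_hash = reverse_hash
// + 2^28, clamped to 0xffffffff.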

static void get_shared_blob_key(uint64_t sbid, string *key)
{
  key->clear();
  _key_encode_u64(sbid, key);
}

static int get_key_shared_blob(const string& key, uint64_t *sbid)
{
  const char *p = key.c_str();
  if (key.length() < sizeof(uint64_t))
    return -1;
  p = _key_decode_u64(p, sbid);
  return 0;
}

template<typename S>
static int get_key_object(const S& key, ghobject_t *oid)
{
  int r;
  const char *p = key.c_str();

  if (key.length() < 1 + 8 + 4)
    return -1;
  p = _key_decode_shard(p, &oid->shard_id);

  uint64_t pool;
  p = _key_decode_u64(p, &pool);
  oid->hobj.pool = pool - 0x8000000000000000ull;

  unsigned hash;
  p = _key_decode_u32(p, &hash);

  oid->hobj.set_bitwise_key_u32(hash);

  r = decode_escaped(p, &oid->hobj.nspace);
  if (r < 0)
    return -2;
  p += r + 1;

  string k;
  r = decode_escaped(p, &k);
  if (r < 0)
    return -3;
  p += r + 1;
  if (*p == '=') {
    // no key
    ++p;
    oid->hobj.oid.name = k;
  } else if (*p == '<' || *p == '>') {
    // key + name
    ++p;
    r = decode_escaped(p, &oid->hobj.oid.name);
    if (r < 0)
      return -5;
    p += r + 1;
    oid->hobj.set_key(k);
  } else {
    // malformed
    return -6;
  }

  p = _key_decode_u64(p, &oid->hobj.snap.val);
  p = _key_decode_u64(p, &oid->generation);

  if (*p != ONODE_KEY_SUFFIX) {
    return -7;
  }
  p++;
  if (*p) {
    // if we get something other than a null terminator here,
    // something went wrong.
    return -8;
  }

  return 0;
}

template<typename S>
static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
{
  key->clear();

  size_t max_len = 1 + 8 + 4 +
                   (oid.hobj.nspace.length() * 3 + 1) +
                   (oid.hobj.get_key().length() * 3 + 1) +
                   1 + // for '<', '=', or '>'
                   (oid.hobj.oid.name.length() * 3 + 1) +
                   8 + 8 + 1;
  key->reserve(max_len);

  _key_encode_shard(oid.shard_id, key);
  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);

  append_escaped(oid.hobj.nspace, key);

  if (oid.hobj.get_key().length()) {
    // is a key... could be < = or >.
    append_escaped(oid.hobj.get_key(), key);
    // (ASCII chars < = and > sort in that order, yay)
    int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
    if (r) {
      key->append(r > 0 ? ">" : "<");
      append_escaped(oid.hobj.oid.name, key);
    } else {
      // same as no key
      key->append("=");
    }
  } else {
    // no key
    append_escaped(oid.hobj.oid.name, key);
    key->append("=");
  }

  _key_encode_u64(oid.hobj.snap, key);
  _key_encode_u64(oid.generation, key);

  key->push_back(ONODE_KEY_SUFFIX);

  // sanity check
  if (true) {
    ghobject_t t;
    int r = get_key_object(*key, &t);
    if (r || t != oid) {
      derr << "  r " << r << dendl;
      derr << "key " << pretty_binary_string(*key) << dendl;
      derr << "oid " << oid << dendl;
      derr << "  t " << t << dendl;
      assert(r == 0 && t == oid);
    }
  }
}

// extent shard keys are the onode key, plus a u32, plus 'x'.  the trailing
// char lets us quickly test whether it is a shard key without decoding any
// of the prefix bytes.
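// Sketch (assumed layout): for an onode key K and shard offset 0x30000 the
// shard key is K + encode_u32(0x30000) + 'x', so checking the last byte
// (see is_extent_shard_key() below) cheaply separates the two key types
// that share the PREFIX_OBJ namespace.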
template<typename S>
static void get_extent_shard_key(const S& onode_key, uint32_t offset,
                                 string *key)
{
  key->clear();
  key->reserve(onode_key.length() + 4 + 1);
  key->append(onode_key.c_str(), onode_key.size());
  _key_encode_u32(offset, key);
  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
}

static void rewrite_extent_shard_key(uint32_t offset, string *key)
{
  assert(key->size() > sizeof(uint32_t) + 1);
  assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
}

template<typename S>
static void generate_extent_shard_key_and_apply(
  const S& onode_key,
  uint32_t offset,
  string *key,
  std::function<void(const string& final_key)> apply)
{
  if (key->empty()) { // make full key
    assert(!onode_key.empty());
    get_extent_shard_key(onode_key, offset, key);
  } else {
    rewrite_extent_shard_key(offset, key);
  }
  apply(*key);
}

int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
{
  assert(key.size() > sizeof(uint32_t) + 1);
  assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
  int okey_len = key.size() - sizeof(uint32_t) - 1;
  *onode_key = key.substr(0, okey_len);
  const char *p = key.data() + okey_len;
  p = _key_decode_u32(p, offset);
  return 0;
}

static bool is_extent_shard_key(const string& key)
{
  return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
}

static void get_omap_header(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('-');
}

// hmm, I don't think there's any need to escape the user key since we
// have a clean prefix.
static void get_omap_key(uint64_t id, const string& key, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('.');
  out->append(key);
}

static void rewrite_omap_key(uint64_t id, string old, string *out)
{
  _key_encode_u64(id, out);
  out->append(old.c_str() + out->length(), old.size() - out->length());
}

static void decode_omap_key(const string& key, string *user_key)
{
  *user_key = key.substr(sizeof(uint64_t) + 1);
}

static void get_omap_tail(uint64_t id, string *out)
{
  _key_encode_u64(id, out);
  out->push_back('~');
}

static void get_deferred_key(uint64_t seq, string *out)
{
  _key_encode_u64(seq, out);
}
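
// Layout sketch (assumed): for omap id 7 and user key "foo" the stored key
// is the 8-byte big-endian encoding of 7, a one-byte separator, then "foo";
// decode_omap_key() simply strips those sizeof(uint64_t) + 1 leading bytes.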

struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
  void merge_nonexistent(
    const char *rdata, size_t rlen, std::string *new_value) override {
    *new_value = std::string(rdata, rlen);
  }
  void merge(
    const char *ldata, size_t llen,
    const char *rdata, size_t rlen,
    std::string *new_value) override {
    assert(llen == rlen);
    assert((rlen % 8) == 0);
    new_value->resize(rlen);
    const __le64* lv = (const __le64*)ldata;
    const __le64* rv = (const __le64*)rdata;
    __le64* nv = &(__le64&)new_value->at(0);
    for (size_t i = 0; i < rlen >> 3; ++i) {
      nv[i] = lv[i] + rv[i];
    }
  }
  // We use each operator name and each prefix to construct the
  // overall RocksDB operator name for consistency check at open time.
  string name() const override {
    return "int64_array";
  }
};
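
// For example (illustrative): merging an existing value encoding the two
// little-endian int64s {10, -3} with an operand encoding {5, 3} yields
// {15, 0} -- a natural fit for the int64-array statistics kept under
// PREFIX_STAT, accumulated without read-modify-write cycles.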

ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
  out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
      << b.offset << "~" << b.length << std::dec
      << " " << BlueStore::Buffer::get_state_name(b.state);
  if (b.flags)
    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
  return out << ")";
}

void BlueStore::GarbageCollector::process_protrusive_extents(
  const BlueStore::ExtentMap& extent_map,
  uint64_t start_offset,
  uint64_t end_offset,
  uint64_t start_touch_offset,
  uint64_t end_touch_offset,
  uint64_t min_alloc_size)
{
  assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);

  uint64_t lookup_start_offset = P2ALIGN(start_offset, min_alloc_size);
  uint64_t lookup_end_offset = ROUND_UP_TO(end_offset, min_alloc_size);

  dout(30) << __func__ << " (hex): [" << std::hex
           << lookup_start_offset << ", " << lookup_end_offset
           << ")" << std::dec << dendl;

  for (auto it = extent_map.seek_lextent(lookup_start_offset);
       it != extent_map.extent_map.end() &&
         it->logical_offset < lookup_end_offset;
       ++it) {
    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;

    dout(30) << __func__ << " " << *it
             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
             << dendl;

    Blob* b = it->blob.get();

    if (it->logical_offset >= start_touch_offset &&
        it->logical_end() <= end_touch_offset) {
      // Process extents within the range affected by
      // the current write request.
      // Need to take into account if existing extents
      // can be merged with them (uncompressed case)
      if (!b->get_blob().is_compressed()) {
        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
          --blob_info_counted->expected_allocations; // don't need to allocate
                                                     // new AU for compressed
                                                     // data since another
                                                     // collocated uncompressed
                                                     // blob already exists
          dout(30) << __func__ << " --expected:"
                   << alloc_unit_start << dendl;
        }
        used_alloc_unit = alloc_unit_end;
        blob_info_counted = nullptr;
      }
    } else if (b->get_blob().is_compressed()) {
      // additionally we take compressed blobs that were not impacted
      // by the write into account too
      BlobInfo& bi =
        affected_blobs.emplace(
          b, BlobInfo(b->get_referenced_bytes())).first->second;

      int adjust =
        (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
      dout(30) << __func__ << " expected_allocations="
               << bi.expected_allocations << " end_au:"
               << alloc_unit_end << dendl;

      blob_info_counted = &bi;
      used_alloc_unit = alloc_unit_end;

      assert(it->length <= bi.referenced_bytes);
      bi.referenced_bytes -= it->length;
      dout(30) << __func__ << " affected_blob:" << *b
               << " unref 0x" << std::hex << it->length
               << " referenced = 0x" << bi.referenced_bytes
               << std::dec << dendl;
      // NOTE: we can't move specific blob to resulting GC list here
      // when reference counter == 0 since subsequent extents might
      // decrement its expected_allocation.
      // Hence need to enumerate all the extents first.
      if (!bi.collect_candidate) {
        bi.first_lextent = it;
        bi.collect_candidate = true;
      }
      bi.last_lextent = it;
    } else {
      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
        // don't need to allocate new AU for compressed data since another
        // collocated uncompressed blob already exists
        --blob_info_counted->expected_allocations;
        dout(30) << __func__ << " --expected_allocations:"
                 << alloc_unit_start << dendl;
      }
      used_alloc_unit = alloc_unit_end;
      blob_info_counted = nullptr;
    }
  }

  for (auto b_it = affected_blobs.begin();
       b_it != affected_blobs.end();
       ++b_it) {
    Blob* b = b_it->first;
    BlobInfo& bi = b_it->second;
    if (bi.referenced_bytes == 0) {
      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
      int64_t blob_expected_for_release =
        ROUND_UP_TO(len_on_disk, min_alloc_size) / min_alloc_size;

      dout(30) << __func__ << " " << *(b_it->first)
               << " expected4release=" << blob_expected_for_release
               << " expected_allocations=" << bi.expected_allocations
               << dendl;
      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
      if (benefit >= g_conf->bluestore_gc_enable_blob_threshold) {
        if (bi.collect_candidate) {
          auto it = bi.first_lextent;
          bool bExit = false;
          do {
            if (it->blob.get() == b) {
              extents_to_collect.emplace_back(it->logical_offset, it->length);
            }
            bExit = it == bi.last_lextent;
            ++it;
          } while (!bExit);
        }
        expected_for_release += blob_expected_for_release;
        expected_allocations += bi.expected_allocations;
      }
    }
  }
}
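
// Worked example (illustrative, not from the original source): with a 64 KiB
// min_alloc_size, a fully unreferenced compressed blob occupying 192 KiB on
// disk would free 3 allocation units if collected (expected4release = 3);
// if rewriting the overlapping data is expected to consume 1 new unit
// (expected_allocations = 1), the benefit is 2 units, which is compared
// against bluestore_gc_enable_blob_threshold.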

int64_t BlueStore::GarbageCollector::estimate(
  uint64_t start_offset,
  uint64_t length,
  const BlueStore::ExtentMap& extent_map,
  const BlueStore::old_extent_map_t& old_extents,
  uint64_t min_alloc_size)
{
  affected_blobs.clear();
  extents_to_collect.clear();
  used_alloc_unit = boost::optional<uint64_t>();
  blob_info_counted = nullptr;

  gc_start_offset = start_offset;
  gc_end_offset = start_offset + length;

  uint64_t end_offset = start_offset + length;

  for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
    Blob* b = it->e.blob.get();
    if (b->get_blob().is_compressed()) {
      // update gc_start_offset/gc_end_offset if needed
      gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
      gc_end_offset = max(gc_end_offset, (uint64_t)it->e.blob_end());

      auto o = it->e.logical_offset;
      auto l = it->e.length;

      uint64_t ref_bytes = b->get_referenced_bytes();
      // micro optimization to bypass blobs that have no more references
      if (ref_bytes != 0) {
        dout(30) << __func__ << " affected_blob:" << *b
                 << " unref 0x" << std::hex << o << "~" << l
                 << std::dec << dendl;
        affected_blobs.emplace(b, BlobInfo(ref_bytes));
      }
    }
  }
  dout(30) << __func__ << " gc range(hex): [" << std::hex
           << gc_start_offset << ", " << gc_end_offset
           << ")" << std::dec << dendl;

  // enumerate preceding extents to check if they reference affected blobs
  if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
    process_protrusive_extents(extent_map,
                               gc_start_offset,
                               gc_end_offset,
                               start_offset,
                               end_offset,
                               min_alloc_size);
  }
  return expected_for_release - expected_allocations;
}

BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
                                           PerfCounters *logger)
{
  Cache *c = nullptr;

  if (type == "lru")
    c = new LRUCache(cct);
  else if (type == "2q")
    c = new TwoQCache(cct);
  else
    assert(0 == "unrecognized cache type");

  c->logger = logger;
  return c;
}

void BlueStore::Cache::trim_all()
{
  std::lock_guard<std::recursive_mutex> l(lock);
  _trim(0, 0);
  assert(_get_num_onodes() == 0);
  assert(_get_buffer_bytes() == 0);
}

void BlueStore::Cache::trim(
  uint64_t target_bytes,
  float target_meta_ratio,
  float bytes_per_onode)
{
  std::lock_guard<std::recursive_mutex> l(lock);
  uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
  uint64_t current_buffer = _get_buffer_bytes();
  uint64_t current = current_meta + current_buffer;

  uint64_t target_meta = target_bytes * (double)target_meta_ratio;
  // We need to cast to double since float(1) might produce an inaccurate
  // value for target_meta (a bit greater than target_bytes) that causes
  // overflow in target_buffer below.  Consider the following code:
  //   uint64_t i = (uint64_t)227*1024*1024*1024 + 1;
  //   float f = 1;
  //   uint64_t i2 = i*f;
  //   assert(i == i2); // fails!

  target_meta = min(target_bytes, target_meta); // and just in case that ratio is > 1
  uint64_t target_buffer = target_bytes - target_meta;

  if (current <= target_bytes) {
    dout(10) << __func__
             << " shard target " << pretty_si_t(target_bytes)
             << " ratio " << target_meta_ratio << " ("
             << pretty_si_t(target_meta) << " + "
             << pretty_si_t(target_buffer) << "), "
             << " current " << pretty_si_t(current) << " ("
             << pretty_si_t(current_meta) << " + "
             << pretty_si_t(current_buffer) << ")"
             << dendl;
    return;
  }

  uint64_t need_to_free = current - target_bytes;
  uint64_t free_buffer = 0;
  uint64_t free_meta = 0;
  if (current_buffer > target_buffer) {
    free_buffer = current_buffer - target_buffer;
    if (free_buffer > need_to_free) {
      free_buffer = need_to_free;
    }
  }
  free_meta = need_to_free - free_buffer;

  // start bounds at what we have now
  uint64_t max_buffer = current_buffer - free_buffer;
  uint64_t max_meta = current_meta - free_meta;
  uint64_t max_onodes = max_meta / bytes_per_onode;

  dout(10) << __func__
           << " shard target " << pretty_si_t(target_bytes)
           << " ratio " << target_meta_ratio << " ("
           << pretty_si_t(target_meta) << " + "
           << pretty_si_t(target_buffer) << "), "
           << " current " << pretty_si_t(current) << " ("
           << pretty_si_t(current_meta) << " + "
           << pretty_si_t(current_buffer) << "),"
           << " need_to_free " << pretty_si_t(need_to_free) << " ("
           << pretty_si_t(free_meta) << " + "
           << pretty_si_t(free_buffer) << ")"
           << " -> max " << max_onodes << " onodes + "
           << max_buffer << " buffer"
           << dendl;
  _trim(max_onodes, max_buffer);
}
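
// Worked example (illustrative): with target_bytes = 1 GiB,
// target_meta_ratio = 0.25 and bytes_per_onode = 4 KiB, target_meta is
// 256 MiB and target_buffer 768 MiB.  If the shard currently holds 300 MiB
// of onode metadata and 900 MiB of buffers, need_to_free = 176 MiB:
// 132 MiB comes out of buffers (down to the 768 MiB target) and the
// remaining 44 MiB out of metadata, so _trim() keeps at most
// (300 - 44) MiB / 4 KiB = 65536 onodes and 768 MiB of buffers.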

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "

void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}

, uint64_t buffer_max
)
830 dout(20) << __func__
<< " onodes " << onode_lru
.size() << " / " << onode_max
831 << " buffers " << buffer_size
<< " / " << buffer_max
834 _audit("trim start");
837 while (buffer_size
> buffer_max
) {
838 auto i
= buffer_lru
.rbegin();
839 if (i
== buffer_lru
.rend()) {
840 // stop if buffer_lru is now empty
845 assert(b
->is_clean());
846 dout(20) << __func__
<< " rm " << *b
<< dendl
;
847 b
->space
->_rm_buffer(this, b
);
851 int num
= onode_lru
.size() - onode_max
;
853 return; // don't even try
855 auto p
= onode_lru
.end();
856 assert(p
!= onode_lru
.begin());
859 int max_skipped
= g_conf
->bluestore_cache_trim_max_skip_pinned
;
862 int refs
= o
->nref
.load();
864 dout(20) << __func__
<< " " << o
->oid
<< " has " << refs
865 << " refs, skipping" << dendl
;
866 if (++skipped
>= max_skipped
) {
867 dout(20) << __func__
<< " maximum skip pinned reached; stopping with "
868 << num
<< " left to trim" << dendl
;
872 if (p
== onode_lru
.begin()) {
880 dout(30) << __func__
<< " rm " << o
->oid
<< dendl
;
881 if (p
!= onode_lru
.begin()) {
882 onode_lru
.erase(p
--);
887 o
->get(); // paranoia
888 o
->c
->onode_map
.remove(o
->oid
);
void BlueStore::LRUCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
    s += i->length;
  }
  if (s != buffer_size) {
    derr << __func__ << " buffer_size " << buffer_size << " actual " << s
         << dendl;
    for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
      derr << __func__ << " " << *i << dendl;
    }
    assert(s == buffer_size);
  }
  dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
           << " ok" << dendl;
}

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "

void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}

void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
{
  dout(20) << __func__ << " level " << level << " near " << near
           << " on " << *b
           << " which has cache_private " << b->cache_private << dendl;
  if (near) {
    b->cache_private = near->cache_private;
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
      break;
    case BUFFER_WARM_OUT:
      assert(b->is_empty());
      buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
      break;
    case BUFFER_HOT:
      buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
      break;
    default:
      assert(0 == "bad cache_private");
    }
  } else if (b->cache_private == BUFFER_NEW) {
    b->cache_private = BUFFER_WARM_IN;
    if (level > 0) {
      buffer_warm_in.push_front(*b);
    } else {
      // take caller hint to start at the back of the warm queue
      buffer_warm_in.push_back(*b);
    }
  } else {
    // we got a hint from discard
    switch (b->cache_private) {
    case BUFFER_WARM_IN:
      // stay in warm_in.  move to front, even though 2Q doesn't actually
      // do this.
      dout(20) << __func__ << " move to front of warm " << *b << dendl;
      buffer_warm_in.push_front(*b);
      break;
    case BUFFER_WARM_OUT:
      b->cache_private = BUFFER_HOT;
      // move to hot.  fall-thru
    case BUFFER_HOT:
      dout(20) << __func__ << " move to front of hot " << *b << dendl;
      buffer_hot.push_front(*b);
      break;
    default:
      assert(0 == "bad cache_private");
    }
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}

*b
)
984 dout(20) << __func__
<< " " << *b
<< dendl
;
985 if (!b
->is_empty()) {
986 assert(buffer_bytes
>= b
->length
);
987 buffer_bytes
-= b
->length
;
988 assert(buffer_list_bytes
[b
->cache_private
] >= b
->length
);
989 buffer_list_bytes
[b
->cache_private
] -= b
->length
;
991 switch (b
->cache_private
) {
993 buffer_warm_in
.erase(buffer_warm_in
.iterator_to(*b
));
995 case BUFFER_WARM_OUT
:
996 buffer_warm_out
.erase(buffer_warm_out
.iterator_to(*b
));
999 buffer_hot
.erase(buffer_hot
.iterator_to(*b
));
1002 assert(0 == "bad cache_private");
void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
{
  TwoQCache *src = static_cast<TwoQCache*>(srcc);
  src->_rm_buffer(b);

  // preserve which list we're on (even if we can't preserve the order!)
  switch (b->cache_private) {
  case BUFFER_WARM_IN:
    assert(!b->is_empty());
    buffer_warm_in.push_back(*b);
    break;
  case BUFFER_WARM_OUT:
    assert(b->is_empty());
    buffer_warm_out.push_back(*b);
    break;
  case BUFFER_HOT:
    assert(!b->is_empty());
    buffer_hot.push_back(*b);
    break;
  default:
    assert(0 == "bad cache_private");
  }
  if (!b->is_empty()) {
    buffer_bytes += b->length;
    buffer_list_bytes[b->cache_private] += b->length;
  }
}

void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
{
  dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
  if (!b->is_empty()) {
    assert((int64_t)buffer_bytes + delta >= 0);
    buffer_bytes += delta;
    assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
    buffer_list_bytes[b->cache_private] += delta;
  }
}

void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
           << " buffers " << buffer_bytes << " / " << buffer_max
           << dendl;

  _audit("trim start");

  // buffers
  if (buffer_bytes > buffer_max) {
    uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio;
    uint64_t khot = buffer_max - kin;

    // pre-calculate kout based on average buffer size too,
    // which is typical (the warm_in and hot lists may change later)
    uint64_t kout = 0;
    uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
    if (buffer_num) {
      uint64_t buffer_avg_size = buffer_bytes / buffer_num;
      assert(buffer_avg_size);
      uint64_t calculated_buffer_num = buffer_max / buffer_avg_size;
      kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
    }

    if (buffer_list_bytes[BUFFER_HOT] < khot) {
      // hot is small, give slack to warm_in
      kin += khot - buffer_list_bytes[BUFFER_HOT];
    } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
      // warm_in is small, give slack to hot
      khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
    }

    // adjust warm_in list
    int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
    uint64_t evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_warm_in.rbegin();
      if (p == buffer_warm_in.rend()) {
        // stop if warm_in list is now empty
        break;
      }

      Buffer *b = &*p;
      assert(b->is_clean());
      dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
      assert(buffer_bytes >= b->length);
      buffer_bytes -= b->length;
      assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
      buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->state = Buffer::STATE_EMPTY;
      b->data.clear();
      buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
      buffer_warm_out.push_front(*b);
      b->cache_private = BUFFER_WARM_OUT;
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
               << " from warm_in list, done evicting warm_in buffers"
               << dendl;
    }

    // adjust hot list
    to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
    evicted = 0;

    while (to_evict_bytes > 0) {
      auto p = buffer_hot.rbegin();
      if (p == buffer_hot.rend()) {
        // stop if hot list is now empty
        break;
      }

      Buffer *b = &*p;
      dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
      assert(b->is_clean());
      // adjust evict size before buffer goes invalid
      to_evict_bytes -= b->length;
      evicted += b->length;
      b->space->_rm_buffer(this, b);
    }

    if (evicted > 0) {
      dout(20) << __func__ << " evicted " << prettybyte_t(evicted)
               << " from hot list, done evicting hot buffers"
               << dendl;
    }

    // adjust warm out list too, if necessary
    int64_t num = buffer_warm_out.size() - kout;
    while (num-- > 0) {
      Buffer *b = &*buffer_warm_out.rbegin();
      assert(b->is_empty());
      dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
      b->space->_rm_buffer(this, b);
    }
  }

  // onodes
  int num = onode_lru.size() - onode_max;
  if (num <= 0)
    return; // don't even try

  auto p = onode_lru.end();
  assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    dout(20) << __func__ << " considering " << o << dendl;
    int refs = o->nref.load();
    if (refs > 1) {
      dout(20) << __func__ << " " << o->oid << " has " << refs
               << " refs; skipping" << dendl;
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " " << o->oid << " num=" << num
             << " lru size=" << onode_lru.size() << dendl;
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      assert(num == 1);
    }
    o->get();  // paranoia
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}
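
// Sizing sketch (illustrative): with buffer_max = 100 MiB and
// bluestore_2q_cache_kin_ratio = 0.5, kin and khot are 50 MiB each; with an
// observed average buffer size of 64 KiB and kout_ratio = 0.5, kout allows
// (100 MiB / 64 KiB) * 0.5 = 800 empty warm_out entries to survive as
// history of recently evicted buffers.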

void BlueStore::TwoQCache::_audit(const char *when)
{
  dout(10) << __func__ << " " << when << " start" << dendl;
  uint64_t s = 0;
  for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
    s += i->length;
  }

  uint64_t hot_bytes = s;
  if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
    derr << __func__ << " hot_list_bytes "
         << buffer_list_bytes[BUFFER_HOT]
         << " != actual " << hot_bytes
         << dendl;
    assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
  }

  for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
    s += i->length;
  }

  uint64_t warm_in_bytes = s - hot_bytes;
  if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
    derr << __func__ << " warm_in_list_bytes "
         << buffer_list_bytes[BUFFER_WARM_IN]
         << " != actual " << warm_in_bytes
         << dendl;
    assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
  }

  if (s != buffer_bytes) {
    derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
         << dendl;
    assert(s == buffer_bytes);
  }

  dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
           << " ok" << dendl;
}

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "

void BlueStore::BufferSpace::_clear(Cache* cache)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << dendl;
  while (!buffer_map.empty()) {
    _rm_buffer(cache, buffer_map.begin());
  }
}

int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
{
  // note: we already hold cache->lock
  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
                        << std::dec << dendl;
  int cache_private = 0;
  cache->_audit("discard start");
  auto i = _data_lower_bound(offset);
  uint32_t end = offset + length;
  while (i != buffer_map.end()) {
    Buffer *b = i->second.get();
    if (b->offset >= end) {
      break;
    }
    if (b->cache_private > cache_private) {
      cache_private = b->cache_private;
    }
    if (b->offset < offset) {
      int64_t front = offset - b->offset;
      if (b->end() > end) {
        // drop middle (split)
        uint32_t tail = b->end() - end;
        if (b->data.length()) {
          bufferlist bl;
          bl.substr_of(b->data, b->length - tail, tail);
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, bl), 0, b);
        } else {
          _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail), 0, b);
        }
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        cache->_audit("discard end 1");
        break;
      } else {
        // drop tail
        if (!b->is_writing()) {
          cache->_adjust_buffer_size(b, front - (int64_t)b->length);
        }
        b->truncate(front);
        ++i;
        continue;
      }
    }
    if (b->end() <= end) {
      // drop entire buffer
      _rm_buffer(cache, i++);
      continue;
    }
    // drop front
    uint32_t keep = b->end() - end;
    if (b->data.length()) {
      bufferlist bl;
      bl.substr_of(b->data, b->length - keep, keep);
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, bl), 0, b);
    } else {
      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep), 0, b);
    }
    _rm_buffer(cache, i);
    cache->_audit("discard end 2");
    break;
  }
  return cache_private;
}
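
// Overlap cases handled above (recap): a buffer straddling the front of the
// discarded range is truncated in place; one that also extends past the end
// is split, keeping its head and re-adding its tail; a buffer fully inside
// the range is removed; and one straddling only the back is replaced by the
// surviving tail at offset `end`.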

void BlueStore::BufferSpace::read(
  Cache* cache,
  uint32_t offset, uint32_t length,
  BlueStore::ready_regions_t& res,
  interval_set<uint32_t>& res_intervals)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  res.clear();
  res_intervals.clear();
  uint32_t want_bytes = length;
  uint32_t end = offset + length;
  for (auto i = _data_lower_bound(offset);
       i != buffer_map.end() && offset < end && i->first < end;
       ++i) {
    Buffer *b = i->second.get();
    assert(b->end() > offset);
    if (b->is_writing() || b->is_clean()) {
      if (b->offset < offset) {
        uint32_t skip = offset - b->offset;
        uint32_t l = MIN(length, b->length - skip);
        res[offset].substr_of(b->data, skip, l);
        res_intervals.insert(offset, l);
        offset += l;
        length -= l;
        if (!b->is_writing()) {
          cache->_touch_buffer(b);
        }
        continue;
      }
      if (b->offset > offset) {
        uint32_t gap = b->offset - offset;
        if (length <= gap) {
          break;
        }
        offset += gap;
        length -= gap;
      }
      if (!b->is_writing()) {
        cache->_touch_buffer(b);
      }
      if (b->length > length) {
        res[offset].substr_of(b->data, 0, length);
        res_intervals.insert(offset, length);
        break;
      } else {
        res[offset].append(b->data);
        res_intervals.insert(offset, b->length);
        if (b->length == length)
          break;
        offset += b->length;
        length -= b->length;
      }
    }
  }

  uint64_t hit_bytes = res_intervals.size();
  assert(hit_bytes <= want_bytes);
  uint64_t miss_bytes = want_bytes - hit_bytes;
  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}

void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);

  auto i = writing.begin();
  while (i != writing.end()) {
    if (i->seq > seq) {
      break;
    }
    if (i->seq < seq) {
      ++i;
      continue;
    }

    Buffer *b = &*i;
    assert(b->is_writing());

    if (b->flags & Buffer::FLAG_NOCACHE) {
      writing.erase(i++);
      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
      buffer_map.erase(b->offset);
    } else {
      b->state = Buffer::STATE_CLEAN;
      writing.erase(i++);
      cache->_add_buffer(b, 1, nullptr);
      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
    }
  }

  cache->_audit("finish_write end");
}

void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
{
  std::lock_guard<std::recursive_mutex> lk(cache->lock);
  if (buffer_map.empty())
    return;

  auto p = --buffer_map.end();
  while (true) {
    if (p->second->end() <= pos)
      break;

    if (p->second->offset < pos) {
      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
      size_t left = pos - p->second->offset;
      size_t right = p->second->length - left;
      if (p->second->data.length()) {
        bufferlist bl;
        bl.substr_of(p->second->data, left, right);
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, bl),
                      0, p->second.get());
      } else {
        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
                      0, p->second.get());
      }
      cache->_adjust_buffer_size(p->second.get(), -right);
      p->second->truncate(left);
      break;
    }

    assert(p->second->end() > pos);
    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
    if (p->second->data.length()) {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->data),
                    0, p->second.get());
    } else {
      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
                                      p->second->offset - pos, p->second->length),
                    0, p->second.get());
    }
    if (p == buffer_map.begin()) {
      _rm_buffer(cache, p);
      break;
    } else {
      _rm_buffer(cache, p--);
    }
  }
  assert(writing.empty());
}

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                          << " raced, returning existing " << p->second
                          << dendl;
    return p->second;
  }
  ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add_onode(o, 1);
  return o;
}

BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 30) << __func__ << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
  if (p == onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    cache->logger->inc(l_bluestore_onode_misses);
    return OnodeRef();
  }
  ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                        << dendl;
  cache->_touch_onode(p->second);
  cache->logger->inc(l_bluestore_onode_hits);
  return p->second;
}

void BlueStore::OnodeSpace::clear()
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 10) << __func__ << dendl;
  for (auto &p : onode_map) {
    cache->_rm_onode(p.second);
  }
  onode_map.clear();
}

bool BlueStore::OnodeSpace::empty()
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  return onode_map.empty();
}

void BlueStore::OnodeSpace::rename(
  OnodeRef& oldo,
  const ghobject_t& old_oid,
  const ghobject_t& new_oid,
  const mempool::bluestore_meta_other::string& new_okey)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
                        << dendl;
  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
  po = onode_map.find(old_oid);
  pn = onode_map.find(new_oid);
  assert(po != pn);

  assert(po != onode_map.end());
  if (pn != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
                          << dendl;
    cache->_rm_onode(pn->second);
    onode_map.erase(pn);
  }
  OnodeRef o = po->second;

  // install a non-existent onode at old location
  oldo.reset(new Onode(o->c, old_oid, o->key));
  po->second = oldo;
  cache->_add_onode(po->second, 1);

  // add at new position and fix oid, key
  onode_map.insert(make_pair(new_oid, o));
  cache->_touch_onode(o);
  o->oid = new_oid;
  o->key = new_okey;
}

bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  ldout(cache->cct, 20) << __func__ << dendl;
  for (auto& i : onode_map) {
    if (f(i.second)) {
      return true;
    }
  }
  return false;
}

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "

ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
{
  out << "SharedBlob(" << &sb;
  if (sb.loaded) {
    out << " loaded " << *sb.persistent;
  } else {
    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
  }
  return out << ")";
}

BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
  : coll(_coll), sbid_unloaded(i)
{
  assert(sbid_unloaded > 0);
  if (get_cache()) {
    get_cache()->add_blob();
  }
}

BlueStore::SharedBlob::~SharedBlob()
{
  if (get_cache()) {   // the dummy instances have a nullptr
    std::lock_guard<std::recursive_mutex> l(get_cache()->lock);
    bc._clear(get_cache());
    get_cache()->rm_blob();
  }
  if (loaded && persistent) {
    delete persistent;
  }
}

void BlueStore::SharedBlob::put()
{
  if (--nref == 0) {
    ldout(coll->store->cct, 20) << __func__ << " " << this
                                << " removing self from set " << get_parent()
                                << dendl;
    if (get_parent()) {
      if (get_parent()->remove(this)) {
        delete this;
      } else {
        ldout(coll->store->cct, 20)
          << __func__ << " " << this << " lost race to remove myself from set"
          << dendl;
      }
    }
  }
}

void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
{
  persistent->ref_map.get(offset, length);
}

void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
                                    PExtentVector *r)
{
  persistent->ref_map.put(offset, length, r);
}

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.blob(" << this << ") "

ostream& operator<<(ostream& out, const BlueStore::Blob& b)
{
  out << "Blob(" << &b;
  if (b.is_spanning()) {
    out << " spanning " << b.id;
  }
  out << " " << b.get_blob() << " " << b.get_blob_use_tracker()
      << " " << *b.shared_blob
      << ")";
  return out;
}

void BlueStore::Blob::discard_unallocated(Collection *coll)
{
  if (blob.is_shared()) {
    return;
  }
  if (blob.is_compressed()) {
    bool discard = false;
    bool all_invalid = true;
    for (auto e : blob.get_extents()) {
      if (!e.is_valid()) {
        discard = true;
      } else {
        all_invalid = false;
      }
    }
    assert(discard == all_invalid); // in case of compressed blob all
                                    // or none pextents are invalid.
    if (discard) {
      shared_blob->bc.discard(shared_blob->get_cache(), 0, blob.get_logical_length());
    }
  } else {
    size_t pos = 0;
    for (auto e : blob.get_extents()) {
      if (!e.is_valid()) {
        ldout(coll->store->cct, 20) << __func__ << " 0x" << std::hex << pos
                                    << "~" << e.length
                                    << std::dec << dendl;
        shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
      }
      pos += e.length;
    }
    if (blob.can_prune_tail()) {
      dirty_blob().prune_tail();
      used_in_blob.prune_tail(blob.get_ondisk_length());
      auto cct = coll->store->cct; //used by dout
      dout(20) << __func__ << " pruned tail, now " << blob << dendl;
    }
  }
}

void BlueStore::Blob::get_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length)
{
  // Caller has to initialize the Blob's logical length prior to incrementing
  // references.  Otherwise one is unable to determine the required number of
  // counters for per-au tracking, or to obtain min_release_size for
  // single-counter mode.
  assert(get_blob().get_logical_length() != 0);
  auto cct = coll->store->cct;
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  if (used_in_blob.is_empty()) {
    uint32_t min_release_size =
      blob.get_release_size(coll->store->min_alloc_size);
    uint64_t l = blob.get_logical_length();
    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
             << min_release_size << std::dec << dendl;
    used_in_blob.init(l, min_release_size);
  }
  used_in_blob.get(
    offset,
    length);
}
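
// Note (assumed behavior): min_release_size sets the ref-counting
// granularity of used_in_blob.  E.g. a 256 KiB blob tracked at a 64 KiB
// release size keeps four counters, so a fully unreferenced 64 KiB unit can
// be released without waiting for the rest of the blob to drain.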

bool BlueStore::Blob::put_ref(
  Collection *coll,
  uint32_t offset,
  uint32_t length,
  PExtentVector *r)
{
  PExtentVector logical;

  auto cct = coll->store->cct;
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << " " << *this << dendl;

  bool empty = used_in_blob.put(
    offset,
    length,
    &logical);

  // nothing to release
  if (!empty && logical.empty()) {
    return false;
  }

  bluestore_blob_t& b = dirty_blob();
  return b.release_extents(empty, logical, r);
}

bool BlueStore::Blob::try_reuse_blob(uint32_t min_alloc_size,
                                     uint32_t target_blob_size,
                                     uint32_t b_offset,
                                     uint32_t *length0) {
  assert(min_alloc_size);
  assert(target_blob_size);
  if (!get_blob().is_mutable()) {
    return false;
  }

  uint32_t length = *length0;
  uint32_t end = b_offset + length;

  // Currently for the sake of simplicity we omit blob reuse if data is
  // unaligned with csum chunk. Later we can perform padding if needed.
  if (get_blob().has_csum() &&
      ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
       (end % get_blob().get_csum_chunk_size()) != 0)) {
    return false;
  }

  auto blen = get_blob().get_logical_length();
  uint32_t new_blen = blen;

  // make sure target_blob_size isn't less than current blob len
  target_blob_size = MAX(blen, target_blob_size);

  if (b_offset >= blen) {
    // new data totally stands out of the existing blob
    new_blen = b_offset + length;
  } else {
    // new data overlaps with the existing blob
    new_blen = MAX(blen, length + b_offset);
    if (!get_blob().is_unallocated(
          b_offset,
          new_blen > blen ? blen - b_offset : length)) {
      return false;
    }
  }

  if (new_blen > blen) {
    int64_t overflow = int64_t(new_blen) - target_blob_size;
    // Unable to decrease the provided length to fit into max_blob_size
    if (overflow >= length) {
      return false;
    }
    // FIXME: in some cases we could reduce unused resolution
    if (get_blob().has_unused()) {
      return false;
    }

    if (overflow > 0) {
      new_blen -= overflow;
      length -= overflow;
      *length0 = length;
    }

    if (new_blen > blen) {
      dirty_blob().add_tail(new_blen);
      used_in_blob.add_tail(new_blen,
                            blob.get_release_size(min_alloc_size));
    }
  }
  return true;
}
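
// Worked example (illustrative): blen = 0x10000, b_offset = 0xc000,
// length = 0x8000 gives new_blen = 0x14000; with target_blob_size = 0x12000
// the overflow is 0x2000, so both new_blen and the returned *length0 are
// trimmed by that amount and the blob tail grows to 0x12000 (provided no
// csum or unused-tracking constraints apply).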

void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
{
  auto cct = coll->store->cct; //used by dout
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " start " << *this << dendl;
  assert(blob.can_split());
  assert(used_in_blob.can_split());
  bluestore_blob_t &lb = dirty_blob();
  bluestore_blob_t &rb = r->dirty_blob();

  used_in_blob.split(
    blob_offset,
    &(r->used_in_blob));

  lb.split(blob_offset, rb);
  shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);

  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " finish " << *this << dendl;
  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
           << " and " << *r << dendl;
}

#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
  Collection *coll,
  bufferptr::iterator& p,
  uint64_t struct_v,
  uint64_t* sbid,
  bool include_ref_map)
{
  denc(blob, p, struct_v);
  if (blob.is_shared()) {
    denc(*sbid, p);
  }
  if (include_ref_map) {
    if (struct_v > 1) {
      used_in_blob.decode(p);
    } else {
      used_in_blob.clear();
      bluestore_extent_ref_map_t legacy_ref_map;
      legacy_ref_map.decode(p);
      for (auto r : legacy_ref_map.ref_map) {
        get_ref(
          coll,
          r.first,
          r.second.refs * r.second.length);
      }
    }
  }
}
#endif

ostream& operator<<(ostream& out, const BlueStore::Extent& e)
{
  return out << std::hex << "0x" << e.logical_offset << "~" << e.length
             << ": 0x" << e.blob_offset << "~" << e.length << std::dec
             << " " << *e.blob;
}

BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
                                                   uint32_t lo,
                                                   uint32_t o,
                                                   uint32_t l,
                                                   BlobRef& b) {
  OldExtent* oe = new OldExtent(lo, o, l, b);
  b->put_ref(c.get(), o, l, &(oe->r));
  oe->blob_empty = b->get_referenced_bytes() == 0;
  return oe;
}

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "

BlueStore::ExtentMap::ExtentMap(Onode *o)
  : onode(o),
    inline_bl(
      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
}

void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
                                  bool force)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
  if (onode->onode.extent_map_shards.empty()) {
    if (inline_bl.length() == 0) {
      unsigned n;
      // we need to encode inline_bl to measure encoded length
      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
      assert(!never_happen);
      size_t len = inline_bl.length();
      dout(20) << __func__ << " inline shard " << len << " bytes from " << n
               << " extents" << dendl;
      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
        request_reshard(0, OBJECT_MAX_SIZE);
        return;
      }
    }
    // will persist in the onode key.
  } else {
    // pending shard update
    struct dirty_shard_t {
      Shard *shard;
      bufferlist bl;
      dirty_shard_t(Shard *s) : shard(s) {}
    };
    vector<dirty_shard_t> encoded_shards;
    // allocate slots for all shards in a single call instead of
    // doing multiple allocations - one per each dirty shard
    encoded_shards.reserve(shards.size());

    auto p = shards.begin();
    auto prev_p = p;
    while (p != shards.end()) {
      auto n = p;
      ++n;
      if (p->dirty) {
        uint32_t endoff;
        if (n == shards.end()) {
          endoff = OBJECT_MAX_SIZE;
        } else {
          endoff = n->shard_info->offset;
        }
        encoded_shards.emplace_back(dirty_shard_t(&(*p)));
        bufferlist& bl = encoded_shards.back().bl;
        if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
                        bl, &p->extents)) {
          if (force) {
            derr << __func__ << " encode_some needs reshard" << dendl;
            assert(!force);
          }
        }
        size_t len = bl.length();

        dout(20) << __func__ << " shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " is " << len
                 << " bytes (was " << p->shard_info->bytes << ") from "
                 << p->extents << " extents" << dendl;

        if (!force) {
          if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
            // we are big; reshard ourselves
            request_reshard(p->shard_info->offset, endoff);
          }
          // avoid resharding the trailing shard, even if it is small
          else if (n != shards.end() &&
                   len < g_conf->bluestore_extent_map_shard_min_size) {
            // we are small; combine with a neighbor
            if (p == shards.begin() && endoff == OBJECT_MAX_SIZE) {
              // we are an only shard
              request_reshard(0, OBJECT_MAX_SIZE);
            } else if (p == shards.begin()) {
              // combine with next shard
              request_reshard(p->shard_info->offset, endoff + 1);
            } else if (endoff == OBJECT_MAX_SIZE) {
              // combine with previous shard
              request_reshard(prev_p->shard_info->offset, endoff);
            } else {
              // combine with the smaller of the two
              if (prev_p->shard_info->bytes > n->shard_info->bytes) {
                request_reshard(p->shard_info->offset, endoff + 1);
              } else {
                request_reshard(prev_p->shard_info->offset, endoff);
              }
            }
          }
        }
      }
      prev_p = p;
      p = n;
    }
    if (needs_reshard()) {
      return;
    }

    // schedule DB update for dirty shards
    string key;
    for (auto& it : encoded_shards) {
      it.shard->dirty = false;
      it.shard->shard_info->bytes = it.bl.length();
      generate_extent_shard_key_and_apply(
        onode->key,
        it.shard->shard_info->offset,
        &key,
        [&](const string& final_key) {
          t->set(PREFIX_OBJ, final_key, it.bl);
        });
    }
  }
}
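
// Policy recap (illustrative thresholds): with
// bluestore_extent_map_shard_max_size = 1200 and
// bluestore_extent_map_shard_min_size = 150, a shard that encodes to 1400
// bytes requests a reshard of itself, while a 100-byte shard asks to be
// combined with a neighbor, preferring whichever adjacent shard is smaller.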

void BlueStore::ExtentMap::reshard(
  KeyValueDB *db,
  KeyValueDB::Transaction t)
{
  auto cct = onode->c->store->cct; // used by dout

  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
           << needs_reshard_end << ")" << std::dec
           << " of " << onode->onode.extent_map_shards.size()
           << " shards on " << onode->oid << dendl;
  for (auto& p : spanning_blob_map) {
    dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
             << dendl;
  }
  // determine shard index range
  unsigned si_begin = 0, si_end = 0;
  if (!shards.empty()) {
    while (si_begin + 1 < shards.size() &&
           shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
      ++si_begin;
    }
    needs_reshard_begin = shards[si_begin].shard_info->offset;
    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
        needs_reshard_end = shards[si_end].shard_info->offset;
        break;
      }
    }
    if (si_end == shards.size()) {
      needs_reshard_end = OBJECT_MAX_SIZE;
    }
    dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
             << " over 0x[" << std::hex << needs_reshard_begin << ","
             << needs_reshard_end << ")" << std::dec << dendl;
  }

  fault_range(db, needs_reshard_begin, needs_reshard_end - needs_reshard_begin);

  // we may need to fault in a larger interval later must have all
  // referring extents for spanning blobs loaded in order to have
  // accurate use_tracker values.
  uint32_t spanning_scan_begin = needs_reshard_begin;
  uint32_t spanning_scan_end = needs_reshard_end;

  // remove old keys
  string key;
  for (unsigned i = si_begin; i < si_end; ++i) {
    generate_extent_shard_key_and_apply(
      onode->key, shards[i].shard_info->offset, &key,
      [&](const string& final_key) {
        t->rmkey(PREFIX_OBJ, final_key);
      });
  }

  // calculate average extent size
  unsigned bytes = 0;
  unsigned extents = 0;
  if (onode->onode.extent_map_shards.empty()) {
    bytes = inline_bl.length();
    extents = extent_map.size();
  } else {
    for (unsigned i = si_begin; i < si_end; ++i) {
      bytes += shards[i].shard_info->bytes;
      extents += shards[i].extents;
    }
  }
  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
  unsigned slop = target *
    cct->_conf->bluestore_extent_map_shard_target_size_slop;
  unsigned extent_avg = bytes / MAX(1, extents);
  dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
           << ", slop " << slop << dendl;

  // reshard
  unsigned estimate = 0;
  unsigned offset = 0;
  vector<bluestore_onode_t::shard_info> new_shard_info;
  unsigned max_blob_end = 0;
  Extent dummy(needs_reshard_begin);
  for (auto e = extent_map.lower_bound(dummy);
       e != extent_map.end();
       ++e) {
    if (e->logical_offset >= needs_reshard_end) {
      break;
    }
    dout(30) << " extent " << *e << dendl;

    // disfavor shard boundaries that span a blob
    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
    if (estimate &&
        estimate + extent_avg > target + (would_span ? slop : 0)) {
      // new shard
      if (offset == 0) {
        new_shard_info.emplace_back(bluestore_onode_t::shard_info());
        new_shard_info.back().offset = offset;
        dout(20) << __func__ << " new shard 0x" << std::hex << offset
                 << std::dec << dendl;
      }
      offset = e->logical_offset;
      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
      new_shard_info.back().offset = offset;
      dout(20) << __func__ << " new shard 0x" << std::hex << offset
               << std::dec << dendl;
      estimate = 0;
    }
    estimate += extent_avg;
    unsigned bb = e->blob_start();
    if (bb < spanning_scan_begin) {
      spanning_scan_begin = bb;
    }
    uint32_t be = e->blob_end();
    if (be > max_blob_end) {
      max_blob_end = be;
    }
    if (be > spanning_scan_end) {
      spanning_scan_end = be;
    }
  }
  if (new_shard_info.empty() && (si_begin > 0 ||
                                 si_end < shards.size())) {
    // we resharded a partial range; we must produce at least one output
    // shard
    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
    new_shard_info.back().offset = needs_reshard_begin;
    dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
             << std::dec << " (singleton degenerate case)" << dendl;
  }

  auto& sv = onode->onode.extent_map_shards;
  dout(20) << __func__ << " new " << new_shard_info << dendl;
  dout(20) << __func__ << " old " << sv << dendl;
  if (sv.empty()) {
    // no old shards to keep
    sv.swap(new_shard_info);
    init_shards(true, true);
  } else {
    // splice in new shards
    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
    sv.insert(
      sv.begin() + si_begin,
      new_shard_info.begin(),
      new_shard_info.end());
    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
    unsigned n = sv.size();
    si_end = si_begin + new_shard_info.size();
    for (unsigned i = si_begin; i < si_end; ++i) {
      shards[i].shard_info = &sv[i];
      shards[i].loaded = true;
      shards[i].dirty = true;
    }
    for (unsigned i = si_end; i < n; ++i) {
      shards[i].shard_info = &sv[i];
    }
  }
  dout(20) << __func__ << " fin " << sv << dendl;
  inline_bl.clear();

  if (sv.empty()) {
    // no more shards; unspan all previously spanning blobs
    auto p = spanning_blob_map.begin();
    while (p != spanning_blob_map.end()) {
      p->second->id = -1;
      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
      p = spanning_blob_map.erase(p);
    }
  } else {
    // identify new spanning blobs
    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
             << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
    if (spanning_scan_begin < needs_reshard_begin) {
      fault_range(db, spanning_scan_begin,
                  needs_reshard_begin - spanning_scan_begin);
    }
    if (spanning_scan_end > needs_reshard_end) {
      fault_range(db, needs_reshard_end,
                  spanning_scan_end - needs_reshard_begin);
    }
    auto sp = sv.begin() + si_begin;
    auto esp = sv.end();
    unsigned shard_start = sp->offset;
    unsigned shard_end;
    ++sp;
    if (sp == esp) {
      shard_end = OBJECT_MAX_SIZE;
    } else {
      shard_end = sp->offset;
    }
    int bid;
    if (spanning_blob_map.empty()) {
      bid = 0;
    } else {
      bid = spanning_blob_map.rbegin()->first + 1;
    }
    Extent dummy(needs_reshard_begin);
    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
      if (e->logical_offset >= needs_reshard_end) {
        break;
      }
      dout(30) << " extent " << *e << dendl;
      while (e->logical_offset >= shard_end) {
        shard_start = shard_end;
        assert(sp != esp);
        ++sp;
        if (sp == esp) {
          shard_end = OBJECT_MAX_SIZE;
        } else {
          shard_end = sp->offset;
        }
        dout(30) << __func__ << " shard 0x" << std::hex << shard_start
                 << " to 0x" << shard_end << std::dec << dendl;
      }
      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
        if (!e->blob->is_spanning()) {
          // We have two options: (1) split the blob into pieces at the
          // shard boundaries (and adjust extents accordingly), or (2)
          // mark it spanning.  We prefer to cut the blob if we can.  Note that
          // we may have to split it multiple times--potentially at every
          // shard boundary.
          bool must_span = false;
          BlobRef b = e->blob;
          if (b->can_split()) {
            uint32_t bstart = e->blob_start();
            uint32_t bend = e->blob_end();
            for (const auto& sh : shards) {
              if (bstart < sh.shard_info->offset &&
                  bend > sh.shard_info->offset) {
                uint32_t blob_offset = sh.shard_info->offset - bstart;
                if (b->can_split_at(blob_offset)) {
                  dout(20) << __func__ << " splitting blob, bstart 0x"
                           << std::hex << bstart << " blob_offset 0x"
                           << blob_offset << std::dec << " " << *b << dendl;
                  b = split_blob(b, blob_offset, sh.shard_info->offset);
                  // switch b to the new right-hand side, in case it
                  // *also* has to get split.
                  bstart += blob_offset;
                  onode->c->store->logger->inc(l_bluestore_blob_split);
                } else {
                  must_span = true;
                  break;
                }
              }
            }
          } else {
            must_span = true;
          }
          if (must_span) {
            b->id = bid++;
            spanning_blob_map[b->id] = b;
            dout(20) << __func__ << " adding spanning " << *b << dendl;
          }
        }
      } else {
        if (e->blob->is_spanning()) {
          spanning_blob_map.erase(e->blob->id);
          e->blob->id = -1;
          dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
        }
      }
    }
  }

  clear_needs_reshard();
}
bool BlueStore::ExtentMap::encode_some(
  uint32_t offset,
  uint32_t length,
  bufferlist& bl,
  unsigned *pn)
{
  auto cct = onode->c->store->cct; //used by dout
  Extent dummy(offset);
  auto start = extent_map.lower_bound(dummy);
  uint32_t end = offset + length;

  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
                     // serialization only. Hence there is no specific
                     // handling at ExtentMap level.

  unsigned n = 0;
  size_t bound = 0;
  denc(struct_v, bound);
  denc_varint(0, bound); // number of extents

  bool must_reshard = false;
  for (auto p = start;
       p != extent_map.end() && p->logical_offset < end;
       ++p, ++n) {
    assert(p->logical_offset >= offset);
    p->blob->last_encoded_id = -1;
    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
               << std::dec << " hit new spanning blob " << *p << dendl;
      request_reshard(p->blob_start(), p->blob_end());
      must_reshard = true;
    }

    denc_varint(0, bound); // blobid
    denc_varint(0, bound); // logical_offset
    denc_varint(0, bound); // len
    denc_varint(0, bound); // blob_offset

    p->blob->bound_encode(
      bound,
      struct_v,
      p->blob->shared_blob->get_sbid(),
      false);
  }
  if (must_reshard) {
    return true;
  }

  {
    auto app = bl.get_contiguous_appender(bound);
    denc(struct_v, app);
    denc_varint(n, app);
    if (pn) {
      *pn = n;
    }

    n = 0;
    uint64_t pos = 0;
    uint64_t prev_len = 0;
    for (auto p = start;
         p != extent_map.end() && p->logical_offset < end;
         ++p, ++n) {
      unsigned blobid;
      bool include_blob = false;
      if (p->blob->is_spanning()) {
        blobid = p->blob->id << BLOBID_SHIFT_BITS;
        blobid |= BLOBID_FLAG_SPANNING;
      } else if (p->blob->last_encoded_id < 0) {
        p->blob->last_encoded_id = n + 1;  // so it is always non-zero
        include_blob = true;
        blobid = 0;  // the decoder will infer the id from n
      } else {
        blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
      }
      if (p->logical_offset == pos) {
        blobid |= BLOBID_FLAG_CONTIGUOUS;
      }
      if (p->blob_offset == 0) {
        blobid |= BLOBID_FLAG_ZEROOFFSET;
      }
      if (p->length == prev_len) {
        blobid |= BLOBID_FLAG_SAMELENGTH;
      } else {
        prev_len = p->length;
      }
      denc_varint(blobid, app);
      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
        denc_varint_lowz(p->logical_offset - pos, app);
      }
      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
        denc_varint_lowz(p->blob_offset, app);
      }
      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
        denc_varint_lowz(p->length, app);
      }
      pos = p->logical_end();
      if (include_blob) {
        p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
      }
    }
  }
  /*derr << __func__ << bl << dendl;
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */
  return false;
}
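
// A sketch of the blobid bit packing used by encode_some()/decode_some()
// above (values here are hypothetical).  The low BLOBID_SHIFT_BITS carry
// per-extent flags; the remaining bits carry the blob id:
//
//   uint64_t blobid = (uint64_t)7 << BLOBID_SHIFT_BITS;  // blob id 7
//   blobid |= BLOBID_FLAG_CONTIGUOUS;   // no gap varint will follow
//   blobid |= BLOBID_FLAG_ZEROOFFSET;   // no blob_offset varint will follow
//   blobid |= BLOBID_FLAG_SAMELENGTH;   // no length varint will follow
//   ...
//   bool spanning = blobid & BLOBID_FLAG_SPANNING;
//   uint64_t id   = blobid >> BLOBID_SHIFT_BITS;
//
// In the best case an extent that abuts its predecessor, starts at blob
// offset 0, and repeats the previous length costs a single varint.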
unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
{
  auto cct = onode->c->store->cct; //used by dout
  /*
  derr << __func__ << ":";
  bl.hexdump(*_dout);
  *_dout << dendl;
  */

  assert(bl.get_num_buffers() <= 1);
  auto p = bl.front().begin_deep();
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level below.
  assert(struct_v == 1 || struct_v == 2);

  uint32_t num;
  denc_varint(num, p);
  vector<BlobRef> blobs(num);
  uint64_t pos = 0;
  uint64_t prev_len = 0;
  unsigned n = 0;

  while (!p.end()) {
    Extent *le = new Extent();
    uint64_t blobid;
    denc_varint(blobid, p);
    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
      uint64_t gap;
      denc_varint_lowz(gap, p);
      pos += gap;
    }
    le->logical_offset = pos;
    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
      denc_varint_lowz(le->blob_offset, p);
    } else {
      le->blob_offset = 0;
    }
    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
      denc_varint_lowz(prev_len, p);
    }
    le->length = prev_len;

    if (blobid & BLOBID_FLAG_SPANNING) {
      dout(30) << __func__ << "  getting spanning blob "
               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
    } else {
      blobid >>= BLOBID_SHIFT_BITS;
      if (blobid) {
        le->assign_blob(blobs[blobid - 1]);
        assert(le->blob);
      } else {
        Blob *b = new Blob();
        uint64_t sbid = 0;
        b->decode(onode->c, p, struct_v, &sbid, false);
        blobs[n] = b;
        onode->c->open_shared_blob(sbid, b);
        le->assign_blob(b);
      }
      // we build ref_map dynamically for non-spanning blobs
      le->blob->get_ref(
        onode->c,
        le->blob_offset,
        le->length);
    }
    pos += prev_len;
    ++n;
    extent_map.insert(*le);
  }

  assert(n == num);
  return num;
}
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;
  denc(struct_v, p);
  denc_varint((uint32_t)0, p);
  size_t key_size = 0;
  denc_varint((uint32_t)0, key_size);
  p += spanning_blob_map.size() * key_size;
  for (const auto& i : spanning_blob_map) {
    i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}
void BlueStore::ExtentMap::encode_spanning_blobs(
  bufferlist::contiguous_appender& p)
{
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  __u8 struct_v = 2;
  denc(struct_v, p);
  denc_varint(spanning_blob_map.size(), p);
  for (auto& i : spanning_blob_map) {
    denc_varint(i.second->id, p);
    i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
  }
}
void BlueStore::ExtentMap::decode_spanning_blobs(
  bufferptr::iterator& p)
{
  __u8 struct_v;
  denc(struct_v, p);
  // Version 2 differs from v1 in blob's ref_map
  // serialization only. Hence there is no specific
  // handling at ExtentMap level.
  assert(struct_v == 1 || struct_v == 2);

  unsigned n;
  denc_varint(n, p);
  while (n--) {
    BlobRef b(new Blob());
    denc_varint(b->id, p);
    spanning_blob_map[b->id] = b;
    uint64_t sbid = 0;
    b->decode(onode->c, p, struct_v, &sbid, true);
    onode->c->open_shared_blob(sbid, b);
  }
}
void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
  shards.resize(onode->onode.extent_map_shards.size());
  unsigned i = 0;
  for (auto &s : onode->onode.extent_map_shards) {
    shards[i].shard_info = &s;
    shards[i].loaded = loaded;
    shards[i].dirty = dirty;
    ++i;
  }
}
void BlueStore::ExtentMap::fault_range(
  KeyValueDB *db,
  uint32_t offset,
  uint32_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);

  if (start < 0)
    return;

  assert(last >= start);
  string key;
  while (start <= last) {
    assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(30) << __func__ << " opening shard 0x" << std::hex
               << p->shard_info->offset << std::dec << dendl;
      bufferlist v;
      generate_extent_shard_key_and_apply(
        onode->key, p->shard_info->offset, &key,
        [&](const string& final_key) {
          int r = db->get(PREFIX_OBJ, final_key, &v);
          if (r < 0) {
            derr << __func__ << " missing shard 0x" << std::hex
                 << p->shard_info->offset << std::dec << " for " << onode->oid
                 << dendl;
            assert(v.length() > 0);
          }
        });
      p->extents = decode_some(v);
      p->loaded = true;
      dout(20) << __func__ << " open shard 0x" << std::hex
               << p->shard_info->offset << std::dec
               << " (" << v.length() << " bytes)" << dendl;
      assert(p->dirty == false);
      assert(v.length() == p->shard_info->bytes);
      onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
    } else {
      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
    }
    ++start;
  }
}
void BlueStore::ExtentMap::dirty_range(
  KeyValueDB::Transaction t,
  uint32_t offset,
  uint32_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  if (shards.empty()) {
    dout(20) << __func__ << " mark inline shard dirty" << dendl;
    inline_bl.clear();
    return;
  }
  auto start = seek_shard(offset);
  auto last = seek_shard(offset + length);
  if (start < 0)
    return;

  assert(last >= start);
  while (start <= last) {
    assert((size_t)start < shards.size());
    auto p = &shards[start];
    if (!p->loaded) {
      dout(20) << __func__ << " shard 0x" << std::hex << p->shard_info->offset
               << std::dec << " is not loaded, can't mark dirty" << dendl;
      assert(0 == "can't mark unloaded shard dirty");
    }
    if (!p->dirty) {
      dout(20) << __func__ << " mark shard 0x" << std::hex
               << p->shard_info->offset << std::dec << " dirty" << dendl;
      p->dirty = true;
    }
    ++start;
  }
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
  uint64_t offset)
{
  Extent dummy(offset);
  return extent_map.find(dummy);
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find_lextent(
  uint64_t offset)
{
  auto fp = seek_lextent(offset);
  if (fp != extent_map.end() && fp->logical_offset > offset)
    return extent_map.end();  // extent is past offset
  return fp;
}
BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset)
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}
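
// For illustration, how seek_lextent() behaves against two hypothetical
// extents [0~100) and [200~100):
//
//   seek_lextent(50)  -> [0~100)    lower_bound lands past it; --fp still
//                                   covers offset 50, so it is returned
//   seek_lextent(150) -> [200~100)  the backed-up extent ends at 100 <= 150,
//                                   so we step forward again
//   seek_lextent(300) -> end()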
BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
  uint64_t offset) const
{
  Extent dummy(offset);
  auto fp = extent_map.lower_bound(dummy);
  if (fp != extent_map.begin()) {
    --fp;
    if (fp->logical_end() <= offset) {
      ++fp;
    }
  }
  return fp;
}
bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
{
  auto fp = seek_lextent(offset);
  if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
    return false;
  }
  return true;
}
int BlueStore::ExtentMap::compress_extent_map(
  uint64_t offset,
  uint64_t length)
{
  auto cct = onode->c->store->cct; //used by dout
  if (extent_map.empty())
    return 0;
  int removed = 0;
  auto p = seek_lextent(offset);
  if (p != extent_map.begin()) {
    --p;  // start to the left of offset
  }
  // the caller should have just written to this region
  assert(p != extent_map.end());

  // identify the *next* shard
  auto pshard = shards.begin();
  while (pshard != shards.end() &&
         p->logical_offset >= pshard->shard_info->offset) {
    ++pshard;
  }
  uint64_t shard_end;
  if (pshard != shards.end()) {
    shard_end = pshard->shard_info->offset;
  } else {
    shard_end = OBJECT_MAX_SIZE;
  }

  auto n = p;
  for (++n; n != extent_map.end(); p = n++) {
    if (n->logical_offset > offset + length) {
      break;  // stop after end
    }
    while (n != extent_map.end() &&
           p->logical_end() == n->logical_offset &&
           p->blob == n->blob &&
           p->blob_offset + p->length == n->blob_offset &&
           n->logical_offset < shard_end) {
      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
               << " next shard 0x" << shard_end << std::dec
               << " merging " << *p << " and " << *n << dendl;
      p->length += n->length;
      rm(n++);
      ++removed;
    }
    if (n == extent_map.end()) {
      break;
    }
    if (n->logical_offset >= shard_end) {
      assert(pshard != shards.end());
      ++pshard;
      if (pshard != shards.end()) {
        shard_end = pshard->shard_info->offset;
      } else {
        shard_end = OBJECT_MAX_SIZE;
      }
    }
  }
  if (removed && onode) {
    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
  }
  return removed;
}
void BlueStore::ExtentMap::punch_hole(
  CollectionRef &c,
  uint64_t offset,
  uint64_t length,
  old_extent_map_t *old_extents)
{
  auto p = seek_lextent(offset);
  uint64_t end = offset + length;
  while (p != extent_map.end()) {
    if (p->logical_offset >= end) {
      break;
    }
    if (p->logical_offset < offset) {
      if (p->logical_end() > end) {
        // split and deref middle
        uint64_t front = offset - p->logical_offset;
        OldExtent *oe = OldExtent::create(c, offset, p->blob_offset + front,
                                          length, p->blob);
        old_extents->push_back(*oe);
        add(end,
            p->blob_offset + front + length,
            p->length - front - length,
            p->blob);
        p->length = front;
        break;
      } else {
        // deref tail
        assert(p->logical_end() > offset); // else seek_lextent bug
        uint64_t keep = offset - p->logical_offset;
        OldExtent *oe = OldExtent::create(c, offset, p->blob_offset + keep,
                                          p->length - keep, p->blob);
        old_extents->push_back(*oe);
        p->length = keep;
        ++p;
        continue;
      }
    }
    if (p->logical_offset + p->length <= end) {
      // deref whole lextent
      OldExtent *oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                        p->length, p->blob);
      old_extents->push_back(*oe);
      rm(p++);
      continue;
    }
    // deref head
    uint64_t keep = p->logical_end() - end;
    OldExtent *oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
                                      p->length - keep, p->blob);
    old_extents->push_back(*oe);

    add(end, p->blob_offset + p->length - keep, keep, p->blob);
    rm(p);
    break;
  }
}
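
// A sketch of the three punch_hole() cases against a hypothetical extent
// [0~100):
//
//   punch_hole(20, 30):  keep head [0~20), deref the middle [20~30) into
//                        old_extents, re-add [50~50) against the same blob
//   punch_hole(50, 60):  deref the tail [50~50), trim the extent to [0~50)
//   punch_hole(0, 100):  deref and remove the whole lextent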
BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
  CollectionRef &c,
  uint64_t logical_offset,
  uint64_t blob_offset, uint64_t length, BlobRef b,
  old_extent_map_t *old_extents)
{
  // We need to have completely initialized Blob to increment its ref counters.
  assert(b->get_blob().get_logical_length() != 0);

  // Do get_ref prior to punch_hole to prevent from putting reused blob into
  // old_extents list if we overwrite the blob totally
  // This might happen during WAL overwrite.
  b->get_ref(onode->c, blob_offset, length);

  if (old_extents) {
    punch_hole(c, logical_offset, length, old_extents);
  }

  Extent *le = new Extent(logical_offset, blob_offset, length, b);
  extent_map.insert(*le);
  if (spans_shard(logical_offset, length)) {
    request_reshard(logical_offset, logical_offset + length);
  }
  return le;
}
BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
  BlobRef lb,
  uint32_t blob_offset,
  uint32_t pos)
{
  auto cct = onode->c->store->cct; //used by dout

  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
           << dendl;
  BlobRef rb = onode->c->new_blob();
  lb->split(onode->c, blob_offset, rb.get());

  for (auto ep = seek_lextent(pos);
       ep != extent_map.end() && ep->logical_offset < end_pos;
       ++ep) {
    if (ep->blob != lb) {
      continue;
    }
    if (ep->logical_offset < pos) {
      // split extent
      size_t left = pos - ep->logical_offset;
      Extent *ne = new Extent(pos, 0, ep->length - left, rb);
      extent_map.insert(*ne);
      ep->length = left;
      dout(30) << __func__ << "  split " << *ep << dendl;
      dout(30) << __func__ << "     to " << *ne << dendl;
    } else {
      // switch blob
      assert(ep->blob_offset >= blob_offset);

      ep->blob = rb;
      ep->blob_offset -= blob_offset;
      dout(30) << __func__ << "  adjusted " << *ep << dendl;
    }
  }
  return rb;
}
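
// For illustration (hypothetical numbers): an extent [100~100) backed by
// blob lb, cut at pos 150, i.e. split_blob(lb, 50, 150).  lb is split at
// blob offset 50; the extent becomes [100~50) on lb plus a new extent
// [150~50) at blob offset 0 on the new right-hand blob rb.  Extents that
// lie entirely to the right of the cut simply switch to rb and have their
// blob_offset reduced by 50.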
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "

void BlueStore::Onode::flush()
{
  if (flushing_count.load()) {
    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
    std::unique_lock<std::mutex> l(flush_lock);
    while (flushing_count.load()) {
      flush_cond.wait(l);
    }
  }
  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}
// =======================================================
// WriteContext

/// Checks for writes to the same pextent within a blob
bool BlueStore::WriteContext::has_conflict(
  BlobRef b,
  uint64_t loffs,
  uint64_t loffs_end,
  uint64_t min_alloc_size)
{
  assert((loffs % min_alloc_size) == 0);
  assert((loffs_end % min_alloc_size) == 0);
  for (auto w : writes) {
    if (b == w.b) {
      auto loffs2 = P2ALIGN(w.logical_offset, min_alloc_size);
      auto loffs2_end = ROUND_UP_TO(w.logical_offset + w.length0, min_alloc_size);
      if ((loffs <= loffs2 && loffs_end > loffs2) ||
          (loffs >= loffs2 && loffs < loffs2_end)) {
        return true;
      }
    }
  }
  return false;
}
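
// The conflict test works on min_alloc_size-aligned envelopes.  A sketch
// with a hypothetical min_alloc_size of 0x1000:
//
//   w.logical_offset = 0x1800, w.length0 = 0x400
//   loffs2     = P2ALIGN(0x1800, 0x1000)              = 0x1000
//   loffs2_end = ROUND_UP_TO(0x1800 + 0x400, 0x1000)  = 0x2000
//
// so any new aligned range on the same blob intersecting [0x1000, 0x2000)
// is reported as a conflict.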
// =======================================================
// DeferredBatch

#undef dout_prefix
#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "

void BlueStore::DeferredBatch::prepare_write(
  CephContext *cct,
  uint64_t seq, uint64_t offset, uint64_t length,
  bufferlist::const_iterator& blp)
{
  _discard(cct, offset, length);
  auto i = iomap.insert(make_pair(offset, deferred_io()));
  assert(i.second);  // this should be a new insertion
  i.first->second.seq = seq;
  blp.copy(length, i.first->second.bl);
  dout(20) << __func__ << " seq " << seq
           << " 0x" << std::hex << offset << "~" << length
           << " crc " << i.first->second.bl.crc32c(-1)
           << std::dec << dendl;
  seq_bytes[seq] += length;
#ifdef DEBUG_DEFERRED
  _audit(cct);
#endif
}
void BlueStore::DeferredBatch::_discard(
  CephContext *cct, uint64_t offset, uint64_t length)
{
  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
                   << std::dec << dendl;
  auto p = iomap.lower_bound(offset);
  if (p != iomap.begin()) {
    --p;
    auto end = p->first + p->second.bl.length();
    if (end > offset) {
      bufferlist head;
      head.substr_of(p->second.bl, 0, offset - p->first);
      dout(20) << __func__ << "  keep head " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " -> 0x" << head.length() << std::dec << dendl;
      auto i = seq_bytes.find(p->second.seq);
      if (end > offset + length) {
        bufferlist tail;
        tail.substr_of(p->second.bl, offset + length - p->first,
                       end - (offset + length));
        dout(20) << __func__ << "  keep tail " << p->second.seq
                 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
                 << " -> 0x" << tail.length() << std::dec << dendl;
        auto &n = iomap[offset + length];
        n.bl.swap(tail);
        n.seq = p->second.seq;
        i->second -= length;
      } else {
        i->second -= end - offset;
      }
      p->second.bl.swap(head);
    }
    ++p;
  }
  while (p != iomap.end()) {
    if (p->first >= offset + length) {
      break;
    }
    auto i = seq_bytes.find(p->second.seq);
    auto end = p->first + p->second.bl.length();
    if (end > offset + length) {
      unsigned drop_front = offset + length - p->first;
      unsigned keep_tail = end - (offset + length);
      dout(20) << __func__ << "  truncate front " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
               << " to 0x" << (offset + length) << "~" << keep_tail
               << std::dec << dendl;
      auto &s = iomap[offset + length];
      s.seq = p->second.seq;
      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
      i->second -= drop_front;
    } else {
      dout(20) << __func__ << "  drop " << p->second.seq
               << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
               << std::dec << dendl;
      i->second -= p->second.bl.length();
    }
    p = iomap.erase(p);
  }
}
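
// A sketch of _discard() trimming (hypothetical numbers): given a queued
// io of 0x3000 bytes at offset 0x0, _discard(0x1000, 0x1000) leaves
//
//   head: bl[0x0~0x1000)    kept in place under key 0x0
//   tail: bl[0x2000~0x1000) re-inserted under key 0x2000 with the same seq
//
// and the seq_bytes total for that seq drops by the 0x1000 bytes cut out.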
void BlueStore::DeferredBatch::_audit(CephContext *cct)
{
  map<uint64_t,int> sb;
  for (auto p : seq_bytes) {
    sb[p.first] = 0;  // make sure we have the same set of keys
  }
  uint64_t pos = 0;
  for (auto& p : iomap) {
    assert(p.first >= pos);
    sb[p.second.seq] += p.second.bl.length();
    pos = p.first + p.second.bl.length();
  }
  assert(sb == seq_bytes);
}
// =======================================================
// Collection

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "

BlueStore::Collection::Collection(BlueStore *ns, Cache *c, coll_t cid)
  : store(ns),
    cache(c),
    cid(cid),
    lock("BlueStore::Collection::lock", true, false),
    exists(true),
    onode_map(c)
{
}
void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
{
  assert(!b->shared_blob);
  const bluestore_blob_t& blob = b->get_blob();
  if (!blob.is_shared()) {
    b->shared_blob = new SharedBlob(this);
    return;
  }

  b->shared_blob = shared_blob_set.lookup(sbid);
  if (b->shared_blob) {
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " had " << *b->shared_blob << dendl;
  } else {
    b->shared_blob = new SharedBlob(sbid, this);
    shared_blob_set.add(this, b->shared_blob.get());
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " opened " << *b->shared_blob
                          << dendl;
  }
}
void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
{
  if (!sb->is_loaded()) {
    bufferlist v;
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
    if (r < 0) {
      lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
                        << std::dec << " not found at key "
                        << pretty_binary_string(key) << dendl;
      assert(0 == "uh oh, missing shared_blob");
    }

    sb->loaded = true;
    sb->persistent = new bluestore_shared_blob_t(sbid);
    bufferlist::iterator p = v.begin();
    ::decode(*(sb->persistent), p);
    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
                          << std::dec << " loaded shared_blob " << *sb << dendl;
  }
}
void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
{
  assert(!b->shared_blob->is_loaded());

  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
  bluestore_blob_t& blob = b->dirty_blob();

  // update blob
  blob.set_flag(bluestore_blob_t::FLAG_SHARED);
  blob.clear_flag(bluestore_blob_t::FLAG_MUTABLE);

  // update shared blob
  b->shared_blob->loaded = true;
  b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
  shared_blob_set.add(this, b->shared_blob.get());
  for (auto p : blob.get_extents()) {
    if (p.is_valid()) {
      b->shared_blob->get_ref(
        p.offset,
        p.length);
    }
  }
  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
}
BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create)
{
  assert(create ? lock.is_wlocked() : lock.is_locked());

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
                        << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  mempool::bluestore_meta_other::string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
                        << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
  ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  Onode *on;
  if (v.length() == 0) {
    assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
        !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    assert(r >= 0);
    on = new Onode(this, oid, key);
    on->exists = true;
    bufferptr::iterator p = v.front().begin();
    on->onode.decode(p);

    // initialize extent_map
    on->extent_map.decode_spanning_blobs(p);
    if (on->onode.extent_map_shards.empty()) {
      denc(on->extent_map.inline_bl, p);
      on->extent_map.decode_some(on->extent_map.inline_bl);
    } else {
      on->extent_map.init_shards(false, false);
    }
  }
  o.reset(on);
  return onode_map.add(oid, o);
}
void BlueStore::Collection::split_cache(
  Collection *dest)
{
  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;

  // lock (one or both) cache shards
  std::lock(cache->lock, dest->cache->lock);
  std::lock_guard<std::recursive_mutex> l(cache->lock, std::adopt_lock);
  std::lock_guard<std::recursive_mutex> l2(dest->cache->lock, std::adopt_lock);

  int destbits = dest->cnode.bits;
  spg_t destpg;
  bool is_pg = dest->cid.is_pg(&destpg);
  assert(is_pg);

  auto p = onode_map.onode_map.begin();
  while (p != onode_map.onode_map.end()) {
    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
      // onode does not belong to this child
      ++p;
    } else {
      OnodeRef o = p->second;
      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                            << dendl;

      cache->_rm_onode(p->second);
      p = onode_map.onode_map.erase(p);

      o->c = dest;
      dest->cache->_add_onode(o, 1);
      dest->onode_map.onode_map[o->oid] = o;
      dest->onode_map.cache = dest->cache;

      // move over shared blobs and buffers.  cover shared blobs from
      // both extent map and spanning blob map (the full extent map
      // may not be faulted in)
      vector<SharedBlob*> sbvec;
      for (auto& e : o->extent_map.extent_map) {
        sbvec.push_back(e.blob->shared_blob.get());
      }
      for (auto& b : o->extent_map.spanning_blob_map) {
        sbvec.push_back(b.second->shared_blob.get());
      }
      for (auto sb : sbvec) {
        if (sb->coll == dest) {
          ldout(store->cct, 20) << __func__ << " already moved " << *sb
                                << dendl;
          continue;
        }
        ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
        sb->coll = dest;
        if (dest->cache != cache) {
          if (sb->get_sbid()) {
            ldout(store->cct, 20) << __func__ << " moving registration " << *sb << dendl;
            shared_blob_set.remove(sb);
            dest->shared_blob_set.add(dest, sb);
          }
          for (auto& i : sb->bc.buffer_map) {
            if (!i.second->is_writing()) {
              ldout(store->cct, 20) << __func__ << "   moving " << *i.second
                                    << dendl;
              dest->cache->_move_buffer(cache, i.second.get());
            }
          }
        }
      }
    }
  }
}
void BlueStore::Collection::trim_cache()
{
  // see if mempool stats have updated
  uint64_t total_bytes;
  uint64_t total_onodes;
  size_t seq;
  store->get_mempool_stats(&seq, &total_bytes, &total_onodes);
  if (seq == cache->last_trim_seq) {
    ldout(store->cct, 30) << __func__ << " no new mempool stats; nothing to do"
                          << dendl;
    return;
  }
  cache->last_trim_seq = seq;

  if (total_onodes < 2) {
    total_onodes = 2;
  }
  float bytes_per_onode = (float)total_bytes / (float)total_onodes;
  size_t num_shards = store->cache_shards.size();
  uint64_t shard_target = store->cct->_conf->bluestore_cache_size / num_shards;
  ldout(store->cct, 30) << __func__
                        << " total meta bytes " << total_bytes
                        << ", total onodes " << total_onodes
                        << ", bytes_per_onode " << bytes_per_onode
                        << dendl;
  cache->trim(shard_target, store->cct->_conf->bluestore_cache_meta_ratio,
              bytes_per_onode);

  store->_update_cache_logger();
}
// =======================================================
// MempoolThread

void *BlueStore::MempoolThread::entry()
{
  Mutex::Locker l(lock);
  while (!stop) {
    store->mempool_bytes = mempool::bluestore_meta_other::allocated_bytes() +
      mempool::bluestore_meta_onode::allocated_bytes();
    store->mempool_onodes = mempool::bluestore_meta_onode::allocated_items();
    ++store->mempool_seq;
    utime_t wait;
    wait += store->cct->_conf->bluestore_cache_trim_interval;
    cond.WaitInterval(lock, wait);
  }
  stop = false;
  return NULL;
}
// =======================================================
// BlueStore

#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << path << ") "

static void aio_cb(void *priv, void *priv2)
{
  BlueStore *store = static_cast<BlueStore*>(priv);
  BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
  c->aio_finish(store);
}
BlueStore::BlueStore(CephContext *cct, const string& path)
  : ObjectStore(cct, path),
    throttle_bytes(cct, "bluestore_throttle_bytes",
                   cct->_conf->bluestore_throttle_bytes),
    throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                            cct->_conf->bluestore_throttle_bytes +
                            cct->_conf->bluestore_throttle_deferred_bytes),
    kv_sync_thread(this),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf->add_observer(this);
  set_cache_shards(1);

  if (cct->_conf->bluestore_shard_finishers) {
    m_finisher_num = cct->_conf->osd_op_num_shards;
  }

  for (int i = 0; i < m_finisher_num; ++i) {
    ostringstream oss;
    oss << "finisher-" << i;
    Finisher *f = new Finisher(cct, oss.str(), "finisher");
    finishers.push_back(f);
  }
}
BlueStore::BlueStore(CephContext *cct,
  const string& path,
  uint64_t _min_alloc_size)
  : ObjectStore(cct, path),
    throttle_bytes(cct, "bluestore_throttle_bytes",
                   cct->_conf->bluestore_throttle_bytes),
    throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
                            cct->_conf->bluestore_throttle_bytes +
                            cct->_conf->bluestore_throttle_deferred_bytes),
    kv_sync_thread(this),
    min_alloc_size(_min_alloc_size),
    min_alloc_size_order(ctz(_min_alloc_size)),
    mempool_thread(this)
{
  _init_logger();
  cct->_conf->add_observer(this);
  set_cache_shards(1);

  if (cct->_conf->bluestore_shard_finishers) {
    m_finisher_num = cct->_conf->osd_op_num_shards;
  }

  for (int i = 0; i < m_finisher_num; ++i) {
    ostringstream oss;
    oss << "finisher-" << i;
    Finisher *f = new Finisher(cct, oss.str(), "finisher");
    finishers.push_back(f);
  }
}
BlueStore::~BlueStore()
{
  for (auto f : finishers) {
    delete f;
  }
  finishers.clear();

  cct->_conf->remove_observer(this);
  _shutdown_logger();
  assert(!mounted);
  assert(db == NULL);
  assert(bluefs == NULL);
  assert(fsid_fd < 0);
  assert(path_fd < 0);
  for (auto i : cache_shards) {
    delete i;
  }
  cache_shards.clear();
}
const char **BlueStore::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "bluestore_csum_type",
    "bluestore_compression_mode",
    "bluestore_compression_algorithm",
    "bluestore_compression_min_blob_size",
    "bluestore_compression_min_blob_size_ssd",
    "bluestore_compression_min_blob_size_hdd",
    "bluestore_compression_max_blob_size",
    "bluestore_compression_max_blob_size_ssd",
    "bluestore_compression_max_blob_size_hdd",
    "bluestore_max_alloc_size",
    "bluestore_prefer_deferred_size",
    "bluestore_deferred_batch_ops",
    "bluestore_deferred_batch_ops_hdd",
    "bluestore_deferred_batch_ops_ssd",
    "bluestore_throttle_bytes",
    "bluestore_throttle_deferred_bytes",
    "bluestore_throttle_cost_per_io_hdd",
    "bluestore_throttle_cost_per_io_ssd",
    "bluestore_throttle_cost_per_io",
    "bluestore_max_blob_size",
    "bluestore_max_blob_size_ssd",
    "bluestore_max_blob_size_hdd",
    NULL
  };
  return KEYS;
}
void BlueStore::handle_conf_change(const struct md_config_t *conf,
                                   const std::set<std::string> &changed)
{
  if (changed.count("bluestore_csum_type")) {
    _set_csum();
  }
  if (changed.count("bluestore_compression_mode") ||
      changed.count("bluestore_compression_algorithm") ||
      changed.count("bluestore_compression_min_blob_size") ||
      changed.count("bluestore_compression_max_blob_size")) {
    if (bdev) {
      _set_compression();
    }
  }
  if (changed.count("bluestore_max_blob_size") ||
      changed.count("bluestore_max_blob_size_ssd") ||
      changed.count("bluestore_max_blob_size_hdd")) {
    if (bdev) {
      // only after startup
      _set_blob_size();
    }
  }
  if (changed.count("bluestore_prefer_deferred_size") ||
      changed.count("bluestore_max_alloc_size") ||
      changed.count("bluestore_deferred_batch_ops") ||
      changed.count("bluestore_deferred_batch_ops_hdd") ||
      changed.count("bluestore_deferred_batch_ops_ssd")) {
    if (bdev) {
      // only after startup
      _set_alloc_sizes();
    }
  }
  if (changed.count("bluestore_throttle_cost_per_io") ||
      changed.count("bluestore_throttle_cost_per_io_hdd") ||
      changed.count("bluestore_throttle_cost_per_io_ssd")) {
    if (bdev) {
      _set_throttle_params();
    }
  }
  if (changed.count("bluestore_throttle_bytes")) {
    throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
    throttle_deferred_bytes.reset_max(
      conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
  }
  if (changed.count("bluestore_throttle_deferred_bytes")) {
    throttle_deferred_bytes.reset_max(
      conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
  }
}
void BlueStore::_set_compression()
{
  if (cct->_conf->bluestore_compression_min_blob_size) {
    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
    } else {
      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
    }
  }

  if (cct->_conf->bluestore_compression_max_blob_size) {
    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
    } else {
      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
    }
  }

  auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
  if (m) {
    comp_mode = *m;
  } else {
    derr << __func__ << " unrecognized value '"
         << cct->_conf->bluestore_compression_mode
         << "' for bluestore_compression_mode, reverting to 'none'"
         << dendl;
    comp_mode = Compressor::COMP_NONE;
  }

  compressor = nullptr;

  auto& alg_name = cct->_conf->bluestore_compression_algorithm;
  if (!alg_name.empty()) {
    compressor = Compressor::create(cct, alg_name);
    if (!compressor) {
      derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
           << dendl;
    }
  }

  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
           << dendl;
}
void BlueStore::_set_csum()
{
  csum_type = Checksummer::CSUM_NONE;
  int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
  if (t > Checksummer::CSUM_NONE)
    csum_type = t;

  dout(10) << __func__ << " csum_type "
           << Checksummer::get_csum_type_string(csum_type)
           << dendl;
}
void BlueStore::_set_throttle_params()
{
  if (cct->_conf->bluestore_throttle_cost_per_io) {
    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
    } else {
      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
    }
  }

  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
           << dendl;
}
void BlueStore::_set_blob_size()
{
  if (cct->_conf->bluestore_max_blob_size) {
    max_blob_size = cct->_conf->bluestore_max_blob_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
    } else {
      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
    }
  }
  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
           << std::dec << dendl;
}
void BlueStore::_init_logger()
{
  PerfCountersBuilder b(cct, "bluestore",
                        l_bluestore_first, l_bluestore_last);
  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
                 "Average kv_thread flush latency",
                 "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
                 "Average kv_thread commit latency");
  b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
                 "Average kv_thread sync latency",
                 "k_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
                 "Average prepare state latency");
  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
                 "Average aio_wait state latency",
                 "io_l", PerfCountersBuilder::PRIO_INTERESTING);
  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
                 "Average io_done state latency");
  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
                 "Average kv_queued state latency");
  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
                 "Average kv_committing state latency");
  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
                 "Average kv_done state latency");
  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
                 "Average deferred_queued state latency");
  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
                 "Average aio_wait state latency");
  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
                 "Average cleanup state latency");
  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
                 "Average finishing state latency");
  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
                 "Average done state latency");
  b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
                 "Average submit throttle latency",
                 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
                 "Average submit latency",
                 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
                 "Average commit latency",
                 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_lat, "read_lat",
                 "Average read latency",
                 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
                 "Average read onode metadata latency");
  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
                 "Average read latency");
  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
                 "Average compress latency");
  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
                 "Average decompress latency");
  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
                 "Average checksum latency");
  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
                    "Sum for beneficial compress ops");
  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
                    "Sum for compress ops rejected due to low net gain of space");
  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
                    "Sum for write-op padded bytes");
  b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
                    "Sum for deferred write op");
  b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
                    "Sum for deferred write bytes", "def");
  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
                    "Sum for write penalty read ops");
  b.add_u64(l_bluestore_allocated, "bluestore_allocated",
            "Sum for allocated bytes");
  b.add_u64(l_bluestore_stored, "bluestore_stored",
            "Sum for stored bytes");
  b.add_u64(l_bluestore_compressed, "bluestore_compressed",
            "Sum for stored compressed bytes");
  b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
            "Sum for bytes allocated for compressed data");
  b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
            "Sum for original bytes that were compressed");

  b.add_u64(l_bluestore_onodes, "bluestore_onodes",
            "Number of onodes in cache");
  b.add_u64_counter(l_bluestore_onode_hits, "bluestore_onode_hits",
                    "Sum for onode-lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_misses, "bluestore_onode_misses",
                    "Sum for onode-lookups missed in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_hits, "bluestore_onode_shard_hits",
                    "Sum for onode-shard lookups hit in the cache");
  b.add_u64_counter(l_bluestore_onode_shard_misses,
                    "bluestore_onode_shard_misses",
                    "Sum for onode-shard lookups missed in the cache");
  b.add_u64(l_bluestore_extents, "bluestore_extents",
            "Number of extents in cache");
  b.add_u64(l_bluestore_blobs, "bluestore_blobs",
            "Number of blobs in cache");
  b.add_u64(l_bluestore_buffers, "bluestore_buffers",
            "Number of buffers in cache");
  b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
            "Number of buffer bytes in cache");
  b.add_u64(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
            "Sum for bytes of read hit in the cache");
  b.add_u64(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
            "Sum for bytes of read missed in the cache");

  b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
                    "Large aligned writes into fresh blobs");
  b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
                    "Large aligned writes into fresh blobs (bytes)");
  b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
                    "Large aligned writes into fresh blobs (blobs)");
  b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
                    "Small writes into existing or sparse small blobs");
  b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
                    "Small writes into existing or sparse small blobs (bytes)");
  b.add_u64_counter(l_bluestore_write_small_unused,
                    "bluestore_write_small_unused",
                    "Small writes into unused portion of existing blob");
  b.add_u64_counter(l_bluestore_write_small_deferred,
                    "bluestore_write_small_deferred",
                    "Small overwrites using deferred");
  b.add_u64_counter(l_bluestore_write_small_pre_read,
                    "bluestore_write_small_pre_read",
                    "Small writes that required we read some data (possibly "
                    "cached) to fill out the block");
  b.add_u64_counter(l_bluestore_write_small_new, "bluestore_write_small_new",
                    "Small write into new (sparse) blob");

  b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
  b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
                    "Onode extent map reshard events");
  b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
                    "Sum for blob splitting due to resharding");
  b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
                    "Sum for extents that have been removed due to compression");
  b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
                    "Sum for extents that have been merged due to garbage "
                    "collection");
  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
int BlueStore::_reload_logger()
{
  struct store_statfs_t store_statfs;

  int r = statfs(&store_statfs);
  if (r >= 0) {
    logger->set(l_bluestore_allocated, store_statfs.allocated);
    logger->set(l_bluestore_stored, store_statfs.stored);
    logger->set(l_bluestore_compressed, store_statfs.compressed);
    logger->set(l_bluestore_compressed_allocated, store_statfs.compressed_allocated);
    logger->set(l_bluestore_compressed_original, store_statfs.compressed_original);
  }
  return r;
}
void BlueStore::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}
int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
                                     uuid_d *fsid)
{
  bluestore_bdev_label_t label;
  int r = _read_bdev_label(cct, path, &label);
  if (r < 0)
    return r;
  *fsid = label.osd_uuid;
  return 0;
}
int BlueStore::_open_path()
{
  assert(path_fd < 0);
  path_fd = ::open(path.c_str(), O_DIRECTORY);
  if (path_fd < 0) {
    int r = -errno;
    derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
         << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_path()
{
  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
  path_fd = -1;
}
int BlueStore::_write_bdev_label(string path, bluestore_bdev_label_t label)
{
  dout(10) << __func__ << " path " << path << " label " << label << dendl;
  bufferlist bl;
  ::encode(label, bl);
  uint32_t crc = bl.crc32c(-1);
  ::encode(crc, bl);
  assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
  z.zero();
  bl.append(std::move(z));

  int fd = ::open(path.c_str(), O_WRONLY);
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  int r = bl.write_fd(fd);
  if (r < 0) {
    derr << __func__ << " failed to write to " << path
         << ": " << cpp_strerror(r) << dendl;
  }
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  return r;
}
int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;
  int fd = ::open(path.c_str(), O_RDONLY);
  if (fd < 0) {
    fd = -errno;
    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
         << dendl;
    return fd;
  }
  bufferlist bl;
  int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (r < 0) {
    derr << __func__ << " failed to read from " << path
         << ": " << cpp_strerror(r) << dendl;
    return r;
  }

  uint32_t crc, expected_crc;
  bufferlist::iterator p = bl.begin();
  try {
    ::decode(*label, p);
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
    ::decode(expected_crc, p);
  }
  catch (buffer::error& e) {
    derr << __func__ << " unable to decode label at offset " << p.get_off()
         << dendl;
    return -EINVAL;
  }
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on label, expected " << expected_crc
         << " != actual " << crc << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
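
// For illustration, the label block layout implied by _write_bdev_label()
// and _read_bdev_label():
//
//   [ encoded bluestore_bdev_label_t | u32 crc32c | zero padding ]
//   |<------------ BDEV_LABEL_BLOCK_SIZE (4k) bytes ------------>|
//
// The crc is computed over the encoded label bytes only (offsets
// [0, p.get_off()) on the read side) and verified on every open.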
int BlueStore::_check_or_set_bdev_label(
  string path, uint64_t size, string desc, bool create)
{
  bluestore_bdev_label_t label;
  if (create) {
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    int r = _write_bdev_label(path, label);
    if (r < 0)
      return r;
  } else {
    int r = _read_bdev_label(cct, path, &label);
    if (r < 0)
      return r;
    if (label.osd_uuid != fsid) {
      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
           << " does not match our fsid " << fsid << dendl;
      return -EIO;
    }
  }
  return 0;
}
void BlueStore::_set_alloc_sizes(void)
{
  min_alloc_size_order = ctz(min_alloc_size);
  assert(min_alloc_size == 1u << min_alloc_size_order);

  max_alloc_size = cct->_conf->bluestore_max_alloc_size;

  if (cct->_conf->bluestore_prefer_deferred_size) {
    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
    } else {
      prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
    }
  }

  if (cct->_conf->bluestore_deferred_batch_ops) {
    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
  } else {
    assert(bdev);
    if (bdev->is_rotational()) {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
    } else {
      deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
    }
  }

  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
           << std::dec << " order " << min_alloc_size_order
           << " max_alloc_size 0x" << std::hex << max_alloc_size
           << " prefer_deferred_size 0x" << prefer_deferred_size
           << std::dec
           << " deferred_batch_ops " << deferred_batch_ops
           << dendl;
}
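
// min_alloc_size must be a power of two; ctz() recovers its order.  A
// sketch with a hypothetical 64 KiB min_alloc_size:
//
//   min_alloc_size       = 0x10000
//   min_alloc_size_order = ctz(0x10000) = 16
//   assert(0x10000 == 1u << 16);  // a non-power-of-two value trips this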
int BlueStore::_open_bdev(bool create)
{
  assert(bdev == NULL);
  string p = path + "/block";
  bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
  int r = bdev->open(p);
  if (r < 0)
    goto fail;

  if (bdev->supported_bdev_label()) {
    r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
    if (r < 0)
      goto fail_close;
  }

  // initialize global block parameters
  block_size = bdev->get_block_size();
  block_mask = ~(block_size - 1);
  block_size_order = ctz(block_size);
  assert(block_size == 1u << block_size_order);
  return 0;

 fail_close:
  bdev->close();
 fail:
  delete bdev;
  bdev = NULL;
  return r;
}

void BlueStore::_close_bdev()
{
  assert(bdev);
  bdev->close();
  delete bdev;
  bdev = NULL;
}
int BlueStore::_open_fm(bool create)
{
  assert(fm == NULL);
  fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);

  if (create) {
    // initialize freespace
    dout(20) << __func__ << " initializing freespace" << dendl;
    KeyValueDB::Transaction t = db->get_transaction();
    {
      bufferlist bl;
      bl.append(freelist_type);
      t->set(PREFIX_SUPER, "freelist_type", bl);
    }
    fm->create(bdev->get_size(), t);

    // allocate superblock reserved space.  note that we do not mark
    // bluefs space as allocated in the freelist; we instead rely on
    // bluefs_extents.
    fm->allocate(0, SUPER_RESERVED, t);

    uint64_t reserved = 0;
    if (cct->_conf->bluestore_bluefs) {
      assert(bluefs_extents.num_intervals() == 1);
      interval_set<uint64_t>::iterator p = bluefs_extents.begin();
      reserved = p.get_start() + p.get_len();
      dout(20) << __func__ << " reserved 0x" << std::hex << reserved << std::dec
               << " for bluefs" << dendl;
      bufferlist bl;
      ::encode(bluefs_extents, bl);
      t->set(PREFIX_SUPER, "bluefs_extents", bl);
      dout(20) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
               << std::dec << dendl;
    } else {
      reserved = SUPER_RESERVED;
    }

    if (cct->_conf->bluestore_debug_prefill > 0) {
      uint64_t end = bdev->get_size() - reserved;
      dout(1) << __func__ << " pre-fragmenting freespace, using "
              << cct->_conf->bluestore_debug_prefill << " with max free extent "
              << cct->_conf->bluestore_debug_prefragment_max << dendl;
      uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
      float r = cct->_conf->bluestore_debug_prefill;
      r /= 1.0 - r;
      bool stop = false;

      while (!stop && start < end) {
        uint64_t l = (rand() % max_b + 1) * min_alloc_size;
        if (start + l > end) {
          l = end - start;
          l = P2ALIGN(l, min_alloc_size);
        }
        assert(start + l <= end);

        uint64_t u = 1 + (uint64_t)(r * (double)l);
        u = P2ROUNDUP(u, min_alloc_size);
        if (start + l + u > end) {
          u = end - (start + l);
          // trim to align so we don't overflow again
          u = P2ALIGN(u, min_alloc_size);
          stop = true;
        }
        assert(start + l + u <= end);

        dout(20) << "  free 0x" << std::hex << start << "~" << l
                 << " use 0x" << u << std::dec << dendl;

        if (u == 0) {
          // break if u has been trimmed to nothing
          break;
        }

        fm->allocate(start + l, u, t);
        start += l + u;
      }
    }
    db->submit_transaction_sync(t);
  }

  int r = fm->init();
  if (r < 0) {
    derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
    delete fm;
    fm = NULL;
    return r;
  }
  return 0;
}

void BlueStore::_close_fm()
{
  dout(10) << __func__ << dendl;
  assert(fm);
  fm->shutdown();
  delete fm;
  fm = NULL;
}
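
// A sketch of the debug prefill arithmetic above (hypothetical value):
// with bluestore_debug_prefill = 0.2, r becomes 0.2 / (1 - 0.2) = 0.25,
// so each free run of length l is followed by roughly u = 0.25 * l
// allocated bytes, i.e. u / (l + u) = 0.2 of the scanned space ends up
// marked used.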
int BlueStore::_open_alloc()
{
  assert(alloc == NULL);
  assert(bdev->get_size());
  alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
                            bdev->get_size(),
                            min_alloc_size);
  if (!alloc) {
    lderr(cct) << __func__ << " Allocator::unknown alloc type "
               << cct->_conf->bluestore_allocator
               << dendl;
    return -EINVAL;
  }

  uint64_t num = 0, bytes = 0;

  dout(1) << __func__ << " opening allocation metadata" << dendl;
  // initialize from freelist
  fm->enumerate_reset();
  uint64_t offset, length;
  while (fm->enumerate_next(&offset, &length)) {
    alloc->init_add_free(offset, length);
    ++num;
    bytes += length;
  }
  dout(1) << __func__ << " loaded " << pretty_si_t(bytes)
          << " in " << num << " extents"
          << dendl;

  // also mark bluefs space as allocated
  for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
    alloc->init_rm_free(e.get_start(), e.get_len());
  }
  dout(10) << __func__ << " marked bluefs_extents 0x" << std::hex
           << bluefs_extents << std::dec << " as allocated" << dendl;

  return 0;
}

void BlueStore::_close_alloc()
{
  assert(alloc);
  alloc->shutdown();
  delete alloc;
  alloc = NULL;
}
int BlueStore::_open_fsid(bool create)
{
  assert(fsid_fd < 0);
  int flags = O_RDWR;
  if (create)
    flags |= O_CREAT;
  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
  if (fsid_fd < 0) {
    int err = -errno;
    derr << __func__ << " " << cpp_strerror(err) << dendl;
    return err;
  }
  return 0;
}
int BlueStore::_read_fsid(uuid_d *uuid)
{
  char fsid_str[40];
  memset(fsid_str, 0, sizeof(fsid_str));
  int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
  if (ret < 0) {
    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
    return ret;
  }
  if (ret > 36)
    fsid_str[36] = 0;
  else
    fsid_str[ret] = 0;
  if (!uuid->parse(fsid_str)) {
    derr << __func__ << " unparsable uuid " << fsid_str << dendl;
    return -EINVAL;
  }
  return 0;
}
int BlueStore::_write_fsid()
{
  int r = ::ftruncate(fsid_fd, 0);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  string str = stringify(fsid) + "\n";
  r = safe_write(fsid_fd, str.c_str(), str.length());
  if (r < 0) {
    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  r = ::fsync(fsid_fd);
  if (r < 0) {
    r = -errno;
    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}

void BlueStore::_close_fsid()
{
  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
  fsid_fd = -1;
}
int BlueStore::_lock_fsid()
{
  struct flock l;
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  int r = ::fcntl(fsid_fd, F_SETLK, &l);
  if (r < 0) {
    int err = errno;
    derr << __func__ << " failed to lock " << path << "/fsid"
         << " (is another ceph-osd still running?)"
         << cpp_strerror(err) << dendl;
    return -err;
  }
  return 0;
}
bool BlueStore::test_mount_in_use()
{
  // most error conditions mean the mount is not in use (e.g., because
  // it doesn't exist).  only if we fail to lock do we conclude it is
  // in use.
  bool ret = false;
  int r = _open_path();
  if (r < 0)
    return false;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;
  r = _lock_fsid();
  if (r < 0)
    ret = true; // if we can't lock, it is in use
  _close_fsid();
 out_path:
  _close_path();
  return ret;
}
4141 int BlueStore::_open_db(bool create
)
4145 string fn
= path
+ "/db";
4148 ceph::shared_ptr
<Int64ArrayMergeOperator
> merge_op(new Int64ArrayMergeOperator
);
4152 kv_backend
= cct
->_conf
->bluestore_kvbackend
;
4154 r
= read_meta("kv_backend", &kv_backend
);
4156 derr
<< __func__
<< " unable to read 'kv_backend' meta" << dendl
;
4160 dout(10) << __func__
<< " kv_backend = " << kv_backend
<< dendl
;
4164 do_bluefs
= cct
->_conf
->bluestore_bluefs
;
4167 r
= read_meta("bluefs", &s
);
4169 derr
<< __func__
<< " unable to read 'bluefs' meta" << dendl
;
4174 } else if (s
== "0") {
4177 derr
<< __func__
<< " bluefs = " << s
<< " : not 0 or 1, aborting"
4182 dout(10) << __func__
<< " do_bluefs = " << do_bluefs
<< dendl
;
4184 rocksdb::Env
*env
= NULL
;
4186 dout(10) << __func__
<< " initializing bluefs" << dendl
;
4187 if (kv_backend
!= "rocksdb") {
4188 derr
<< " backend must be rocksdb to use bluefs" << dendl
;
4191 bluefs
= new BlueFS(cct
);
4196 bfn
= path
+ "/block.db";
4197 if (::stat(bfn
.c_str(), &st
) == 0) {
4198 r
= bluefs
->add_block_device(BlueFS::BDEV_DB
, bfn
);
4200 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
4201 << cpp_strerror(r
) << dendl
;
4205 if (bluefs
->bdev_support_label(BlueFS::BDEV_DB
)) {
4206 r
= _check_or_set_bdev_label(
4208 bluefs
->get_block_device_size(BlueFS::BDEV_DB
),
4209 "bluefs db", create
);
4212 << " check block device(" << bfn
<< ") label returned: "
4213 << cpp_strerror(r
) << dendl
;
4218 bluefs
->add_block_extent(
4221 bluefs
->get_block_device_size(BlueFS::BDEV_DB
) - SUPER_RESERVED
);
4223 bluefs_shared_bdev
= BlueFS::BDEV_SLOW
;
4224 bluefs_single_shared_device
= false;
4226 bluefs_shared_bdev
= BlueFS::BDEV_DB
;
4230 bfn
= path
+ "/block";
4231 r
= bluefs
->add_block_device(bluefs_shared_bdev
, bfn
);
4233 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
4234 << cpp_strerror(r
) << dendl
;
4238 // note: we always leave the first SUPER_RESERVED (8k) of the device unused
4240 bdev
->get_size() * (cct
->_conf
->bluestore_bluefs_min_ratio
+
4241 cct
->_conf
->bluestore_bluefs_gift_ratio
);
4242 initial
= MAX(initial
, cct
->_conf
->bluestore_bluefs_min
);
4243 // align to bluefs's alloc_size
4244 initial
= P2ROUNDUP(initial
, cct
->_conf
->bluefs_alloc_size
);
4245 initial
+= cct
->_conf
->bluefs_alloc_size
- SUPER_RESERVED
;
4246 bluefs
->add_block_extent(bluefs_shared_bdev
, SUPER_RESERVED
, initial
);
4247 bluefs_extents
.insert(SUPER_RESERVED
, initial
);
4250 bfn
= path
+ "/block.wal";
4251 if (::stat(bfn
.c_str(), &st
) == 0) {
4252 r
= bluefs
->add_block_device(BlueFS::BDEV_WAL
, bfn
);
4254 derr
<< __func__
<< " add block device(" << bfn
<< ") returned: "
4255 << cpp_strerror(r
) << dendl
;
4259 if (bluefs
->bdev_support_label(BlueFS::BDEV_WAL
)) {
4260 r
= _check_or_set_bdev_label(
4262 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
),
4263 "bluefs wal", create
);
4265 derr
<< __func__
<< " check block device(" << bfn
4266 << ") label returned: " << cpp_strerror(r
) << dendl
;
4272 bluefs
->add_block_extent(
4273 BlueFS::BDEV_WAL
, BDEV_LABEL_BLOCK_SIZE
,
4274 bluefs
->get_block_device_size(BlueFS::BDEV_WAL
) -
4275 BDEV_LABEL_BLOCK_SIZE
);
4277 cct
->_conf
->set_val("rocksdb_separate_wal_dir", "true");
4278 bluefs_single_shared_device
= false;
4280 cct
->_conf
->set_val("rocksdb_separate_wal_dir", "false");
4286 r
= bluefs
->mount();
4288 derr
<< __func__
<< " failed bluefs mount: " << cpp_strerror(r
) << dendl
;
4291 if (cct
->_conf
->bluestore_bluefs_env_mirror
) {
4292 rocksdb::Env
*a
= new BlueRocksEnv(bluefs
);
4293 rocksdb::Env
*b
= rocksdb::Env::Default();
4295 string cmd
= "rm -rf " + path
+ "/db " +
4296 path
+ "/db.slow " +
4298 int r
= system(cmd
.c_str());
4301 env
= new rocksdb::EnvMirror(b
, a
, false, true);
4303 env
= new BlueRocksEnv(bluefs
);
4305 // simplify the dir names, too, as "seen" by rocksdb
4309 if (bluefs_shared_bdev
== BlueFS::BDEV_SLOW
) {
4310 // we have both block.db and block; tell rocksdb!
4311 // note: the second (last) size value doesn't really matter
4312 ostringstream db_paths
;
4313 uint64_t db_size
= bluefs
->get_block_device_size(BlueFS::BDEV_DB
);
4314 uint64_t slow_size
= bluefs
->get_block_device_size(BlueFS::BDEV_SLOW
);
4315 db_paths
<< fn
<< ","
4316 << (uint64_t)(db_size
* 95 / 100) << " "
4317 << fn
+ ".slow" << ","
4318 << (uint64_t)(slow_size
* 95 / 100);
4319 cct
->_conf
->set_val("rocksdb_db_paths", db_paths
.str(), false);
4320 dout(10) << __func__
<< " set rocksdb_db_paths to "
4321 << cct
->_conf
->get_val
<std::string
>("rocksdb_db_paths") << dendl
;
4326 if (cct
->_conf
->rocksdb_separate_wal_dir
)
4327 env
->CreateDir(fn
+ ".wal");
4328 if (cct
->_conf
->get_val
<std::string
>("rocksdb_db_paths").length())
4329 env
->CreateDir(fn
+ ".slow");
  } else if (create) {
    int r = ::mkdir(fn.c_str(), 0755);
    if (r < 0)
      r = -errno;
    if (r < 0 && r != -EEXIST) {
      derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    // wal_dir, too!
    if (cct->_conf->rocksdb_separate_wal_dir) {
      string walfn = path + "/db.wal";
      r = ::mkdir(walfn.c_str(), 0755);
      if (r < 0)
	r = -errno;
      if (r < 0 && r != -EEXIST) {
	derr << __func__ << " failed to create " << walfn
	     << ": " << cpp_strerror(r)
	     << dendl;
	return r;
      }
    }
  }

  db = KeyValueDB::create(cct,
			  kv_backend,
			  fn,
			  static_cast<void*>(env));
  if (!db) {
    derr << __func__ << " error creating db" << dendl;
    if (bluefs) {
      bluefs->umount();
      delete bluefs;
      bluefs = NULL;
    }
    // delete env manually here since we can't depend on db to do this
    // under this case
    delete env;
    env = NULL;
    return -EIO;
  }

  FreelistManager::setup_merge_operators(db);
  db->set_merge_operator(PREFIX_STAT, merge_op);

  if (kv_backend == "rocksdb")
    options = cct->_conf->bluestore_rocksdb_options;
  db->init(options);
  if (create)
    r = db->create_and_open(err);
  else
    r = db->open(err);
  if (r) {
    derr << __func__ << " erroring opening db: " << err.str() << dendl;
    _close_db();
    return -EIO;
  }
  dout(1) << __func__ << " opened " << kv_backend
	  << " path " << fn << " options " << options << dendl;
  return 0;

 free_bluefs:
  assert(bluefs);
  delete bluefs;
  bluefs = NULL;
  return r;
}
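// Tear down the KeyValueDB and the rocksdb Env/BlueFS instances that
// _open_db() created; db owns none of them, so each piece is released
// explicitly.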
void BlueStore::_close_db()
{
  assert(db);
  delete db;
  db = NULL;
  if (bluefs) {
    bluefs->umount();
    delete bluefs;
    bluefs = NULL;
  }
  delete env;
  env = NULL;
}
int BlueStore::_reconcile_bluefs_freespace()
{
  dout(10) << __func__ << dendl;
  interval_set<uint64_t> bset;
  int r = bluefs->get_block_extents(bluefs_shared_bdev, &bset);
  assert(r == 0);
  if (bset == bluefs_extents) {
    dout(10) << __func__ << " we agree bluefs has 0x" << std::hex << bset
	     << std::dec << dendl;
    return 0;
  }
  dout(10) << __func__ << " bluefs says 0x" << std::hex << bset << std::dec
	   << dendl;
  dout(10) << __func__ << " super says 0x" << std::hex << bluefs_extents
	   << std::dec << dendl;

  interval_set<uint64_t> overlap;
  overlap.intersection_of(bset, bluefs_extents);

  bset.subtract(overlap);
  if (!bset.empty()) {
    derr << __func__ << " bluefs extra 0x" << std::hex << bset << std::dec
	 << dendl;
    return -EIO;
  }

  interval_set<uint64_t> super_extra;
  super_extra = bluefs_extents;
  super_extra.subtract(overlap);
  if (!super_extra.empty()) {
    // This is normal: it can happen if we commit to give extents to
    // bluefs and we crash before bluefs commits that it owns them.
    dout(10) << __func__ << " super extra " << super_extra << dendl;
    for (interval_set<uint64_t>::iterator p = super_extra.begin();
	 p != super_extra.end();
	 ++p) {
      bluefs->add_block_extent(bluefs_shared_bdev, p.get_start(), p.get_len());
    }
  }

  return 0;
}
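// Rebalance free space between bluefs (which backs rocksdb) and the
// main bluestore allocator.  The gift/reclaim decision below works on
// the ratio of bluefs free space to the combined free space; as a
// hedged numeric sketch, with 10 GiB free in bluefs and 990 GiB free
// in bluestore the ratio is 0.01, and if that falls below
// bluestore_bluefs_min_ratio a gift of
// bluestore_bluefs_gift_ratio * total_free is proposed.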
int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
{
  int ret = 0;
  assert(bluefs);

  vector<pair<uint64_t,uint64_t>> bluefs_usage;  // <free, total> ...
  bluefs->get_usage(&bluefs_usage);
  assert(bluefs_usage.size() > bluefs_shared_bdev);

  // fixme: look at primary bdev only for now
  uint64_t bluefs_free = bluefs_usage[bluefs_shared_bdev].first;
  uint64_t bluefs_total = bluefs_usage[bluefs_shared_bdev].second;
  float bluefs_free_ratio = (float)bluefs_free / (float)bluefs_total;

  uint64_t my_free = alloc->get_free();
  uint64_t total = bdev->get_size();
  float my_free_ratio = (float)my_free / (float)total;

  uint64_t total_free = bluefs_free + my_free;

  float bluefs_ratio = (float)bluefs_free / (float)total_free;

  dout(10) << __func__
	   << " bluefs " << pretty_si_t(bluefs_free)
	   << " free (" << bluefs_free_ratio
	   << ") bluestore " << pretty_si_t(my_free)
	   << " free (" << my_free_ratio
	   << "), bluefs_ratio " << bluefs_ratio
	   << dendl;

  uint64_t gift = 0;
  uint64_t reclaim = 0;
  if (bluefs_ratio < cct->_conf->bluestore_bluefs_min_ratio) {
    gift = cct->_conf->bluestore_bluefs_gift_ratio * total_free;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
	     << " < min_ratio " << cct->_conf->bluestore_bluefs_min_ratio
	     << ", should gift " << pretty_si_t(gift) << dendl;
  } else if (bluefs_ratio > cct->_conf->bluestore_bluefs_max_ratio) {
    reclaim = cct->_conf->bluestore_bluefs_reclaim_ratio * total_free;
    if (bluefs_total - reclaim < cct->_conf->bluestore_bluefs_min)
      reclaim = bluefs_total - cct->_conf->bluestore_bluefs_min;
    dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio
	     << " > max_ratio " << cct->_conf->bluestore_bluefs_max_ratio
	     << ", should reclaim " << pretty_si_t(reclaim) << dendl;
  }
  if (bluefs_total < cct->_conf->bluestore_bluefs_min &&
      cct->_conf->bluestore_bluefs_min <
      (uint64_t)(cct->_conf->bluestore_bluefs_max_ratio * total_free)) {
    uint64_t g = cct->_conf->bluestore_bluefs_min - bluefs_total;
    dout(10) << __func__ << " bluefs_total " << bluefs_total
	     << " < min " << cct->_conf->bluestore_bluefs_min
	     << ", should gift " << pretty_si_t(g) << dendl;
    if (g > gift)
      gift = g;
    reclaim = 0;
  }

  if (gift) {
    // round up to alloc size
    gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);

    // hard cap to fit into 32 bits
    gift = MIN(gift, 1ull<<31);
    dout(10) << __func__ << " gifting " << gift
	     << " (" << pretty_si_t(gift) << ")" << dendl;

    // fixme: just do one allocation to start...
    int r = alloc->reserve(gift);
    assert(r == 0);

    AllocExtentVector exts;
    int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size,
					0, 0, &exts);

    if (alloc_len < (int64_t)gift) {
      derr << __func__ << " allocate failed on 0x" << std::hex << gift
	   << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
      assert(0 == "allocate failed, wtf");
    }
    for (auto& p : exts) {
      bluestore_pextent_t e = bluestore_pextent_t(p);
      dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl;
      extents->push_back(e);
    }
    ret = 1;
  }

  // reclaim from bluefs?
  if (reclaim) {
    // round up to alloc size
    reclaim = P2ROUNDUP(reclaim, cct->_conf->bluefs_alloc_size);

    // hard cap to fit into 32 bits
    reclaim = MIN(reclaim, 1ull<<31);
    dout(10) << __func__ << " reclaiming " << reclaim
	     << " (" << pretty_si_t(reclaim) << ")" << dendl;

    while (reclaim > 0) {
      // NOTE: this will block and do IO.
      AllocExtentVector extents;
      int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim,
				     &extents);
      if (r < 0) {
	derr << __func__ << " failed to reclaim space from bluefs"
	     << dendl;
	break;
      }
      for (auto e : extents) {
	bluefs_extents.erase(e.offset, e.length);
	bluefs_extents_reclaiming.insert(e.offset, e.length);
	reclaim -= e.length;
      }
    }
    ret = 1;
  }

  return ret;
}
void BlueStore::_commit_bluefs_freespace(
  const PExtentVector& bluefs_gift_extents)
{
  dout(10) << __func__ << dendl;
  for (auto& p : bluefs_gift_extents) {
    bluefs->add_block_extent(bluefs_shared_bdev, p.offset, p.length);
  }
}
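// Load every collection from the PREFIX_COLL keyspace into coll_map.
// Each value decodes to a cnode_t; a key that fails coll_t::parse()
// is reported (and counted when the caller passed an error counter)
// rather than aborting the scan.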
int BlueStore::_open_collections(int *errors)
{
  assert(coll_map.empty());
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
  for (it->upper_bound(string());
       it->valid();
       it->next()) {
    coll_t cid;
    if (cid.parse(it->key())) {
      CollectionRef c(
	new Collection(
	  this,
	  cache_shards[cid.hash_to_shard(cache_shards.size())],
	  cid));
      bufferlist bl = it->value();
      bufferlist::iterator p = bl.begin();
      try {
	::decode(c->cnode, p);
      } catch (buffer::error& e) {
	derr << __func__ << " failed to decode cnode, key:"
	     << pretty_binary_string(it->key()) << dendl;
	return -EIO;
      }
      dout(20) << __func__ << " opened " << cid << " " << c << dendl;
      coll_map[cid] = c;
    } else {
      derr << __func__ << " unrecognized collection " << it->key() << dendl;
      if (errors)
	(*errors)++;
    }
  }
  return 0;
}
int BlueStore::_setup_block_symlink_or_file(
  string name,
  string epath,
  uint64_t size,
  bool create)
{
  dout(20) << __func__ << " name " << name << " path " << epath
	   << " size " << size << " create=" << (int)create << dendl;
  int r = 0;
  int flags = O_RDWR;
  if (create)
    flags |= O_CREAT;
  if (epath.length()) {
    r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to create " << name << " symlink to "
	   << epath << ": " << cpp_strerror(r) << dendl;
      return r;
    }

    if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
      if (fd < 0) {
	r = -errno;
	derr << __func__ << " failed to open " << epath << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
      string serial_number = epath.substr(strlen(SPDK_PREFIX));
      r = ::write(fd, serial_number.c_str(), serial_number.size());
      assert(r == (int)serial_number.size());
      dout(1) << __func__ << " created " << name << " symlink to "
	      << epath << dendl;
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    }
  }
  if (size) {
    int fd = ::openat(path_fd, name.c_str(), flags, 0644);
    if (fd >= 0) {
      // block file is present
      struct stat st;
      int r = ::fstat(fd, &st);
      if (r == 0 &&
	  S_ISREG(st.st_mode) &&   // if it is a regular file
	  st.st_size == 0) {       // and is 0 bytes
	r = ::ftruncate(fd, size);
	if (r < 0) {
	  derr << __func__ << " failed to resize " << name << " file to "
	       << size << ": " << cpp_strerror(r) << dendl;
	  VOID_TEMP_FAILURE_RETRY(::close(fd));
	  return r;
	}

	if (cct->_conf->bluestore_block_preallocate_file) {
#ifdef HAVE_POSIX_FALLOCATE
	  r = ::posix_fallocate(fd, 0, size);
	  if (r < 0) {
	    derr << __func__ << " failed to prefallocate " << name << " file to "
		 << size << ": " << cpp_strerror(r) << dendl;
	    VOID_TEMP_FAILURE_RETRY(::close(fd));
	    return -r;
	  }
#else
	  char data[1024*128];
	  for (uint64_t off = 0; off < size; off += sizeof(data)) {
	    if (off + sizeof(data) > size)
	      r = ::write(fd, data, size - off);
	    else
	      r = ::write(fd, data, sizeof(data));
	    if (r < 0) {
	      derr << __func__ << " failed to prefallocate w/ write " << name
		   << " file to " << size << ": " << cpp_strerror(r) << dendl;
	      VOID_TEMP_FAILURE_RETRY(::close(fd));
	      return r;
	    }
	  }
#endif
	}
	dout(1) << __func__ << " resized " << name << " file to "
		<< pretty_si_t(size) << "B" << dendl;
      }
      VOID_TEMP_FAILURE_RETRY(::close(fd));
    } else {
      int r = -errno;
      if (r != -ENOENT) {
	derr << __func__ << " failed to open " << name << " file: "
	     << cpp_strerror(r) << dendl;
	return r;
      }
    }
  }
  return 0;
}
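// mkfs is deliberately idempotent: a successful run ends by writing
// the "mkfs_done" meta key, and a rerun that finds the key only
// re-validates (optionally fscks) and returns.  The order below is:
// "type" meta check, fsid setup, block/wal/db symlinks or files,
// bdev, db, then the SUPER keys (nid_max, blobid_max, min_alloc_size,
// ondisk format).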
int BlueStore::mkfs()
{
  dout(1) << __func__ << " path " << path << dendl;
  int r;
  uuid_d old_fsid;

  {
    string done;
    r = read_meta("mkfs_done", &done);
    if (r == 0) {
      dout(1) << __func__ << " already created" << dendl;
      if (cct->_conf->bluestore_fsck_on_mkfs) {
	r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
	if (r < 0) {
	  derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
	       << dendl;
	  return r;
	}
	if (r > 0) {
	  derr << __func__ << " fsck found " << r << " errors" << dendl;
	  r = -EIO;
	}
      }
      return r; // idempotent
    }
  }

  {
    string type;
    r = read_meta("type", &type);
    if (r == 0) {
      if (type != "bluestore") {
	derr << __func__ << " expected bluestore, but type is " << type << dendl;
	return -EIO;
      }
    } else {
      r = write_meta("type", "bluestore");
      if (r < 0)
	return r;
    }
  }

  freelist_type = "bitmap";

  r = _open_path();
  if (r < 0)
    return r;

  r = _open_fsid(true);
  if (r < 0)
    goto out_path_fd;

  r = _lock_fsid();
  if (r < 0)
    goto out_close_fsid;

  r = _read_fsid(&old_fsid);
  if (r < 0 || old_fsid.is_zero()) {
    if (fsid.is_zero()) {
      fsid.generate_random();
      dout(1) << __func__ << " generated fsid " << fsid << dendl;
    } else {
      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
    }
    // we'll write it later.
  } else {
    if (!fsid.is_zero() && fsid != old_fsid) {
      derr << __func__ << " on-disk fsid " << old_fsid
	   << " != provided " << fsid << dendl;
      r = -EINVAL;
      goto out_close_fsid;
    }
    fsid = old_fsid;
  }

  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
				   cct->_conf->bluestore_block_size,
				   cct->_conf->bluestore_block_create);
  if (r < 0)
    goto out_close_fsid;
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal",
				     cct->_conf->bluestore_block_wal_path,
				     cct->_conf->bluestore_block_wal_size,
				     cct->_conf->bluestore_block_wal_create);
    if (r < 0)
      goto out_close_fsid;
    r = _setup_block_symlink_or_file("block.db",
				     cct->_conf->bluestore_block_db_path,
				     cct->_conf->bluestore_block_db_size,
				     cct->_conf->bluestore_block_db_create);
    if (r < 0)
      goto out_close_fsid;
  }

  r = _open_bdev(true);
  if (r < 0)
    goto out_close_fsid;

  r = _open_db(true);
  if (r < 0)
    goto out_close_bdev;

  r = _open_fm(true);
  if (r < 0)
    goto out_close_db;

  {
    KeyValueDB::Transaction t = db->get_transaction();
    {
      bufferlist bl;
      ::encode((uint64_t)0, bl);
      t->set(PREFIX_SUPER, "nid_max", bl);
      t->set(PREFIX_SUPER, "blobid_max", bl);
    }

    // choose min_alloc_size
    if (cct->_conf->bluestore_min_alloc_size) {
      min_alloc_size = cct->_conf->bluestore_min_alloc_size;
    } else {
      if (bdev->is_rotational()) {
	min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
      } else {
	min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
      }
    }
    {
      bufferlist bl;
      ::encode((uint64_t)min_alloc_size, bl);
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
    }

    ondisk_format = latest_ondisk_format;
    _prepare_ondisk_format_super(t);
    db->submit_transaction_sync(t);
  }

  r = _open_alloc();
  if (r < 0)
    goto out_close_fm;

  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
  if (r < 0)
    goto out_close_alloc;
  r = write_meta("bluefs", stringify((int)cct->_conf->bluestore_bluefs));
  if (r < 0)
    goto out_close_alloc;

  if (fsid != old_fsid) {
    r = _write_fsid();
    if (r < 0) {
      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
      goto out_close_alloc;
    }
  }

  // indicate success by writing the 'mkfs_done' file
  r = write_meta("mkfs_done", "yes");
  if (r < 0)
    goto out_close_alloc;
  dout(10) << __func__ << " success" << dendl;

 out_close_alloc:
  _close_alloc();
 out_close_fm:
  _close_fm();
 out_close_db:
  _close_db();
 out_close_bdev:
  _close_bdev();
 out_close_fsid:
  _close_fsid();
 out_path_fd:
  _close_path();

  if (r == 0 &&
      cct->_conf->bluestore_fsck_on_mkfs) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      r = -EIO;
    }
  }
  if (r < 0) {
    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
  }
  return r;
}
void BlueStore::set_cache_shards(unsigned num)
{
  dout(10) << __func__ << " " << num << dendl;
  size_t old = cache_shards.size();
  assert(num >= old);
  cache_shards.resize(num);
  for (unsigned i = old; i < num; ++i) {
    cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
				    logger);
  }
}
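// Mount sequence.  bluestore_fsck_on_mount runs a full fsck (which
// opens and closes everything itself) before the real open sequence;
// a kv_only mount stops right after _open_db() so tools can inspect
// the database without replaying deferred IO or starting threads.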
int BlueStore::_mount(bool kv_only)
{
  dout(1) << __func__ << " path " << path << dendl;

  {
    string type;
    int r = read_meta("type", &type);
    if (r < 0) {
      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
	   << dendl;
      return r;
    }

    if (type != "bluestore") {
      derr << __func__ << " expected bluestore, but type is " << type << dendl;
      return -EIO;
    }
  }

  if (cct->_conf->bluestore_fsck_on_mount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  r = _open_db(false);
  if (r < 0)
    goto out_bdev;

  if (kv_only)
    return 0;

  r = _open_super_meta();
  if (r < 0)
    goto out_db;

  r = _open_fm(false);
  if (r < 0)
    goto out_db;

  r = _open_alloc();
  if (r < 0)
    goto out_fm;

  r = _open_collections();
  if (r < 0)
    goto out_alloc;

  r = _reload_logger();
  if (r < 0)
    goto out_coll;

  if (bluefs) {
    r = _reconcile_bluefs_freespace();
    if (r < 0)
      goto out_coll;
  }

  for (auto f : finishers) {
    f->start();
  }
  kv_sync_thread.create("bstore_kv_sync");

  r = _deferred_replay();
  if (r < 0)
    goto out_stop;

  mempool_thread.init();

  mounted = true;
  return 0;

 out_stop:
  _kv_stop();
  for (auto f : finishers) {
    f->wait_for_empty();
    f->stop();
  }
 out_coll:
  _flush_cache();
 out_alloc:
  _close_alloc();
 out_fm:
  _close_fm();
 out_db:
  _close_db();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();
  return r;
}
int BlueStore::umount()
{
  assert(mounted);
  dout(1) << __func__ << dendl;

  _osr_drain_all();
  _osr_unregister_all();

  mempool_thread.shutdown();

  dout(20) << __func__ << " stopping kv thread" << dendl;
  _kv_stop();
  for (auto f : finishers) {
    dout(20) << __func__ << " draining finisher" << dendl;
    f->wait_for_empty();
    dout(20) << __func__ << " stopping finisher" << dendl;
    f->stop();
  }
  _reap_collections();
  _flush_cache();
  dout(20) << __func__ << " closing" << dendl;

  mounted = false;
  _close_alloc();
  _close_fm();
  _close_db();
  _close_bdev();
  _close_fsid();
  _close_path();

  if (cct->_conf->bluestore_fsck_on_umount) {
    int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
    if (rc < 0)
      return rc;
    if (rc > 0) {
      derr << __func__ << " fsck found " << rc << " errors" << dendl;
      return -EIO;
    }
  }
  return 0;
}
static void apply(uint64_t off,
		  uint64_t len,
		  uint64_t granularity,
		  BlueStore::mempool_dynamic_bitset &bitset,
		  const char *what,
		  std::function<void(uint64_t,
				     BlueStore::mempool_dynamic_bitset &)> f) {
  auto end = ROUND_UP_TO(off + len, granularity);
  while (off < end) {
    uint64_t pos = off / granularity;
    f(pos, bitset);
    off += granularity;
  }
}
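// A minimal usage sketch for apply() as the fsck code below uses it
// (the lambda body here is illustrative; the real callers also test
// bits to detect overlaps):
//
//   apply(0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
//         [&](uint64_t pos, BlueStore::mempool_dynamic_bitset &bs) {
//           bs.set(pos);   // mark this block-sized chunk as used
//         });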
int BlueStore::_fsck_check_extents(
  const ghobject_t& oid,
  const PExtentVector& extents,
  bool compressed,
  mempool_dynamic_bitset &used_blocks,
  store_statfs_t& expected_statfs)
{
  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
  int errors = 0;
  for (auto e : extents) {
    if (!e.is_valid())
      continue;
    expected_statfs.allocated += e.length;
    if (compressed) {
      expected_statfs.compressed_allocated += e.length;
    }
    bool already = false;
    apply(
      e.offset, e.length, block_size, used_blocks, __func__,
      [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	if (bs.test(pos))
	  already = true;
	else
	  bs.set(pos);
      });
    if (already) {
      derr << " " << oid << " extent " << e
	   << " or a subset is already allocated" << dendl;
      ++errors;
    }
    if (e.end() > bdev->get_size()) {
      derr << " " << oid << " extent " << e
	   << " past end of block device" << dendl;
      ++errors;
    }
  }
  return errors;
}
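// fsck walks the store in phases, accumulating an error count: the
// object keyspace (onodes, shards, lextents, blobs), shared blobs,
// stray omap data, deferred events, and finally the freelist versus
// the used-block bitmap built along the way.  A deep fsck also reads
// every object's data to exercise the checksum path.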
int BlueStore::fsck(bool deep)
{
  dout(1) << __func__ << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
  int errors = 0;

  mempool::bluestore_fsck::set<uint64_t> used_nids;
  mempool::bluestore_fsck::set<uint64_t> used_omap_head;
  mempool_dynamic_bitset used_blocks;
  mempool::bluestore_fsck::set<uint64_t> used_sbids;
  KeyValueDB::Iterator it;
  store_statfs_t expected_statfs, actual_statfs;
  struct sb_info_t {
    list<ghobject_t> oids;
    SharedBlobRef sb;
    bluestore_extent_ref_map_t ref_map;
    bool compressed;
  };
  mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;

  uint64_t num_objects = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_spanning_blobs = 0;
  uint64_t num_shared_blobs = 0;
  uint64_t num_sharded_objects = 0;
  uint64_t num_object_shards = 0;

  utime_t start = ceph_clock_now();

  int r = _open_path();
  if (r < 0)
    return r;
  r = _open_fsid(false);
  if (r < 0)
    goto out_path;

  r = _read_fsid(&fsid);
  if (r < 0)
    goto out_fsid;

  r = _lock_fsid();
  if (r < 0)
    goto out_fsid;

  r = _open_bdev(false);
  if (r < 0)
    goto out_fsid;

  r = _open_db(false);
  if (r < 0)
    goto out_bdev;

  r = _open_super_meta();
  if (r < 0)
    goto out_db;

  r = _open_fm(false);
  if (r < 0)
    goto out_db;

  r = _open_alloc();
  if (r < 0)
    goto out_fm;

  r = _open_collections(&errors);
  if (r < 0)
    goto out_alloc;

  mempool_thread.init();

  // replay any deferred IO before taking stock of the on-disk state
  r = _deferred_replay();
  if (r < 0)
    goto out_scan;

  used_blocks.resize(bdev->get_size() / block_size);
  apply(
    0, SUPER_RESERVED, block_size, used_blocks, "0~SUPER_RESERVED",
    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
      bs.set(pos);
    });

  if (bluefs) {
    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply(
	e.get_start(), e.get_len(), block_size, used_blocks, "bluefs",
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  bs.set(pos);
	});
    }
    r = bluefs->fsck();
    if (r < 0)
      goto out_scan;
    if (r > 0)
      errors += r;
  }

  // get expected statfs; fill unaffected fields to be able to compare
  // against the actual statfs
  statfs(&actual_statfs);
  expected_statfs.total = actual_statfs.total;
  expected_statfs.available = actual_statfs.available;

  // walk PREFIX_OBJ
  dout(1) << __func__ << " walking object keyspace" << dendl;
  it = db->get_iterator(PREFIX_OBJ);
  if (it) {
    CollectionRef c;
    spg_t pgid;
    mempool::bluestore_fsck::list<string> expecting_shards;
    for (it->lower_bound(string()); it->valid(); it->next()) {
      dout(30) << " key " << pretty_binary_string(it->key()) << dendl;
      if (is_extent_shard_key(it->key())) {
	while (!expecting_shards.empty() &&
	       expecting_shards.front() < it->key()) {
	  derr << __func__ << " error: missing shard key "
	       << pretty_binary_string(expecting_shards.front())
	       << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	}
	if (!expecting_shards.empty() &&
	    expecting_shards.front() == it->key()) {
	  // all good
	  expecting_shards.pop_front();
	  continue;
	}

	uint32_t offset;
	string okey;
	get_key_extent_shard(it->key(), &okey, &offset);
	derr << __func__ << " error: stray shard 0x" << std::hex << offset
	     << std::dec << dendl;
	if (expecting_shards.empty()) {
	  derr << __func__ << " error: " << pretty_binary_string(it->key())
	       << " is unexpected" << dendl;
	  ++errors;
	  continue;
	}
	while (expecting_shards.front() > it->key()) {
	  derr << __func__ << " error:   saw " << pretty_binary_string(it->key())
	       << dendl;
	  derr << __func__ << " error:   exp "
	       << pretty_binary_string(expecting_shards.front()) << dendl;
	  ++errors;
	  expecting_shards.pop_front();
	  if (expecting_shards.empty()) {
	    break;
	  }
	}
	continue;
      }

      ghobject_t oid;
      int r = get_key_object(it->key(), &oid);
      if (r < 0) {
	derr << __func__ << " error: bad object key "
	     << pretty_binary_string(it->key()) << dendl;
	++errors;
	continue;
      }
      if (!c ||
	  oid.shard_id != pgid.shard ||
	  oid.hobj.pool != (int64_t)pgid.pool() ||
	  !c->contains(oid)) {
	c = nullptr;
	for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
	       coll_map.begin();
	     p != coll_map.end();
	     ++p) {
	  if (p->second->contains(oid)) {
	    c = p->second;
	    break;
	  }
	}
	if (!c) {
	  derr << __func__ << " error: stray object " << oid
	       << " not owned by any collection" << dendl;
	  ++errors;
	  continue;
	}
	c->cid.is_pg(&pgid);
	dout(20) << __func__ << " collection " << c->cid << dendl;
      }

      if (!expecting_shards.empty()) {
	for (auto &k : expecting_shards) {
	  derr << __func__ << " error: missing shard key "
	       << pretty_binary_string(k) << dendl;
	}
	++errors;
	expecting_shards.clear();
      }

      dout(10) << __func__ << "  " << oid << dendl;
      RWLock::RLocker l(c->lock);
      OnodeRef o = c->get_onode(oid, false);
      if (o->onode.nid) {
	if (o->onode.nid > nid_max) {
	  derr << __func__ << " error: " << oid << " nid " << o->onode.nid
	       << " > nid_max " << nid_max << dendl;
	  ++errors;
	}
	if (used_nids.count(o->onode.nid)) {
	  derr << __func__ << " error: " << oid << " nid " << o->onode.nid
	       << " already in use" << dendl;
	  ++errors;
	  continue; // go for next object
	}
	used_nids.insert(o->onode.nid);
      }
      ++num_objects;
      num_spanning_blobs += o->extent_map.spanning_blob_map.size();
      o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
      // shards
      if (!o->extent_map.shards.empty()) {
	++num_sharded_objects;
	num_object_shards += o->extent_map.shards.size();
      }
      for (auto& s : o->extent_map.shards) {
	dout(20) << __func__ << "    shard " << *s.shard_info << dendl;
	expecting_shards.push_back(string());
	get_extent_shard_key(o->key, s.shard_info->offset,
			     &expecting_shards.back());
	if (s.shard_info->offset >= o->onode.size) {
	  derr << __func__ << " error: " << oid << " shard 0x" << std::hex
	       << s.shard_info->offset << " past EOF at 0x" << o->onode.size
	       << std::dec << dendl;
	  ++errors;
	}
      }
      // lextents
      map<BlobRef,bluestore_blob_t::unused_t> referenced;
      uint64_t pos = 0;
      mempool::bluestore_fsck::map<BlobRef,
				   bluestore_blob_use_tracker_t> ref_map;
      for (auto& l : o->extent_map.extent_map) {
	dout(20) << __func__ << "    " << l << dendl;
	if (l.logical_offset < pos) {
	  derr << __func__ << " error: " << oid << " lextent at 0x"
	       << std::hex << l.logical_offset
	       << " overlaps with the previous, which ends at 0x" << pos
	       << std::dec << dendl;
	  ++errors;
	}
	if (o->extent_map.spans_shard(l.logical_offset, l.length)) {
	  derr << __func__ << " error: " << oid << " lextent at 0x"
	       << std::hex << l.logical_offset << "~" << l.length
	       << " spans a shard boundary"
	       << std::dec << dendl;
	  ++errors;
	}
	pos = l.logical_offset + l.length;
	expected_statfs.stored += l.length;
	assert(l.blob);
	const bluestore_blob_t& blob = l.blob->get_blob();

	auto& ref = ref_map[l.blob];
	if (ref.is_empty()) {
	  uint32_t min_release_size = blob.get_release_size(min_alloc_size);
	  uint32_t l = blob.get_logical_length();
	  ref.init(l, min_release_size);
	}
	ref.get(l.blob_offset, l.length);
	++num_extents;
	if (blob.has_unused()) {
	  auto p = referenced.find(l.blob);
	  bluestore_blob_t::unused_t *pu;
	  if (p == referenced.end()) {
	    pu = &referenced[l.blob];
	  } else {
	    pu = &p->second;
	  }
	  uint64_t blob_len = blob.get_logical_length();
	  assert((blob_len % (sizeof(*pu)*8)) == 0);
	  assert(l.blob_offset + l.length <= blob_len);
	  uint64_t chunk_size = blob_len / (sizeof(*pu)*8);
	  uint64_t start = l.blob_offset / chunk_size;
	  uint64_t end =
	    ROUND_UP_TO(l.blob_offset + l.length, chunk_size) / chunk_size;
	  for (auto i = start; i < end; ++i) {
	    (*pu) |= (1u << i);
	  }
	}
      }
      for (auto &i : referenced) {
	dout(20) << __func__ << "  referenced 0x" << std::hex << i.second
		 << std::dec << " for " << *i.first << dendl;
	const bluestore_blob_t& blob = i.first->get_blob();
	if (i.second & blob.unused) {
	  derr << __func__ << " error: " << oid << " blob claims unused 0x"
	       << std::hex << blob.unused
	       << " but extents reference 0x" << i.second
	       << " on blob " << *i.first << dendl;
	  ++errors;
	}
	if (blob.has_csum()) {
	  uint64_t blob_len = blob.get_logical_length();
	  uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused)*8);
	  unsigned csum_count = blob.get_csum_count();
	  unsigned csum_chunk_size = blob.get_csum_chunk_size();
	  for (unsigned p = 0; p < csum_count; ++p) {
	    unsigned pos = p * csum_chunk_size;
	    unsigned firstbit = pos / unused_chunk_size;    // [firstbit,lastbit]
	    unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
	    unsigned mask = 1u << firstbit;
	    for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
	      mask |= 1u << b;
	    }
	    if ((blob.unused & mask) == mask) {
	      // this csum chunk region is marked unused
	      if (blob.get_csum_item(p) != 0) {
		derr << __func__ << " error: " << oid
		     << " blob claims csum chunk 0x" << std::hex << pos
		     << "~" << csum_chunk_size
		     << " is unused (mask 0x" << mask << " of unused 0x"
		     << blob.unused << ") but csum is non-zero 0x"
		     << blob.get_csum_item(p) << std::dec << " on blob "
		     << *i.first << dendl;
		++errors;
	      }
	    }
	  }
	}
      }
      for (auto &i : ref_map) {
	++num_blobs;
	const bluestore_blob_t& blob = i.first->get_blob();
	bool equal = i.first->get_blob_use_tracker().equal(i.second);
	if (!equal) {
	  derr << __func__ << " error: " << oid << " blob " << *i.first
	       << " doesn't match expected ref_map " << i.second << dendl;
	  ++errors;
	}
	if (blob.is_compressed()) {
	  expected_statfs.compressed += blob.get_compressed_payload_length();
	  expected_statfs.compressed_original +=
	    i.first->get_referenced_bytes();
	}
	if (blob.is_shared()) {
	  if (i.first->shared_blob->get_sbid() > blobid_max) {
	    derr << __func__ << " error: " << oid << " blob " << blob
		 << " sbid " << i.first->shared_blob->get_sbid()
		 << " > blobid_max " << blobid_max << dendl;
	    ++errors;
	  } else if (i.first->shared_blob->get_sbid() == 0) {
	    derr << __func__ << " error: " << oid << " blob " << blob
		 << " marked as shared but has uninitialized sbid"
		 << dendl;
	    ++errors;
	  }
	  sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
	  sbi.sb = i.first->shared_blob;
	  sbi.oids.push_back(oid);
	  sbi.compressed = blob.is_compressed();
	  for (auto e : blob.get_extents()) {
	    if (e.is_valid()) {
	      sbi.ref_map.get(e.offset, e.length);
	    }
	  }
	} else {
	  errors += _fsck_check_extents(oid, blob.get_extents(),
					blob.is_compressed(),
					used_blocks,
					expected_statfs);
	}
      }
      if (deep) {
	bufferlist bl;
	int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0);
	if (r < 0) {
	  ++errors;
	  derr << __func__ << " error: " << oid << " error during read: "
	       << cpp_strerror(r) << dendl;
	}
      }
      // omap
      if (o->onode.has_omap()) {
	if (used_omap_head.count(o->onode.nid)) {
	  derr << __func__ << " error: " << oid << " omap_head " << o->onode.nid
	       << " already in use" << dendl;
	  ++errors;
	} else {
	  used_omap_head.insert(o->onode.nid);
	}
      }
    }
  }

  dout(1) << __func__ << " checking shared_blobs" << dendl;
  it = db->get_iterator(PREFIX_SHARED_BLOB);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      string key = it->key();
      uint64_t sbid;
      if (get_key_shared_blob(key, &sbid)) {
	derr << __func__ << " error: bad key '" << key
	     << "' in shared blob namespace" << dendl;
	++errors;
	continue;
      }
      auto p = sb_info.find(sbid);
      if (p == sb_info.end()) {
	derr << __func__ << " error: found stray shared blob data for sbid 0x"
	     << std::hex << sbid << std::dec << dendl;
	++errors;
      } else {
	++num_shared_blobs;
	sb_info_t& sbi = p->second;
	bluestore_shared_blob_t shared_blob(sbid);
	bufferlist bl = it->value();
	bufferlist::iterator blp = bl.begin();
	::decode(shared_blob, blp);
	dout(20) << __func__ << "  " << *sbi.sb << " " << shared_blob << dendl;
	if (shared_blob.ref_map != sbi.ref_map) {
	  derr << __func__ << " error: shared blob 0x" << std::hex << sbid
	       << std::dec << " ref_map " << shared_blob.ref_map
	       << " != expected " << sbi.ref_map << dendl;
	  ++errors;
	}
	PExtentVector extents;
	for (auto &r : shared_blob.ref_map.ref_map) {
	  extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
	}
	errors += _fsck_check_extents(p->second.oids.front(),
				      extents,
				      p->second.compressed,
				      used_blocks, expected_statfs);
	sb_info.erase(p);
      }
    }
  }
  for (auto &p : sb_info) {
    derr << __func__ << " error: shared_blob 0x" << p.first
	 << " key is missing (" << *p.second.sb << ")" << dendl;
    ++errors;
  }
  if (!(actual_statfs == expected_statfs)) {
    derr << __func__ << " error: actual " << actual_statfs
	 << " != expected " << expected_statfs << dendl;
    ++errors;
  }

  dout(1) << __func__ << " checking for stray omap data" << dendl;
  it = db->get_iterator(PREFIX_OMAP);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      uint64_t omap_head;
      _key_decode_u64(it->key().c_str(), &omap_head);
      if (used_omap_head.count(omap_head) == 0) {
	derr << __func__ << " error: found stray omap data on omap_head "
	     << omap_head << dendl;
	++errors;
      }
    }
  }

  dout(1) << __func__ << " checking deferred events" << dendl;
  it = db->get_iterator(PREFIX_DEFERRED);
  if (it) {
    for (it->lower_bound(string()); it->valid(); it->next()) {
      bufferlist bl = it->value();
      bufferlist::iterator p = bl.begin();
      bluestore_deferred_transaction_t wt;
      try {
	::decode(wt, p);
      } catch (buffer::error& e) {
	derr << __func__ << " error: failed to decode deferred txn "
	     << pretty_binary_string(it->key()) << dendl;
	r = -EIO;
	goto out_scan;
      }
      dout(20) << __func__ << "  deferred " << wt.seq
	       << " ops " << wt.ops.size()
	       << " released 0x" << std::hex << wt.released << std::dec
	       << dendl;
      for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
	apply(
	  e.get_start(), e.get_len(), block_size, used_blocks, "deferred",
	  [&](uint64_t pos, mempool_dynamic_bitset &bs) {
	    bs.set(pos);
	  });
      }
    }
  }

  dout(1) << __func__ << " checking freelist vs allocated" << dendl;
  {
    // remove bluefs_extents from used set since the freelist doesn't
    // know they are allocated.
    for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
      apply(
	e.get_start(), e.get_len(), block_size, used_blocks, "bluefs_extents",
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  bs.reset(pos);
	});
    }
    fm->enumerate_reset();
    uint64_t offset, length;
    while (fm->enumerate_next(&offset, &length)) {
      bool intersects = false;
      apply(
	offset, length, block_size, used_blocks, "free",
	[&](uint64_t pos, mempool_dynamic_bitset &bs) {
	  if (bs.test(pos)) {
	    intersects = true;
	  } else {
	    bs.set(pos);
	  }
	});
      if (intersects) {
	derr << __func__ << " error: free extent 0x" << std::hex << offset
	     << "~" << length << std::dec
	     << " intersects allocated blocks" << dendl;
	++errors;
      }
    }
    size_t count = used_blocks.count();
    if (used_blocks.size() != count) {
      assert(used_blocks.size() > count);
      derr << __func__ << " error: leaked some space;"
	   << (used_blocks.size() - count) * min_alloc_size
	   << " bytes leaked" << dendl;
      ++errors;
    }
  }

 out_scan:
  mempool_thread.shutdown();
 out_alloc:
  _close_alloc();
 out_fm:
  _close_fm();
 out_db:
  it.reset();  // before db is closed
  _close_db();
 out_bdev:
  _close_bdev();
 out_fsid:
  _close_fsid();
 out_path:
  _close_path();

  // fatal errors take precedence
  if (r < 0)
    return r;

  dout(2) << __func__ << " " << num_objects << " objects, "
	  << num_sharded_objects << " of them sharded." << dendl;
  dout(2) << __func__ << " " << num_extents << " extents to "
	  << num_blobs << " blobs, "
	  << num_spanning_blobs << " spanning, "
	  << num_shared_blobs << " shared." << dendl;

  utime_t duration = ceph_clock_now() - start;
  dout(1) << __func__ << " finish with " << errors << " errors in "
	  << duration << " seconds" << dendl;
  return errors;
}
void BlueStore::collect_metadata(map<string,string> *pm)
{
  dout(10) << __func__ << dendl;
  bdev->collect_metadata("bluestore_bdev_", pm);
  if (bluefs) {
    (*pm)["bluefs"] = "1";
    (*pm)["bluefs_single_shared_device"] =
      stringify((int)bluefs_single_shared_device);
    bluefs->collect_metadata(pm);
  } else {
    (*pm)["bluefs"] = "0";
  }
}
int BlueStore::statfs(struct store_statfs_t *buf)
{
  buf->reset();
  buf->total = bdev->get_size();
  buf->available = alloc->get_free();

  if (bluefs) {
    // part of our shared device is "free" according to BlueFS
    // Don't include bluestore_bluefs_min because that space can't
    // be used for any other purpose.
    buf->available +=
      bluefs->get_free(bluefs_shared_bdev) - cct->_conf->bluestore_bluefs_min;

    // include dedicated db, too, if that isn't the shared device.
    if (bluefs_shared_bdev != BlueFS::BDEV_DB) {
      buf->total += bluefs->get_total(BlueFS::BDEV_DB);
    }
  }

  {
    bufferlist bl;
    int r = db->get(PREFIX_STAT, "bluestore_statfs", &bl);
    if (r >= 0) {
      TransContext::volatile_statfs vstatfs;
      if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
	auto it = bl.begin();
	vstatfs.decode(it);

	buf->allocated = vstatfs.allocated();
	buf->stored = vstatfs.stored();
	buf->compressed = vstatfs.compressed();
	buf->compressed_original = vstatfs.compressed_original();
	buf->compressed_allocated = vstatfs.compressed_allocated();
      } else {
	dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
      }
    } else {
      dout(10) << __func__ << " store_statfs missed, using empty" << dendl;
    }
  }

  dout(20) << __func__ << *buf << dendl;
  return 0;
}
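// A numeric illustration of the availability math above (values are
// made up for the example): if the allocator reports 900 GiB free and
// bluefs reports 10 GiB free on the shared device while
// bluestore_bluefs_min is 1 GiB, statfs reports 900 + (10 - 1) = 909 GiB
// available, since the bluefs floor can never be handed back.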
BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
{
  RWLock::RLocker l(coll_lock);
  ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
  if (cp == coll_map.end())
    return CollectionRef();
  return cp->second;
}

void BlueStore::_queue_reap_collection(CollectionRef& c)
{
  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
  std::lock_guard<std::mutex> l(reap_lock);
  removed_collections.push_back(c);
}
void BlueStore::_reap_collections()
{
  list<CollectionRef> removed_colls;
  {
    std::lock_guard<std::mutex> l(reap_lock);
    removed_colls.swap(removed_collections);
  }

  bool all_reaped = true;

  for (list<CollectionRef>::iterator p = removed_colls.begin();
       p != removed_colls.end();
       ++p) {
    CollectionRef c = *p;
    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
    if (c->onode_map.map_any([&](OnodeRef o) {
	  if (o->flushing_count.load()) {
	    dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
		     << " flush_txns " << o->flushing_count << dendl;
	    return true;
	  }
	  return false;
	})) {
      all_reaped = false;
      continue;
    }
    c->onode_map.clear();
    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
  }

  if (all_reaped) {
    dout(10) << __func__ << " all reaped" << dendl;
  }
}
void BlueStore::_update_cache_logger()
{
  uint64_t num_onodes = 0;
  uint64_t num_extents = 0;
  uint64_t num_blobs = 0;
  uint64_t num_buffers = 0;
  uint64_t num_buffer_bytes = 0;
  for (auto c : cache_shards) {
    c->add_stats(&num_onodes, &num_extents, &num_blobs,
		 &num_buffers, &num_buffer_bytes);
  }
  logger->set(l_bluestore_onodes, num_onodes);
  logger->set(l_bluestore_extents, num_extents);
  logger->set(l_bluestore_blobs, num_blobs);
  logger->set(l_bluestore_buffers, num_buffers);
  logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
}
ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
{
  return _get_collection(cid);
}

bool BlueStore::exists(const coll_t& cid, const ghobject_t& oid)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return false;
  return exists(c, oid);
}

bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return false;

  bool r = true;

  {
    RWLock::RLocker l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      r = false;
  }

  return r;
}
int BlueStore::stat(
  const coll_t& cid,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return stat(c, oid, st, allow_eio);
}

int BlueStore::stat(
  CollectionHandle &c_,
  const ghobject_t& oid,
  struct stat *st,
  bool allow_eio)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;

  {
    RWLock::RLocker l(c->lock);
    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists)
      return -ENOENT;
    st->st_size = o->onode.size;
    st->st_blksize = 4096;
    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
    st->st_nlink = 1;
  }

  int r = 0;
  if (_debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  return r;
}
int BlueStore::set_collection_opts(
  const coll_t& cid,
  const pool_opts_t& opts)
{
  CollectionHandle ch = _get_collection(cid);
  if (!ch)
    return -ENOENT;
  Collection *c = static_cast<Collection *>(ch.get());
  dout(15) << __func__ << " " << cid << " options " << opts << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::WLocker l(c->lock);
  c->pool_opts = opts;
  return 0;
}
int BlueStore::read(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  bool allow_eio)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return read(c, oid, offset, length, bl, op_flags, allow_eio);
}

int BlueStore::read(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags,
  bool allow_eio)
{
  utime_t start = ceph_clock_now();
  Collection *c = static_cast<Collection *>(c_.get());
  const coll_t &cid = c->get_cid();
  dout(15) << __func__ << " " << cid << " " << oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << dendl;
  if (!c->exists)
    return -ENOENT;

  bl.clear();
  int r;
  {
    RWLock::RLocker l(c->lock);
    utime_t start1 = ceph_clock_now();
    OnodeRef o = c->get_onode(oid, false);
    logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start1);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (offset == length && offset == 0)
      length = o->onode.size;

    r = _do_read(c, o, offset, length, bl, op_flags);
  }

 out:
  assert(allow_eio || r != -EIO);
  if (r == 0 && _debug_data_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << cid << " " << oid
	   << " 0x" << std::hex << offset << "~" << length << std::dec
	   << " = " << r << dendl;
  logger->tinc(l_bluestore_read_lat, ceph_clock_now() - start);
  return r;
}
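// The read path below proceeds in stages: region_t describes one
// contiguous piece of a blob that has to come from disk, and
// blobs2read groups those regions by blob.  _do_read() first satisfies
// what it can from the buffer cache, reads the remaining regions (with
// aio when more than one is outstanding), verifies checksums,
// decompresses compressed blobs, and finally stitches the pieces
// together, zero-filling any holes.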
// --------------------------------------------------------
// intermediate data structures used while reading

struct region_t {
  uint64_t logical_offset;
  uint64_t blob_xoffset;   //region offset within the blob
  uint64_t length;
  bufferlist bl;

  // used later in read process
  uint64_t front = 0;
  uint64_t r_off = 0;

  region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
    : logical_offset(offset),
      blob_xoffset(b_offs),
      length(len) {}
  region_t(const region_t& from)
    : logical_offset(from.logical_offset),
      blob_xoffset(from.blob_xoffset),
      length(from.length) {}

  friend ostream& operator<<(ostream& out, const region_t& r) {
    return out << "0x" << std::hex << r.logical_offset << ":"
	       << r.blob_xoffset << "~" << r.length << std::dec;
  }
};

typedef list<region_t> regions2read_t;
typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;

int BlueStore::_do_read(
  Collection *c,
  OnodeRef o,
  uint64_t offset,
  size_t length,
  bufferlist& bl,
  uint32_t op_flags)
{
  int r = 0;

  boost::intrusive::set<Extent>::iterator ep, eend;

  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << " size 0x" << o->onode.size << " (" << std::dec
	   << o->onode.size << ")" << dendl;
  bl.clear();

  if (offset >= o->onode.size) {
    return r;
  }

  // generally, don't buffer anything, unless the client explicitly requests
  // it.
  bool buffered = false;
  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered read" << dendl;
    buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_read &&
	     (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
			  CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered read" << dendl;
    buffered = true;
  }

  if (offset + length > o->onode.size) {
    length = o->onode.size - offset;
  }

  utime_t start = ceph_clock_now();
  o->extent_map.fault_range(db, offset, length);
  logger->tinc(l_bluestore_read_onode_meta_lat, ceph_clock_now() - start);

  ready_regions_t ready_regions;

  // build blob-wise list to of stuff read (that isn't cached)
  blobs2read_t blobs2read;
  unsigned left = length;
  uint64_t pos = offset;
  unsigned num_regions = 0;
  auto lp = o->extent_map.seek_lextent(offset);
  while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
	break;
      }
      dout(30) << __func__ << "  hole 0x" << std::hex << pos << "~" << hole
	       << std::dec << dendl;
      pos += hole;
      left -= hole;
    }
    BlobRef bptr = lp->blob;
    unsigned l_off = pos - lp->logical_offset;
    unsigned b_off = l_off + lp->blob_offset;
    unsigned b_len = std::min(left, lp->length - l_off);

    ready_regions_t cache_res;
    interval_set<uint32_t> cache_interval;
    bptr->shared_blob->bc.read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
    dout(20) << __func__ << "  blob " << *bptr << std::hex
	     << " need 0x" << b_off << "~" << b_len
	     << " cache has 0x" << cache_interval
	     << std::dec << dendl;

    auto pc = cache_res.begin();
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
	  pc->first == b_off) {
	l = pc->second.length();
	ready_regions[pos].claim(pc->second);
	dout(30) << __func__ << "    use cache 0x" << std::hex << pos << ": 0x"
		 << b_off << "~" << l << std::dec << dendl;
	++pc;
      } else {
	l = b_len;
	if (pc != cache_res.end()) {
	  assert(pc->first > b_off);
	  l = pc->first - b_off;
	}
	dout(30) << __func__ << "    will read 0x" << std::hex << pos << ": 0x"
		 << b_off << "~" << l << std::dec << dendl;
	blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
	++num_regions;
      }
      pos += l;
      b_off += l;
      left -= l;
      b_len -= l;
    }
    ++lp;
  }

  // read raw blob data.  use aio if we have >1 blobs to read.
  start = ceph_clock_now(); // for the sake of simplicity
			    // measure the whole block below.
			    // The error isn't that much...
  vector<bufferlist> compressed_blob_bls;
  IOContext ioc(cct, NULL);
  for (auto& p : blobs2read) {
    BlobRef bptr = p.first;
    dout(20) << __func__ << "  blob " << *bptr << std::hex
	     << " need " << p.second << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
      if (compressed_blob_bls.empty()) {
	// ensure we avoid any reallocation on subsequent blobs
	compressed_blob_bls.reserve(blobs2read.size());
      }
      compressed_blob_bls.push_back(bufferlist());
      bufferlist& bl = compressed_blob_bls.back();
      r = bptr->get_blob().map(
	0, bptr->get_blob().get_ondisk_length(),
	[&](uint64_t offset, uint64_t length) {
	  int r;
	  // use aio if there are more regions to read than those in this blob
	  if (num_regions > p.second.size()) {
	    r = bdev->aio_read(offset, length, &bl, &ioc);
	  } else {
	    r = bdev->read(offset, length, &bl, &ioc, false);
	  }
	  if (r < 0)
	    return r;
	  return 0;
	});
      assert(r == 0);
    } else {
      // read the pieces
      for (auto& reg : p.second) {
	// determine how much of the blob to read
	uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
	reg.r_off = reg.blob_xoffset;
	uint64_t r_len = reg.length;
	reg.front = reg.r_off % chunk_size;
	if (reg.front) {
	  reg.r_off -= reg.front;
	  r_len += reg.front;
	}
	unsigned tail = r_len % chunk_size;
	if (tail) {
	  r_len += chunk_size - tail;
	}
	dout(20) << __func__ << "    region 0x" << std::hex
		 << reg.logical_offset
		 << ": 0x" << reg.blob_xoffset << "~" << reg.length
		 << " reading 0x" << reg.r_off << "~" << r_len << std::dec
		 << dendl;

	// read it
	r = bptr->get_blob().map(
	  reg.r_off, r_len,
	  [&](uint64_t offset, uint64_t length) {
	    int r;
	    // use aio if there is more than one region to read
	    if (num_regions > 1) {
	      r = bdev->aio_read(offset, length, &reg.bl, &ioc);
	    } else {
	      r = bdev->read(offset, length, &reg.bl, &ioc, false);
	    }
	    if (r < 0)
	      return r;
	    return 0;
	  });
	assert(r == 0);
	assert(reg.bl.length() == r_len);
      }
    }
  }
  if (ioc.has_pending_aios()) {
    bdev->aio_submit(&ioc);
    dout(20) << __func__ << " waiting for aio" << dendl;
    ioc.aio_wait();
  }
  logger->tinc(l_bluestore_read_wait_aio_lat, ceph_clock_now() - start);

  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    BlobRef bptr = b2r_it->first;
    dout(20) << __func__ << "  blob " << *bptr << std::hex
	     << " need 0x" << b2r_it->second << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
		       b2r_it->second.front().logical_offset) < 0) {
	return -EIO;
      }
      bufferlist raw_bl;
      r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
	return r;
      if (buffered) {
	bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
				       raw_bl);
      }
      for (auto& i : b2r_it->second) {
	ready_regions[i.logical_offset].substr_of(
	  raw_bl, i.blob_xoffset, i.length);
      }
    } else {
      for (auto& reg : b2r_it->second) {
	if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
			 reg.logical_offset) < 0) {
	  return -EIO;
	}
	if (buffered) {
	  bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
					 reg.r_off, reg.bl);
	}

	// prune and keep result
	ready_regions[reg.logical_offset].substr_of(
	  reg.bl, reg.front, reg.length);
      }
    }
    ++b2r_it;
  }

  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
  pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
	       << ": data from 0x" << pr->first << "~" << pr->second.length()
	       << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
    } else {
      uint64_t l = length - pos;
      if (pr != pr_end) {
	assert(pr->first > pos + offset);
	l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
	       << ": zeros for 0x" << (pos + offset) << "~" << l
	       << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
  }
  assert(bl.length() == length);
  assert(pos == length);
  assert(pr == pr_end);
  r = bl.length();
  return r;
}
int BlueStore::_verify_csum(OnodeRef& o,
			    const bluestore_blob_t* blob, uint64_t blob_xoffset,
			    const bufferlist& bl,
			    uint64_t logical_offset) const
{
  int bad;
  uint64_t bad_csum;
  utime_t start = ceph_clock_now();
  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
  if (r < 0) {
    if (r == -1) {
      PExtentVector pex;
      blob->map(
	bad,
	blob->get_csum_chunk_size(),
	[&](uint64_t offset, uint64_t length) {
	  pex.emplace_back(bluestore_pextent_t(offset, length));
	  return 0;
	});
      derr << __func__ << " bad "
	   << Checksummer::get_csum_type_string(blob->csum_type)
	   << "/0x" << std::hex << blob->get_csum_chunk_size()
	   << " checksum at blob offset 0x" << bad
	   << ", got 0x" << bad_csum << ", expected 0x"
	   << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
	   << ", device location " << pex
	   << ", logical extent 0x" << std::hex
	   << (logical_offset + bad - blob_xoffset) << "~"
	   << blob->get_csum_chunk_size() << std::dec
	   << ", object " << o->oid
	   << dendl;
    } else {
      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
    }
  }
  logger->tinc(l_bluestore_csum_lat, ceph_clock_now() - start);
  return r;
}
int BlueStore::_decompress(bufferlist& source, bufferlist* result)
{
  int r = 0;
  utime_t start = ceph_clock_now();
  bufferlist::iterator i = source.begin();
  bluestore_compression_header_t chdr;
  ::decode(chdr, i);
  int alg = int(chdr.type);
  CompressorRef cp = compressor;
  if (!cp || (int)cp->get_type() != alg) {
    cp = Compressor::create(cct, alg);
  }

  if (!cp.get()) {
    // if compressor isn't available - error, because cannot return
    // decompressed data?
    derr << __func__ << " can't load decompressor " << alg << dendl;
    r = -EIO;
  } else {
    r = cp->decompress(i, chdr.length, *result);
    if (r < 0) {
      derr << __func__ << " decompression failed with exit code " << r << dendl;
      r = -EIO;
    }
  }
  logger->tinc(l_bluestore_decompress_lat, ceph_clock_now() - start);
  return r;
}
// this stores fiemap into interval_set, other variations
// use it internally
int BlueStore::_fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  interval_set<uint64_t>& destset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  if (!c->exists)
    return -ENOENT;
  {
    RWLock::RLocker l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      return -ENOENT;
    }
    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
	     << " size 0x" << o->onode.size << std::dec << dendl;

    boost::intrusive::set<Extent>::iterator ep, eend;
    if (offset >= o->onode.size)
      goto out;

    if (offset + length > o->onode.size) {
      length = o->onode.size - offset;
    }

    o->extent_map.fault_range(db, offset, length);
    eend = o->extent_map.extent_map.end();
    ep = o->extent_map.seek_lextent(offset);
    while (length > 0) {
      dout(20) << __func__ << " offset " << offset << dendl;
      if (ep != eend && ep->logical_offset + ep->length <= offset) {
	++ep;
	continue;
      }

      uint64_t x_len = length;
      if (ep != eend && ep->logical_offset <= offset) {
	uint64_t x_off = offset - ep->logical_offset;
	x_len = MIN(x_len, ep->length - x_off);
	dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
		 << x_len << std::dec << " blob " << ep->blob << dendl;
	destset.insert(offset, x_len);
	length -= x_len;
	offset += x_len;
	if (x_off + x_len == ep->length)
	  ++ep;
	continue;
      }
      if (ep != eend &&
	  ep->logical_offset > offset &&
	  ep->logical_offset - offset < x_len) {
	x_len = ep->logical_offset - offset;
      }
      offset += x_len;
      length -= x_len;
    }
  }

 out:
  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
	   << " size = 0x(" << destset << ")" << std::dec << dendl;
  return 0;
}
int BlueStore::fiemap(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t len,
  bufferlist& bl)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return fiemap(c, oid, offset, len, bl);
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  bufferlist& bl)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    ::encode(m, bl);
  }
  return r;
}

int BlueStore::fiemap(
  const coll_t& cid,
  const ghobject_t& oid,
  uint64_t offset,
  size_t len,
  map<uint64_t, uint64_t>& destmap)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return fiemap(c, oid, offset, len, destmap);
}

int BlueStore::fiemap(
  CollectionHandle &c_,
  const ghobject_t& oid,
  uint64_t offset,
  size_t length,
  map<uint64_t, uint64_t>& destmap)
{
  interval_set<uint64_t> m;
  int r = _fiemap(c_, oid, offset, length, m);
  if (r >= 0) {
    m.move_into(destmap);
  }
  return r;
}
int BlueStore::getattr(
  const coll_t& cid,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return getattr(c, oid, name, value);
}

int BlueStore::getattr(
  CollectionHandle &c_,
  const ghobject_t& oid,
  const char *name,
  bufferptr& value)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    RWLock::RLocker l(c->lock);
    mempool::bluestore_meta_other::string k(name);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }

    if (!o->onode.attrs.count(k)) {
      r = -ENODATA;
      goto out;
    }
    value = o->onode.attrs[k];
    r = 0;
  }
 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
	   << " = " << r << dendl;
  return r;
}
int BlueStore::getattrs(
  const coll_t& cid,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return getattrs(c, oid, aset);
}

int BlueStore::getattrs(
  CollectionHandle &c_,
  const ghobject_t& oid,
  map<string,bufferptr>& aset)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
  if (!c->exists)
    return -ENOENT;

  int r;
  {
    RWLock::RLocker l(c->lock);

    OnodeRef o = c->get_onode(oid, false);
    if (!o || !o->exists) {
      r = -ENOENT;
      goto out;
    }
    for (auto& i : o->onode.attrs) {
      aset.emplace(i.first.c_str(), i.second);
    }
    r = 0;
  }

 out:
  if (r == 0 && _debug_mdata_eio(oid)) {
    r = -EIO;
    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
  }
  dout(10) << __func__ << " " << c->cid << " " << oid
	   << " = " << r << dendl;
  return r;
}
int BlueStore::list_collections(vector<coll_t>& ls)
{
  RWLock::RLocker l(coll_lock);
  for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
       p != coll_map.end();
       ++p)
    ls.push_back(p->first);
  return 0;
}

bool BlueStore::collection_exists(const coll_t& c)
{
  RWLock::RLocker l(coll_lock);
  return coll_map.count(c);
}

int BlueStore::collection_empty(const coll_t& cid, bool *empty)
{
  dout(15) << __func__ << " " << cid << dendl;
  vector<ghobject_t> ls;
  ghobject_t next;
  int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), 1,
			  &ls, &next);
  if (r < 0) {
    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
	 << dendl;
    return r;
  }
  *empty = ls.empty();
  dout(10) << __func__ << " " << cid << " = " << (int)(*empty) << dendl;
  return 0;
}

int BlueStore::collection_bits(const coll_t& cid)
{
  dout(15) << __func__ << " " << cid << dendl;
  CollectionRef c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  dout(10) << __func__ << " " << cid << " = " << c->cnode.bits << dendl;
  return c->cnode.bits;
}
int BlueStore::collection_list(
  const coll_t& cid, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return collection_list(c, start, end, max, ls, pnext);
}

int BlueStore::collection_list(
  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->cid
	   << " start " << start << " end " << end << " max " << max << dendl;
  int r;
  {
    RWLock::RLocker l(c->lock);
    r = _collection_list(c, start, end, max, ls, pnext);
  }

  dout(10) << __func__ << " " << c->cid
	   << " start " << start << " end " << end << " max " << max
	   << " = " << r << ", ls.size() = " << ls->size()
	   << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
  return r;
}
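// _collection_list walks the PREFIX_OBJ keyspace between the bounds
// computed by get_coll_key_range().  Temp objects sort into a separate
// key range, so a scan may start in the temp namespace and then switch
// to the non-temp range once the iterator runs past the temp bound.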
int BlueStore::_collection_list(
  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
  vector<ghobject_t> *ls, ghobject_t *pnext)
{
  if (!c->exists)
    return -ENOENT;

  int r = 0;
  ghobject_t static_next;
  KeyValueDB::Iterator it;
  string temp_start_key, temp_end_key;
  string start_key, end_key;
  bool set_next = false;
  string pend;
  bool temp;

  if (!pnext)
    pnext = &static_next;

  if (start == ghobject_t::get_max() ||
      start.hobj.is_max()) {
    goto out;
  }
  get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
		     &start_key, &end_key);
  dout(20) << __func__
	   << " range " << pretty_binary_string(temp_start_key)
	   << " to " << pretty_binary_string(temp_end_key)
	   << " and " << pretty_binary_string(start_key)
	   << " to " << pretty_binary_string(end_key)
	   << " start " << start << dendl;
  it = db->get_iterator(PREFIX_OBJ);
  if (start == ghobject_t() ||
      start.hobj == hobject_t() ||
      start == c->cid.get_min_hobj()) {
    it->upper_bound(temp_start_key);
    temp = true;
  } else {
    string k;
    get_object_key(cct, start, &k);
    if (start.hobj.is_temp()) {
      temp = true;
      assert(k >= temp_start_key && k < temp_end_key);
    } else {
      temp = false;
      assert(k >= start_key && k < end_key);
    }
    dout(20) << " start from " << pretty_binary_string(k)
	     << " temp=" << (int)temp << dendl;
    it->lower_bound(k);
  }
  if (end.hobj.is_max()) {
    pend = temp ? temp_end_key : end_key;
  } else {
    get_object_key(cct, end, &end_key);
    if (end.hobj.is_temp()) {
      if (temp)
	pend = end_key;
      else
	goto out;
    } else {
      pend = temp ? temp_end_key : end_key;
    }
  }
  dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
  while (true) {
    if (!it->valid() || it->key() >= pend) {
      if (!it->valid())
	dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
      else
	dout(20) << __func__ << " key " << pretty_binary_string(it->key())
		 << " >= " << end << dendl;
      if (temp) {
	if (end.hobj.is_temp()) {
	  break;
	}
	dout(30) << __func__ << " switch to non-temp namespace" << dendl;
	temp = false;
	it->upper_bound(start_key);
	pend = end_key;
	dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
	continue;
      }
      break;
    }
    dout(30) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
    if (is_extent_shard_key(it->key())) {
      it->next();
      continue;
    }
    ghobject_t oid;
    int r = get_key_object(it->key(), &oid);
    assert(r == 0);
    dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
    if (ls->size() >= (unsigned)max) {
      dout(20) << __func__ << " reached max " << max << dendl;
      *pnext = oid;
      set_next = true;
      break;
    }
    ls->push_back(oid);
    it->next();
  }
 out:
  if (!set_next) {
    *pnext = ghobject_t::get_max();
  }
  return r;
}
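// Omap iteration is bounded by per-object head and tail keys derived
// from the onode's nid (see get_omap_key/get_omap_tail); valid() stops
// the walk once the raw key reaches the tail sentinel.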
BlueStore::OmapIteratorImpl::OmapIteratorImpl(
  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
  : c(c), o(o), it(it)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    get_omap_key(o->onode.nid, string(), &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
  }
}

int BlueStore::OmapIteratorImpl::seek_to_first()
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    it->lower_bound(head);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}

int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    string key;
    get_omap_key(o->onode.nid, after, &key);
    it->upper_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}

int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    string key;
    get_omap_key(o->onode.nid, to, &key);
    it->lower_bound(key);
  } else {
    it = KeyValueDB::Iterator();
  }
  return 0;
}

bool BlueStore::OmapIteratorImpl::valid()
{
  RWLock::RLocker l(c->lock);
  return o->onode.has_omap() && it && it->valid() &&
    it->raw_key().second <= tail;
}

int BlueStore::OmapIteratorImpl::next(bool validate)
{
  RWLock::RLocker l(c->lock);
  if (o->onode.has_omap()) {
    it->next();
    return 0;
  } else {
    return -1;
  }
}

string BlueStore::OmapIteratorImpl::key()
{
  RWLock::RLocker l(c->lock);
  assert(it->valid());
  string db_key = it->raw_key().second;
  string user_key;
  decode_omap_key(db_key, &user_key);
  return user_key;
}

bufferlist BlueStore::OmapIteratorImpl::value()
{
  RWLock::RLocker l(c->lock);
  assert(it->valid());
  return it->value();
}
int BlueStore::omap_get(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  bufferlist *header,               ///< [out] omap header
  map<string, bufferlist> *out      ///< [out] Key to value map
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get(c, oid, header, out);
}
int BlueStore::omap_get(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  bufferlist *header,               ///< [out] omap header
  map<string, bufferlist> *out      ///< [out] Key to value map
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_header(o->onode.nid, &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() == head) {
        dout(30) << __func__ << "  got header" << dendl;
        *header = it->value();
      } else if (it->key() >= tail) {
        dout(30) << __func__ << "  reached tail" << dendl;
        break;
      } else {
        string user_key;
        decode_omap_key(it->key(), &user_key);
        dout(30) << __func__ << "  got " << pretty_binary_string(it->key())
                 << " -> " << user_key << dendl;
        (*out)[user_key] = it->value();
      }
      it->next();
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_header(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  bufferlist *header,               ///< [out] omap header
  bool allow_eio                    ///< [in] don't assert on eio
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_header(c, oid, header, allow_eio);
}
int BlueStore::omap_get_header(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  bufferlist *header,               ///< [out] omap header
  bool allow_eio                    ///< [in] don't assert on eio
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    string head;
    get_omap_header(o->onode.nid, &head);
    if (db->get(PREFIX_OMAP, head, header) >= 0) {
      dout(30) << __func__ << "  got header" << dendl;
    } else {
      dout(30) << __func__ << "  no header" << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_keys(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  set<string> *keys                 ///< [out] Keys defined on oid
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_keys(c, oid, keys);
}
int BlueStore::omap_get_keys(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  set<string> *keys                 ///< [out] Keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  {
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_key(o->onode.nid, string(), &head);
    get_omap_tail(o->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << "  reached tail" << dendl;
        break;
      }
      string user_key;
      decode_omap_key(it->key(), &user_key);
      dout(30) << __func__ << "  got " << pretty_binary_string(it->key())
               << " -> " << user_key << dendl;
      keys->insert(user_key);
      it->next();
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_get_values(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  const set<string> &keys,          ///< [in] Keys to get
  map<string, bufferlist> *out      ///< [out] Returned keys and values
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_get_values(c, oid, keys, out);
}
int BlueStore::omap_get_values(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  const set<string> &keys,          ///< [in] Keys to get
  map<string, bufferlist> *out      ///< [out] Returned keys and values
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
    final_key.resize(9); // keep prefix
    final_key += *p;
    bufferlist val;
    if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
      dout(30) << __func__ << "  got " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
      out->insert(make_pair(*p, val));
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
int BlueStore::omap_check_keys(
  const coll_t& cid,                ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  const set<string> &keys,          ///< [in] Keys to check
  set<string> *out                  ///< [out] Subset of keys defined on oid
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c)
    return -ENOENT;
  return omap_check_keys(c, oid, keys, out);
}
int BlueStore::omap_check_keys(
  CollectionHandle &c_,             ///< [in] Collection containing oid
  const ghobject_t &oid,            ///< [in] Object containing omap
  const set<string> &keys,          ///< [in] Keys to check
  set<string> *out                  ///< [out] Subset of keys defined on oid
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
  if (!c->exists)
    return -ENOENT;
  RWLock::RLocker l(c->lock);
  int r = 0;
  string final_key;
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    r = -ENOENT;
    goto out;
  }
  if (!o->onode.has_omap())
    goto out;
  o->flush();
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
    final_key.resize(9); // keep prefix
    final_key += *p;
    bufferlist val;
    if (db->get(PREFIX_OMAP, final_key, &val) >= 0) {
      dout(30) << __func__ << "  have " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
      out->insert(*p);
    } else {
      dout(30) << __func__ << "  miss " << pretty_binary_string(final_key)
               << " -> " << *p << dendl;
    }
  }
 out:
  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
           << dendl;
  return r;
}
ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  const coll_t& cid,                ///< [in] collection
  const ghobject_t &oid             ///< [in] object
  )
{
  CollectionHandle c = _get_collection(cid);
  if (!c) {
    dout(10) << __func__ << " " << cid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  return get_omap_iterator(c, oid);
}
ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
  CollectionHandle &c_,             ///< [in] collection
  const ghobject_t &oid             ///< [in] object
  )
{
  Collection *c = static_cast<Collection *>(c_.get());
  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
  if (!c->exists) {
    return ObjectMap::ObjectMapIterator();
  }
  RWLock::RLocker l(c->lock);
  OnodeRef o = c->get_onode(oid, false);
  if (!o || !o->exists) {
    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
    return ObjectMap::ObjectMapIterator();
  }
  o->flush();
  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}
// -----------------

void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
{
  dout(10) << __func__ << " ondisk_format " << ondisk_format
           << " min_compat_ondisk_format " << min_compat_ondisk_format
           << dendl;
  assert(ondisk_format == latest_ondisk_format);
  {
    bufferlist bl;
    ::encode(ondisk_format, bl);
    t->set(PREFIX_SUPER, "ondisk_format", bl);
  }
  {
    bufferlist bl;
    ::encode(min_compat_ondisk_format, bl);
    t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
  }
}
int BlueStore::_open_super_meta()
{
  // nid
  {
    nid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "nid_max", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      uint64_t v;
      ::decode(v, p);
      nid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read nid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old nid_max " << nid_max << dendl;
    nid_last = nid_max.load();
  }

  // blobid
  {
    blobid_max = 0;
    bufferlist bl;
    db->get(PREFIX_SUPER, "blobid_max", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      uint64_t v;
      ::decode(v, p);
      blobid_max = v;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read blobid_max" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " old blobid_max " << blobid_max << dendl;
    blobid_last = blobid_max.load();
  }

  // freelist
  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "freelist_type", &bl);
    if (bl.length()) {
      freelist_type = std::string(bl.c_str(), bl.length());
      dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
    } else {
      assert("Not Support extent freelist manager" == 0);
    }
  }

  // bluefs alloc
  if (cct->_conf->bluestore_bluefs) {
    bluefs_extents.clear();
    bufferlist bl;
    db->get(PREFIX_SUPER, "bluefs_extents", &bl);
    bufferlist::iterator p = bl.begin();
    try {
      ::decode(bluefs_extents, p);
    }
    catch (buffer::error& e) {
      derr << __func__ << " unable to read bluefs_extents" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
             << std::dec << dendl;
  }

  // ondisk format
  int32_t compat_ondisk_format = 0;
  {
    bufferlist bl;
    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
    if (r < 0) {
      // base case: kraken bluestore is v1 and readable by v1
      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
               << dendl;
      ondisk_format = 1;
      compat_ondisk_format = 1;
    } else {
      auto p = bl.begin();
      try {
        ::decode(ondisk_format, p);
      } catch (buffer::error& e) {
        derr << __func__ << " unable to read ondisk_format" << dendl;
        return -EIO;
      }
      bl.clear();
      {
        r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
        auto p = bl.begin();
        try {
          ::decode(compat_ondisk_format, p);
        } catch (buffer::error& e) {
          derr << __func__ << " unable to read compat_ondisk_format" << dendl;
          return -EIO;
        }
      }
    }
    dout(10) << __func__ << " ondisk_format " << ondisk_format
             << " compat_ondisk_format " << compat_ondisk_format
             << dendl;
    if (latest_ondisk_format < compat_ondisk_format) {
      derr << __func__ << " compat_ondisk_format is "
           << compat_ondisk_format << " but we only understand version "
           << latest_ondisk_format << dendl;
      return -EPERM;
    }
    if (ondisk_format < latest_ondisk_format) {
      int r = _upgrade_super();
      if (r < 0) {
        return r;
      }
    }
  }

  {
    bufferlist bl;
    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
    auto p = bl.begin();
    try {
      uint64_t val;
      ::decode(val, p);
      min_alloc_size = val;
    } catch (buffer::error& e) {
      derr << __func__ << " unable to read min_alloc_size" << dendl;
      return -EIO;
    }
    dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
             << std::dec << dendl;
  }
  _set_throttle_params();
  return 0;
}
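// Illustrative note (added comment, not in the original source): the
// version checks above gate compatibility in both directions.  A store
// carrying ondisk_format=2 / min_compat_ondisk_format=1 opens fine as
// long as latest_ondisk_format >= 1; a store requiring a newer format
// than we understand is refused (-EPERM) rather than risk
// misinterpreting keys; and an older store is upgraded in place via
// _upgrade_super().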
int BlueStore::_upgrade_super()
{
  dout(1) << __func__ << " from " << ondisk_format << ", latest "
          << latest_ondisk_format << dendl;
  assert(ondisk_format > 0);
  assert(ondisk_format < latest_ondisk_format);

  if (ondisk_format == 1) {
    // changes:
    // - super: added ondisk_format
    // - super: added min_readable_ondisk_format
    // - super: added min_compat_ondisk_format
    // - super: added min_alloc_size
    // - super: removed min_min_alloc_size
    KeyValueDB::Transaction t = db->get_transaction();
    {
      bufferlist bl;
      db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
      auto p = bl.begin();
      try {
        uint64_t val;
        ::decode(val, p);
        min_alloc_size = val;
      } catch (buffer::error& e) {
        derr << __func__ << " failed to read min_min_alloc_size" << dendl;
        return -EIO;
      }
      t->set(PREFIX_SUPER, "min_alloc_size", bl);
      t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
    }
    ondisk_format = 2;
    _prepare_ondisk_format_super(t);
    int r = db->submit_transaction_sync(t);
    assert(r == 0);
  }

  // done
  dout(1) << __func__ << " done" << dendl;
  return 0;
}
void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
{
  if (o->onode.nid)
    return;
  uint64_t nid = ++nid_last;
  dout(20) << __func__ << " " << nid << dendl;
  o->onode.nid = nid;
  txc->last_nid = nid;
}
uint64_t BlueStore::_assign_blobid(TransContext *txc)
{
  uint64_t bid = ++blobid_last;
  dout(20) << __func__ << " " << bid << dendl;
  txc->last_blobid = bid;
  return bid;
}
void BlueStore::get_db_statistics(Formatter *f)
{
  db->get_statistics(f);
}
BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
{
  TransContext *txc = new TransContext(cct, osr);
  txc->t = db->get_transaction();
  osr->queue_new(txc);
  dout(20) << __func__ << " osr " << osr << " = " << txc
           << " seq " << txc->seq << dendl;
  return txc;
}
void BlueStore::_txc_calc_cost(TransContext *txc)
{
  // this is about the simplest model for transaction cost you can
  // imagine.  there is some fixed overhead cost by saying there is a
  // minimum of one "io".  and then we have some cost per "io" that is
  // a configurable (with different hdd and ssd defaults), and add
  // that to the bytes value.
  int ios = 1;  // one "io" for the kv commit
  for (auto& p : txc->ioc.pending_aios) {
    ios += p.iov.size();
  }
  auto cost = throttle_cost_per_io.load();
  txc->cost = ios * cost + txc->bytes;
  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
           << ios << " ios * " << cost << " + " << txc->bytes
           << " bytes)" << dendl;
}
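// Worked example (added comment with illustrative numbers, not from the
// original source): assuming throttle_cost_per_io loads as 670000 (the
// hdd-class default for bluestore_throttle_cost_per_io), a transaction
// with two pending aios of one iovec each and 4096 payload bytes costs
//   (1 + 2) * 670000 + 4096 = 2014096
// units -- the figure later charged against throttle_bytes and
// throttle_deferred_bytes.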
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
  if (txc->statfs_delta.is_empty())
    return;

  logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
  logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
  logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
  logger->inc(l_bluestore_compressed_allocated,
              txc->statfs_delta.compressed_allocated());
  logger->inc(l_bluestore_compressed_original,
              txc->statfs_delta.compressed_original());

  bufferlist bl;
  txc->statfs_delta.encode(bl);

  txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
  txc->statfs_delta.reset();
}
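// Illustrative overview (added comment, not in the original source): a
// typical transaction walks the states handled below as
//   PREPARE -> AIO_WAIT -> IO_DONE -> KV_QUEUED -> KV_SUBMITTED
//     -> KV_DONE -> [DEFERRED_QUEUED -> DEFERRED_CLEANUP ->] FINISHING -> DONE
// where the bracketed deferred leg is taken only when txc->deferred_txn
// is set.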
void BlueStore::_txc_state_proc(TransContext *txc)
{
  while (true) {
    dout(10) << __func__ << " txc " << txc
             << " " << txc->get_state_name() << dendl;
    switch (txc->state) {
    case TransContext::STATE_PREPARE:
      txc->log_state_latency(logger, l_bluestore_state_prepare_lat);
      if (txc->ioc.has_pending_aios()) {
        txc->state = TransContext::STATE_AIO_WAIT;
        txc->had_ios = true;
        _txc_aio_submit(txc);
        return;
      }
      // ** fall-thru **

    case TransContext::STATE_AIO_WAIT:
      txc->log_state_latency(logger, l_bluestore_state_aio_wait_lat);
      _txc_finish_io(txc);  // may trigger blocked txc's too
      return;

    case TransContext::STATE_IO_DONE:
      //assert(txc->osr->qlock.is_locked());  // see _txc_finish_io
      if (txc->had_ios) {
        ++txc->osr->txc_with_unstable_io;
      }
      txc->log_state_latency(logger, l_bluestore_state_io_done_lat);
      txc->state = TransContext::STATE_KV_QUEUED;
      if (cct->_conf->bluestore_sync_submit_transaction) {
        if (txc->last_nid >= nid_max ||
            txc->last_blobid >= blobid_max) {
          dout(20) << __func__
                   << " last_{nid,blobid} exceeds max, submit via kv thread"
                   << dendl;
        } else if (txc->osr->kv_committing_serially) {
          dout(20) << __func__ << " prior txc submitted via kv thread, us too"
                   << dendl;
          // note: this is starvation-prone.  once we have a txc in a busy
          // sequencer that is committing serially it is possible to keep
          // submitting new transactions fast enough that we get stuck doing
          // so.  the alternative is to block here... fixme?
        } else if (txc->osr->txc_with_unstable_io) {
          dout(20) << __func__ << " prior txc(s) with unstable ios "
                   << txc->osr->txc_with_unstable_io.load() << dendl;
        } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
                   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
                   == 0) {
          dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
                   << dendl;
        } else {
          txc->state = TransContext::STATE_KV_SUBMITTED;
          int r = db->submit_transaction(txc->t);
          assert(r == 0);
          _txc_applied_kv(txc);
        }
      }
      {
        std::lock_guard<std::mutex> l(kv_lock);
        kv_queue.push_back(txc);
        kv_cond.notify_one();
        if (txc->state != TransContext::STATE_KV_SUBMITTED) {
          kv_queue_unsubmitted.push_back(txc);
          ++txc->osr->kv_committing_serially;
        }
      }
      return;
    case TransContext::STATE_KV_SUBMITTED:
      txc->log_state_latency(logger, l_bluestore_state_kv_committing_lat);
      txc->state = TransContext::STATE_KV_DONE;
      _txc_committed_kv(txc);
      // ** fall-thru **

    case TransContext::STATE_KV_DONE:
      txc->log_state_latency(logger, l_bluestore_state_kv_done_lat);
      if (txc->deferred_txn) {
        txc->state = TransContext::STATE_DEFERRED_QUEUED;
        _deferred_queue(txc);
        return;
      }
      txc->state = TransContext::STATE_FINISHING;
      break;

    case TransContext::STATE_DEFERRED_CLEANUP:
      txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
      txc->state = TransContext::STATE_FINISHING;
      // ** fall-thru **

    case TransContext::STATE_FINISHING:
      txc->log_state_latency(logger, l_bluestore_state_finishing_lat);
      _txc_finish(txc);
      return;

    default:
      derr << __func__ << " unexpected txc " << txc
           << " state " << txc->get_state_name() << dendl;
      assert(0 == "unexpected txc state");
      return;
    }
  }
}
void BlueStore::_txc_finish_io(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << dendl;

  /*
   * we need to preserve the order of kv transactions,
   * even though aio will complete in any order.
   */

  OpSequencer *osr = txc->osr.get();
  std::lock_guard<std::mutex> l(osr->qlock);
  txc->state = TransContext::STATE_IO_DONE;

  OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
  while (p != osr->q.begin()) {
    --p;
    if (p->state < TransContext::STATE_IO_DONE) {
      dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
               << p->get_state_name() << dendl;
      return;
    }
    if (p->state > TransContext::STATE_IO_DONE) {
      ++p;
      break;
    }
  }
  do {
    _txc_state_proc(&*p++);
  } while (p != osr->q.end() &&
           p->state == TransContext::STATE_IO_DONE);

  if (osr->kv_submitted_waiters &&
      osr->_is_all_kv_submitted()) {
    osr->qcond.notify_all();
  }
}
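// Illustrative example (added comment, not in the original source): if
// txcs enter the sequencer as A, B, C but their aios retire as C, B, A,
// the backward scan above parks C and B in STATE_IO_DONE; only when A's
// io completes does the do/while sweep forward and run _txc_state_proc
// for A, B, C in submission order, which is what keeps kv commits
// ordered per sequencer.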
void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc
           << " onodes " << txc->onodes
           << " shared_blobs " << txc->shared_blobs
           << dendl;

  // finalize onodes
  for (auto o : txc->onodes) {
    // finalize extent_map shards
    o->extent_map.update(t, false);
    if (o->extent_map.needs_reshard()) {
      o->extent_map.reshard(db, t);
      o->extent_map.update(t, true);
      if (o->extent_map.needs_reshard()) {
        dout(20) << __func__ << " warning: still wants reshard, check options?"
                 << dendl;
        o->extent_map.clear_needs_reshard();
      }
      logger->inc(l_bluestore_onode_reshard);
    }

    // bound encode
    size_t bound = 0;
    denc(o->onode, bound);
    o->extent_map.bound_encode_spanning_blobs(bound);
    if (o->onode.extent_map_shards.empty()) {
      denc(o->extent_map.inline_bl, bound);
    }

    // encode
    bufferlist bl;
    unsigned onode_part, blob_part, extent_part;
    {
      auto p = bl.get_contiguous_appender(bound, true);
      denc(o->onode, p);
      onode_part = p.get_logical_offset();
      o->extent_map.encode_spanning_blobs(p);
      blob_part = p.get_logical_offset() - onode_part;
      if (o->onode.extent_map_shards.empty()) {
        denc(o->extent_map.inline_bl, p);
      }
      extent_part = p.get_logical_offset() - onode_part - blob_part;
    }

    dout(20) << "  onode " << o->oid << " is " << bl.length()
             << " (" << onode_part << " bytes onode + "
             << blob_part << " bytes spanning blobs + "
             << extent_part << " bytes inline extents)"
             << dendl;
    t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
    o->flushing_count++;
  }

  // objects we modified but didn't affect the onode
  auto p = txc->modified_objects.begin();
  while (p != txc->modified_objects.end()) {
    if (txc->onodes.count(*p) == 0) {
      (*p)->flushing_count++;
      ++p;
    } else {
      // remove dups with onodes list to avoid problems in _txc_finish
      p = txc->modified_objects.erase(p);
    }
  }

  // finalize shared_blobs
  for (auto sb : txc->shared_blobs) {
    string key;
    auto sbid = sb->get_sbid();
    get_shared_blob_key(sbid, &key);
    if (sb->persistent->empty()) {
      dout(20) << "  shared_blob 0x" << std::hex << sbid << std::dec
               << " is empty" << dendl;
      t->rmkey(PREFIX_SHARED_BLOB, key);
    } else {
      bufferlist bl;
      ::encode(*(sb->persistent), bl);
      dout(20) << "  shared_blob 0x" << std::hex << sbid << std::dec
               << " is " << bl.length() << dendl;
      t->set(PREFIX_SHARED_BLOB, key, bl);
    }
  }
}
void BlueStore::BSPerfTracker::update_from_perfcounters(
  PerfCounters &logger)
{
  os_commit_latency.consume_next(
    logger.get_tavg_ms(
      l_bluestore_commit_lat));
  os_apply_latency.consume_next(
    logger.get_tavg_ms(
      l_bluestore_commit_lat));
}
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
  dout(20) << __func__ << " txc " << txc << std::hex
           << " allocated 0x" << txc->allocated
           << " released 0x" << txc->released
           << std::dec << dendl;

  // We have to handle the case where we allocate *and* deallocate the
  // same region in this transaction.  The freelist doesn't like that.
  // (Actually, the only thing that cares is the BitmapFreelistManager
  // debug check.  But that's important.)
  interval_set<uint64_t> tmp_allocated, tmp_released;
  interval_set<uint64_t> *pallocated = &txc->allocated;
  interval_set<uint64_t> *preleased = &txc->released;
  if (!txc->allocated.empty() && !txc->released.empty()) {
    interval_set<uint64_t> overlap;
    overlap.intersection_of(txc->allocated, txc->released);
    if (!overlap.empty()) {
      tmp_allocated = txc->allocated;
      tmp_allocated.subtract(overlap);
      tmp_released = txc->released;
      tmp_released.subtract(overlap);
      dout(20) << __func__ << " overlap 0x" << std::hex << overlap
               << ", new allocated 0x" << tmp_allocated
               << " released 0x" << tmp_released << std::dec
               << dendl;
      pallocated = &tmp_allocated;
      preleased = &tmp_released;
    }
  }

  // update freelist with non-overlap sets
  for (interval_set<uint64_t>::iterator p = pallocated->begin();
       p != pallocated->end();
       ++p) {
    fm->allocate(p.get_start(), p.get_len(), t);
  }
  for (interval_set<uint64_t>::iterator p = preleased->begin();
       p != preleased->end();
       ++p) {
    dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
             << "~" << p.get_len() << std::dec << dendl;
    fm->release(p.get_start(), p.get_len(), t);
  }

  _txc_update_store_statfs(txc);
}
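// Illustrative example (added comment, not in the original source): if a
// txc allocated [0x10000,0x20000) and also released [0x18000,0x28000),
// the overlap [0x18000,0x20000) is subtracted from both sides, so the
// freelist sees only allocate [0x10000,0x18000) and release
// [0x20000,0x28000).  The net on-disk state is identical, but the
// BitmapFreelistManager's double-set/double-clear debug check stays
// satisfied.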
void BlueStore::_txc_applied_kv(TransContext *txc)
{
  for (auto ls : { &txc->onodes, &txc->modified_objects }) {
    for (auto& o : *ls) {
      dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
               << dendl;
      if (--o->flushing_count == 0) {
        std::lock_guard<std::mutex> l(o->flush_lock);
        o->flush_cond.notify_all();
      }
    }
  }
}
void BlueStore::_txc_committed_kv(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << dendl;

  // warning: we're calling onreadable_sync inside the sequencer lock
  if (txc->onreadable_sync) {
    txc->onreadable_sync->complete(0);
    txc->onreadable_sync = NULL;
  }
  unsigned n = txc->osr->parent->shard_hint.hash_to_shard(m_finisher_num);
  if (txc->oncommit) {
    logger->tinc(l_bluestore_commit_lat, ceph_clock_now() - txc->start);
    finishers[n]->queue(txc->oncommit);
    txc->oncommit = NULL;
  }
  if (txc->onreadable) {
    finishers[n]->queue(txc->onreadable);
    txc->onreadable = NULL;
  }

  if (!txc->oncommits.empty()) {
    finishers[n]->queue(txc->oncommits);
  }
}
void BlueStore::_txc_finish(TransContext *txc)
{
  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
  assert(txc->state == TransContext::STATE_FINISHING);

  for (auto& sb : txc->shared_blobs_written) {
    sb->bc.finish_write(sb->get_cache(), txc->seq);
  }
  txc->shared_blobs_written.clear();

  while (!txc->removed_collections.empty()) {
    _queue_reap_collection(txc->removed_collections.front());
    txc->removed_collections.pop_front();
  }

  OpSequencerRef osr = txc->osr;
  CollectionRef c;
  bool empty = false;
  OpSequencer::q_list_t releasing_txc;
  {
    std::lock_guard<std::mutex> l(osr->qlock);
    txc->state = TransContext::STATE_DONE;
    bool notify = false;
    while (!osr->q.empty()) {
      TransContext *txc = &osr->q.front();
      dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
               << dendl;
      if (txc->state != TransContext::STATE_DONE) {
        if (txc->state == TransContext::STATE_PREPARE &&
            deferred_aggressive) {
          // for _osr_drain_preceding()
          notify = true;
        }
        break;
      }

      if (!c && txc->first_collection) {
        c = txc->first_collection;
      }
      osr->q.pop_front();
      releasing_txc.push_back(*txc);
      notify = true;
    }
    if (notify) {
      osr->qcond.notify_all();
    }
    if (osr->q.empty()) {
      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
      empty = true;
    }
  }
  while (!releasing_txc.empty()) {
    // release to allocator only after all preceding txc's have also
    // finished any deferred writes that potentially land in these
    // blocks
    auto txc = &releasing_txc.front();
    _txc_release_alloc(txc);
    releasing_txc.pop_front();
    txc->log_state_latency(logger, l_bluestore_state_done_lat);
    delete txc;
  }

  if (c) {
    c->trim_cache();
  }

  if (empty && osr->zombie) {
    dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
    osr->_unregister();
  }
}
void BlueStore::_txc_release_alloc(TransContext *txc)
{
  // update allocator with full released set
  if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
    dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
    for (interval_set<uint64_t>::iterator p = txc->released.begin();
         p != txc->released.end();
         ++p) {
      alloc->release(p.get_start(), p.get_len());
    }
  }

  txc->allocated.clear();
  txc->released.clear();
}
void BlueStore::_osr_drain_preceding(TransContext *txc)
{
  OpSequencer *osr = txc->osr.get();
  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
  {
    // submit anything pending
    std::lock_guard<std::mutex> l(deferred_lock);
    if (osr->deferred_pending) {
      _deferred_submit(osr);
    }
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
  osr->drain_preceding(txc);
  --deferred_aggressive;
  dout(10) << __func__ << " " << osr << " done" << dendl;
}
void BlueStore::_osr_drain_all()
{
  dout(10) << __func__ << dendl;

  set<OpSequencerRef> s;
  {
    std::lock_guard<std::mutex> l(osr_lock);
    s = osr_set;
  }
  dout(20) << __func__ << " osr_set " << s << dendl;

  ++deferred_aggressive;
  {
    // submit anything pending
    std::lock_guard<std::mutex> l(deferred_lock);
    _deferred_try_submit();
  }
  {
    // wake up any previously finished deferred events
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
  for (auto osr : s) {
    dout(20) << __func__ << " drain " << osr << dendl;
    osr->drain();
  }
  --deferred_aggressive;

  dout(10) << __func__ << " done" << dendl;
}
void BlueStore::_osr_unregister_all()
{
  set<OpSequencerRef> s;
  {
    std::lock_guard<std::mutex> l(osr_lock);
    s = osr_set;
  }
  dout(10) << __func__ << " " << s << dendl;
  for (auto osr : s) {
    osr->_unregister();

    if (!osr->zombie) {
      // break link from Sequencer to us so that this OpSequencer
      // instance can die with this mount/umount cycle.  note that
      // we assume umount() will not race against ~Sequencer.
      assert(osr->parent);
      osr->parent->p.reset();
    }
  }
  // nobody should be creating sequencers during umount either.
  {
    std::lock_guard<std::mutex> l(osr_lock);
    assert(osr_set.empty());
  }
}
void BlueStore::_kv_sync_thread()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock<std::mutex> l(kv_lock);
  while (true) {
    assert(kv_committing.empty());
    if (kv_queue.empty() &&
        ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
         !deferred_aggressive)) {
      if (kv_stop)
        break;
      dout(20) << __func__ << " sleep" << dendl;
      kv_cond.wait(l);
      dout(20) << __func__ << " wake" << dendl;
    } else {
      deque<TransContext*> kv_submitting;
      deque<DeferredBatch*> deferred_done, deferred_stable;
      dout(20) << __func__ << " committing " << kv_queue.size()
               << " submitting " << kv_queue_unsubmitted.size()
               << " deferred done " << deferred_done_queue.size()
               << " stable " << deferred_stable_queue.size()
               << dendl;
      kv_committing.swap(kv_queue);
      kv_submitting.swap(kv_queue_unsubmitted);
      deferred_done.swap(deferred_done_queue);
      deferred_stable.swap(deferred_stable_queue);
      utime_t start = ceph_clock_now();
      l.unlock();

      dout(30) << __func__ << " committing " << kv_committing << dendl;
      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
      int num_aios = 0;
      for (auto txc : kv_committing) {
        if (txc->had_ios) {
          ++num_aios;
        }
      }

      bool force_flush = false;
      // if bluefs is sharing the same device as data (only), then we
      // can rely on the bluefs commit to flush the device and make
      // deferred aios stable.  that means that if we do have done deferred
      // txcs AND we are not on a single device, we need to force a flush.
      if (bluefs_single_shared_device && bluefs) {
        if (num_aios) {
          force_flush = true;
        } else if (kv_committing.empty() && kv_submitting.empty() &&
                   deferred_stable.empty()) {
          force_flush = true;  // there's nothing else to commit!
        } else if (deferred_aggressive) {
          force_flush = true;
        }
      } else
        force_flush = true;

      if (force_flush) {
        dout(20) << __func__ << " num_aios=" << num_aios
                 << " force_flush=" << (int)force_flush
                 << ", flushing, deferred done->stable" << dendl;
        // flush/barrier on block device
        bdev->flush();

        // if we flush then deferred done are now deferred stable
        deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
                               deferred_done.end());
        deferred_done.clear();
      }
      utime_t after_flush = ceph_clock_now();
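      // Illustrative summary (added comment, not in the original source)
      // of the force_flush decision above when bluefs shares the single
      // data device:
      //   aios in this batch      -> flush (make them stable)
      //   nothing else to commit  -> flush (no bluefs commit will do it)
      //   deferred_aggressive     -> flush (someone is actively waiting)
      //   otherwise               -> skip; the bluefs commit flushes the
      //                              device for us.
      // With separate devices we always flush.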
      // we will use one final transaction to force a sync
      KeyValueDB::Transaction synct = db->get_transaction();

      // increase {nid,blobid}_max?  note that this covers both the
      // case where we are approaching the max and the case we passed
      // it.  in either case, we increase the max in the earlier txn
      // we submit.
      uint64_t new_nid_max = 0, new_blobid_max = 0;
      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
        bufferlist bl;
        ::encode(new_nid_max, bl);
        t->set(PREFIX_SUPER, "nid_max", bl);
        dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
      }
      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
        KeyValueDB::Transaction t =
          kv_submitting.empty() ? synct : kv_submitting.front()->t;
        new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
        bufferlist bl;
        ::encode(new_blobid_max, bl);
        t->set(PREFIX_SUPER, "blobid_max", bl);
        dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
      }
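      // Worked example (added comment with illustrative numbers, not from
      // the original source): with bluestore_nid_prealloc = 1024 and
      // nid_max = 2048, once nid_last passes 1536 (= 2048 - 1024/2) the
      // earliest transaction in this batch persists
      // new_nid_max = nid_last + 1024, so consumers never overrun the
      // committed ceiling between kv syncs.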
      for (auto txc : kv_submitting) {
        assert(txc->state == TransContext::STATE_KV_QUEUED);
        txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
        int r = db->submit_transaction(txc->t);
        assert(r == 0);
        _txc_applied_kv(txc);
        --txc->osr->kv_committing_serially;
        txc->state = TransContext::STATE_KV_SUBMITTED;
        if (txc->osr->kv_submitted_waiters) {
          std::lock_guard<std::mutex> l(txc->osr->qlock);
          if (txc->osr->_is_all_kv_submitted()) {
            txc->osr->qcond.notify_all();
          }
        }
      }
      for (auto txc : kv_committing) {
        if (txc->had_ios) {
          --txc->osr->txc_with_unstable_io;
        }
        txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
        // release throttle *before* we commit.  this allows new ops
        // to be prepared and enter pipeline while we are waiting on
        // the kv commit sync/flush.  then hopefully on the next
        // iteration there will already be ops awake.  otherwise, we
        // end up going to sleep, and then wake up when the very first
        // transaction is ready for commit.
        throttle_bytes.put(txc->cost);
      }
      PExtentVector bluefs_gift_extents;
      if (bluefs &&
          after_flush - bluefs_last_balance >
          cct->_conf->bluestore_bluefs_balance_interval) {
        bluefs_last_balance = after_flush;
        int r = _balance_bluefs_freespace(&bluefs_gift_extents);
        assert(r >= 0);
        if (r > 0) {
          for (auto& p : bluefs_gift_extents) {
            bluefs_extents.insert(p.offset, p.length);
          }
          bufferlist bl;
          ::encode(bluefs_extents, bl);
          dout(10) << __func__ << " bluefs_extents now 0x" << std::hex
                   << bluefs_extents << std::dec << dendl;
          synct->set(PREFIX_SUPER, "bluefs_extents", bl);
        }
      }
      // cleanup sync deferred keys
      for (auto b : deferred_stable) {
        for (auto& txc : b->txcs) {
          bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
          if (!wt.released.empty()) {
            // kraken replay compat only
            txc.released = wt.released;
            dout(10) << __func__ << " deferred txn has released "
                     << txc.released
                     << " (we just upgraded from kraken) on " << &txc << dendl;
            _txc_finalize_kv(&txc, synct);
          }
          // cleanup the deferred
          string key;
          get_deferred_key(wt.seq, &key);
          synct->rm_single_key(PREFIX_DEFERRED, key);
        }
      }

      // submit synct synchronously (block and wait for it to commit)
      int r = db->submit_transaction_sync(synct);
      assert(r == 0);
      if (new_nid_max) {
        nid_max = new_nid_max;
        dout(10) << __func__ << " nid_max now " << nid_max << dendl;
      }
      if (new_blobid_max) {
        blobid_max = new_blobid_max;
        dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
      }

      utime_t finish = ceph_clock_now();
      utime_t dur_flush = after_flush - start;
      utime_t dur_kv = finish - after_flush;
      utime_t dur = finish - start;
      dout(20) << __func__ << " committed " << kv_committing.size()
               << " cleaned " << deferred_stable.size()
               << " in " << dur
               << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
               << dendl;
      logger->tinc(l_bluestore_kv_flush_lat, dur_flush);
      logger->tinc(l_bluestore_kv_commit_lat, dur_kv);
      logger->tinc(l_bluestore_kv_lat, dur);
      while (!kv_committing.empty()) {
        TransContext *txc = kv_committing.front();
        assert(txc->state == TransContext::STATE_KV_SUBMITTED);
        _txc_state_proc(txc);
        kv_committing.pop_front();
      }
      for (auto b : deferred_stable) {
        auto p = b->txcs.begin();
        while (p != b->txcs.end()) {
          TransContext *txc = &*p;
          p = b->txcs.erase(p); // unlink here because
          _txc_state_proc(txc); // this may destroy txc
        }
        delete b;
      }

      if (!deferred_aggressive) {
        std::lock_guard<std::mutex> l(deferred_lock);
        if (deferred_queue_size >= deferred_batch_ops ||
            throttle_deferred_bytes.past_midpoint()) {
          _deferred_try_submit();
        }
      }

      // this is as good a place as any ...
      _reap_collections();

      if (bluefs) {
        if (!bluefs_gift_extents.empty()) {
          _commit_bluefs_freespace(bluefs_gift_extents);
        }
        for (auto p = bluefs_extents_reclaiming.begin();
             p != bluefs_extents_reclaiming.end();
             ++p) {
          dout(20) << __func__ << " releasing old bluefs 0x" << std::hex
                   << p.get_start() << "~" << p.get_len() << std::dec
                   << dendl;
          alloc->release(p.get_start(), p.get_len());
        }
        bluefs_extents_reclaiming.clear();
      }

      l.lock();
      // previously deferred "done" are now "stable" by virtue of this
      // commit cycle.
      deferred_stable_queue.swap(deferred_done);
    }
  }
  dout(10) << __func__ << " finish" << dendl;
}
bluestore_deferred_op_t *BlueStore::_get_deferred_op(
  TransContext *txc, OnodeRef o)
{
  if (!txc->deferred_txn) {
    txc->deferred_txn = new bluestore_deferred_transaction_t;
  }
  txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
  return &txc->deferred_txn->ops.back();
}
void BlueStore::_deferred_queue(TransContext *txc)
{
  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
  std::lock_guard<std::mutex> l(deferred_lock);
  if (!txc->osr->deferred_pending &&
      !txc->osr->deferred_running) {
    deferred_queue.push_back(*txc->osr);
  }
  if (!txc->osr->deferred_pending) {
    txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
  }
  ++deferred_queue_size;
  txc->osr->deferred_pending->txcs.push_back(*txc);
  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
  for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
    const auto& op = *opi;
    assert(op.op == bluestore_deferred_op_t::OP_WRITE);
    bufferlist::const_iterator p = op.data.begin();
    for (auto e : op.extents) {
      txc->osr->deferred_pending->prepare_write(
        cct, wt.seq, e.offset, e.length, p);
    }
  }
  if (deferred_aggressive &&
      !txc->osr->deferred_running) {
    _deferred_submit(txc->osr.get());
  }
}
void BlueStore::_deferred_try_submit()
{
  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
  for (auto& osr : deferred_queue) {
    if (!osr.deferred_running) {
      _deferred_submit(&osr);
    }
  }
}
void BlueStore::_deferred_submit(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
  assert(osr->deferred_pending);
  assert(!osr->deferred_running);

  auto b = osr->deferred_pending;
  deferred_queue_size -= b->seq_bytes.size();
  assert(deferred_queue_size >= 0);

  osr->deferred_running = osr->deferred_pending;
  osr->deferred_pending = nullptr;

  uint64_t start = 0, pos = 0;
  bufferlist bl;
  auto i = b->iomap.begin();
  while (true) {
    if (i == b->iomap.end() || i->first != pos) {
      if (bl.length()) {
        dout(20) << __func__ << " write 0x" << std::hex
                 << start << "~" << bl.length()
                 << " crc " << bl.crc32c(-1) << std::dec << dendl;
        if (!g_conf->bluestore_debug_omit_block_device_write) {
          logger->inc(l_bluestore_deferred_write_ops);
          logger->inc(l_bluestore_deferred_write_bytes, bl.length());
          int r = bdev->aio_write(start, bl, &b->ioc, false);
          assert(r == 0);
        }
      }
      if (i == b->iomap.end()) {
        break;
      }
      start = 0;
      pos = i->first;
      bl.clear();
    }
    dout(20) << __func__ << "   seq " << i->second.seq << " 0x"
             << std::hex << pos << "~" << i->second.bl.length() << std::dec
             << dendl;
    if (!bl.length()) {
      start = pos;
    }
    pos += i->second.bl.length();
    bl.claim_append(i->second.bl);
    ++i;
  }
  bdev->aio_submit(&b->ioc);
}
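// Illustrative example (added comment, not in the original source): if
// the batch iomap holds writes at offsets 0x1000 (len 0x1000), 0x2000
// (len 0x1000) and 0x8000 (len 0x1000), the loop above coalesces the
// first two into one aio_write(0x1000, 0x2000 bytes) because the second
// begins exactly at pos, then flushes and starts a new run at the
// discontiguous 0x8000 -- two aios instead of three.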
void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
  dout(10) << __func__ << " osr " << osr << dendl;
  assert(osr->deferred_running);
  DeferredBatch *b = osr->deferred_running;

  {
    std::lock_guard<std::mutex> l(deferred_lock);
    assert(osr->deferred_running == b);
    osr->deferred_running = nullptr;
    if (!osr->deferred_pending) {
      auto q = deferred_queue.iterator_to(*osr);
      deferred_queue.erase(q);
    } else if (deferred_aggressive) {
      _deferred_submit(osr);
    }
  }

  {
    std::lock_guard<std::mutex> l2(osr->qlock);
    for (auto& i : b->txcs) {
      TransContext *txc = &i;
      txc->state = TransContext::STATE_DEFERRED_CLEANUP;
      txc->osr->qcond.notify_all();
      throttle_deferred_bytes.put(txc->cost);
    }
    std::lock_guard<std::mutex> l(kv_lock);
    deferred_done_queue.emplace_back(b);
  }

  // in the normal case, do not bother waking up the kv thread; it will
  // catch us on the next commit anyway.
  if (deferred_aggressive) {
    std::lock_guard<std::mutex> l(kv_lock);
    kv_cond.notify_one();
  }
}
int BlueStore::_deferred_replay()
{
  dout(10) << __func__ << " start" << dendl;
  OpSequencerRef osr = new OpSequencer(cct, this);
  int count = 0;
  int r = 0;
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
    bluestore_deferred_transaction_t *deferred_txn =
      new bluestore_deferred_transaction_t;
    bufferlist bl = it->value();
    bufferlist::iterator p = bl.begin();
    try {
      ::decode(*deferred_txn, p);
    } catch (buffer::error& e) {
      derr << __func__ << " failed to decode deferred txn "
           << pretty_binary_string(it->key()) << dendl;
      delete deferred_txn;
      r = -EIO;
      goto out;
    }
    TransContext *txc = _txc_create(osr.get());
    txc->deferred_txn = deferred_txn;
    txc->state = TransContext::STATE_KV_DONE;
    _txc_state_proc(txc);
  }
 out:
  dout(20) << __func__ << " draining osr" << dendl;
  _osr_drain_all();
  dout(10) << __func__ << " completed " << count << " events" << dendl;
  return r;
}
// ---------------------------

int BlueStore::queue_transactions(
  Sequencer *posr,
  vector<Transaction>& tls,
  TrackedOpRef op,
  ThreadPool::TPHandle *handle)
{
  Context *onreadable;
  Context *ondisk;
  Context *onreadable_sync;
  ObjectStore::Transaction::collect_contexts(
    tls, &onreadable, &ondisk, &onreadable_sync);

  if (cct->_conf->objectstore_blackhole) {
    dout(0) << __func__ << " objectstore_blackhole = TRUE, dropping transaction"
            << dendl;
    delete ondisk;
    delete onreadable;
    delete onreadable_sync;
    return 0;
  }
  utime_t start = ceph_clock_now();
  // set up the sequencer
  OpSequencer *osr;
  assert(posr);
  if (posr->p) {
    osr = static_cast<OpSequencer *>(posr->p.get());
    dout(10) << __func__ << " existing " << osr << " " << *osr << dendl;
  } else {
    osr = new OpSequencer(cct, this);
    osr->parent = posr;
    posr->p = osr;
    dout(10) << __func__ << " new " << osr << " " << *osr << dendl;
  }

  // prepare
  TransContext *txc = _txc_create(osr);
  txc->onreadable = onreadable;
  txc->onreadable_sync = onreadable_sync;
  txc->oncommit = ondisk;

  for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
    txc->bytes += (*p).get_num_bytes();
    _txc_add_transaction(txc, &(*p));
  }
  _txc_calc_cost(txc);

  _txc_write_nodes(txc, txc->t);

  // journal deferred items
  if (txc->deferred_txn) {
    txc->deferred_txn->seq = ++deferred_seq;
    bufferlist bl;
    ::encode(*txc->deferred_txn, bl);
    string key;
    get_deferred_key(txc->deferred_txn->seq, &key);
    txc->t->set(PREFIX_DEFERRED, key, bl);
  }

  _txc_finalize_kv(txc, txc->t);
  if (handle)
    handle->suspend_tp_timeout();

  utime_t tstart = ceph_clock_now();
  throttle_bytes.get(txc->cost);
  if (txc->deferred_txn) {
    // ensure we do not block here because of deferred writes
    if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
      deferred_try_submit();
      throttle_deferred_bytes.get(txc->cost);
    }
  }
  utime_t tend = ceph_clock_now();

  if (handle)
    handle->reset_tp_timeout();

  logger->inc(l_bluestore_txc);

  // execute (start)
  _txc_state_proc(txc);

  logger->tinc(l_bluestore_submit_lat, ceph_clock_now() - start);
  logger->tinc(l_bluestore_throttle_lat, tend - tstart);
  return 0;
}
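// Illustrative note (added comment, not in the original source): the
// get_or_fail/get pair above keeps a deferred-write txc from sleeping on
// throttle_deferred_bytes before kicking the pipeline: if the fast path
// fails we first push out pending deferred batches (their completions
// return budget) and only then take the blocking get().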
void BlueStore::_txc_aio_submit(TransContext *txc)
{
  dout(10) << __func__ << " txc " << txc << dendl;
  bdev->aio_submit(&txc->ioc);
}
void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
{
  Transaction::iterator i = t->begin();

  _dump_transaction(t);

  vector<CollectionRef> cvec(i.colls.size());
  unsigned j = 0;
  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
       ++p, ++j) {
    cvec[j] = _get_collection(*p);

    // note first collection we reference
    if (!txc->first_collection)
      txc->first_collection = cvec[j];
  }
  vector<OnodeRef> ovec(i.objects.size());
  for (int pos = 0; i.have_op(); ++pos) {
    Transaction::Op *op = i.decode_op();
    int r = 0;

    // no coll or obj
    if (op->op == Transaction::OP_NOP)
      continue;

    // collection operations
    CollectionRef &c = cvec[op->cid];
    switch (op->op) {
    case Transaction::OP_RMCOLL:
      {
        const coll_t &cid = i.get_cid(op->cid);
        r = _remove_collection(txc, cid, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_MKCOLL:
      {
        assert(!c);
        const coll_t &cid = i.get_cid(op->cid);
        r = _create_collection(txc, cid, op->split_bits, &c);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_SPLIT_COLLECTION:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_SPLIT_COLLECTION2:
      {
        uint32_t bits = op->split_bits;
        uint32_t rem = op->split_rem;
        r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
        if (!r)
          continue;
      }
      break;

    case Transaction::OP_COLL_HINT:
      {
        uint32_t type = op->hint_type;
        bufferlist hint;
        i.decode_bl(hint);
        bufferlist::iterator hiter = hint.begin();
        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
          uint32_t pg_num;
          uint64_t num_objs;
          ::decode(pg_num, hiter);
          ::decode(num_objs, hiter);
          dout(10) << __func__ << " collection hint objects is a no-op, "
                   << " pg_num " << pg_num << " num_objects " << num_objs
                   << dendl;
        } else {
          // Ignore the hint
          dout(10) << __func__ << " unknown collection hint " << type << dendl;
        }
        continue;
      }
      break;

    case Transaction::OP_COLL_SETATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RMATTR:
      r = -EOPNOTSUPP;
      break;

    case Transaction::OP_COLL_RENAME:
      assert(0 == "not implemented");
      break;
    }
    if (r < 0) {
      derr << __func__ << " error " << cpp_strerror(r)
           << " not handled on operation " << op->op
           << " (op " << pos << ", counting from 0)" << dendl;
      _dump_transaction(t, 0);
      assert(0 == "unexpected error");
    }
    // these operations implicitly create the object
    bool create = false;
    if (op->op == Transaction::OP_TOUCH ||
        op->op == Transaction::OP_WRITE ||
        op->op == Transaction::OP_ZERO) {
      create = true;
    }

    // object operations
    RWLock::WLocker l(c->lock);
    OnodeRef &o = ovec[op->oid];
    if (!o) {
      ghobject_t oid = i.get_oid(op->oid);
      o = c->get_onode(oid, create);
    }
    if (!create && (!o || !o->exists)) {
      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
               << i.get_oid(op->oid) << dendl;
      r = -ENOENT;
      goto endop;
    }
    switch (op->op) {
    case Transaction::OP_TOUCH:
      r = _touch(txc, c, o);
      break;

    case Transaction::OP_WRITE:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        uint32_t fadvise_flags = i.get_fadvise_flags();
        bufferlist bl;
        i.decode_bl(bl);
        r = _write(txc, c, o, off, len, bl, fadvise_flags);
      }
      break;

    case Transaction::OP_ZERO:
      {
        uint64_t off = op->off;
        uint64_t len = op->len;
        r = _zero(txc, c, o, off, len);
      }
      break;

    case Transaction::OP_TRIMCACHE:
      {
        // deprecated, no-op
      }
      break;

    case Transaction::OP_TRUNCATE:
      {
        uint64_t off = op->off;
        _truncate(txc, c, o, off);
      }
      break;

    case Transaction::OP_REMOVE:
      {
        r = _remove(txc, c, o);
      }
      break;

    case Transaction::OP_SETATTR:
      {
        string name = i.decode_string();
        bufferptr bp;
        i.decode_bp(bp);
        r = _setattr(txc, c, o, name, bp);
      }
      break;

    case Transaction::OP_SETATTRS:
      {
        map<string, bufferptr> aset;
        i.decode_attrset(aset);
        r = _setattrs(txc, c, o, aset);
      }
      break;

    case Transaction::OP_RMATTR:
      {
        string name = i.decode_string();
        r = _rmattr(txc, c, o, name);
      }
      break;

    case Transaction::OP_RMATTRS:
      {
        r = _rmattrs(txc, c, o);
      }
      break;

    case Transaction::OP_CLONE:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        r = _clone(txc, c, o, no);
      }
      break;

    case Transaction::OP_CLONERANGE:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_CLONERANGE2:
      {
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          const ghobject_t& noid = i.get_oid(op->dest_oid);
          no = c->get_onode(noid, true);
        }
        uint64_t srcoff = op->off;
        uint64_t len = op->len;
        uint64_t dstoff = op->dest_off;
        r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
      }
      break;
    case Transaction::OP_COLL_ADD:
      assert(0 == "not implemented");
      break;

    case Transaction::OP_COLL_REMOVE:
      assert(0 == "not implemented");
      break;

    case Transaction::OP_COLL_MOVE:
      assert(0 == "deprecated");
      break;

    case Transaction::OP_COLL_MOVE_RENAME:
    case Transaction::OP_TRY_RENAME:
      {
        assert(op->cid == op->dest_cid);
        const ghobject_t& noid = i.get_oid(op->dest_oid);
        OnodeRef& no = ovec[op->dest_oid];
        if (!no) {
          no = c->get_onode(noid, false);
        }
        r = _rename(txc, c, o, no, noid);
      }
      break;

    case Transaction::OP_OMAP_CLEAR:
      {
        r = _omap_clear(txc, c, o);
      }
      break;
    case Transaction::OP_OMAP_SETKEYS:
      {
        bufferlist aset_bl;
        i.decode_attrset_bl(&aset_bl);
        r = _omap_setkeys(txc, c, o, aset_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYS:
      {
        bufferlist keys_bl;
        i.decode_keyset_bl(&keys_bl);
        r = _omap_rmkeys(txc, c, o, keys_bl);
      }
      break;
    case Transaction::OP_OMAP_RMKEYRANGE:
      {
        string first, last;
        first = i.decode_string();
        last = i.decode_string();
        r = _omap_rmkey_range(txc, c, o, first, last);
      }
      break;
    case Transaction::OP_OMAP_SETHEADER:
      {
        bufferlist bl;
        i.decode_bl(bl);
        r = _omap_setheader(txc, c, o, bl);
      }
      break;

    case Transaction::OP_SETALLOCHINT:
      {
        r = _set_alloc_hint(txc, c, o,
                            op->expected_object_size,
                            op->expected_write_size,
                            op->alloc_hint_flags);
      }
      break;

    default:
      derr << __func__ << " bad op " << op->op << dendl;
      ceph_abort();
    }
  endop:
    if (r < 0) {
      bool ok = false;

      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
                            op->op == Transaction::OP_CLONE ||
                            op->op == Transaction::OP_CLONERANGE2 ||
                            op->op == Transaction::OP_COLL_ADD ||
                            op->op == Transaction::OP_SETATTR ||
                            op->op == Transaction::OP_SETATTRS ||
                            op->op == Transaction::OP_RMATTR ||
                            op->op == Transaction::OP_OMAP_SETKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYS ||
                            op->op == Transaction::OP_OMAP_RMKEYRANGE ||
                            op->op == Transaction::OP_OMAP_SETHEADER))
        // -ENOENT is usually okay
        ok = true;
      if (r == -ENODATA)
        ok = true;

      if (!ok) {
        const char *msg = "unexpected error code";

        if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
                             op->op == Transaction::OP_CLONE ||
                             op->op == Transaction::OP_CLONERANGE2))
          msg = "ENOENT on clone suggests osd bug";

        if (r == -ENOSPC)
          // For now, if we hit _any_ ENOSPC, crash, before we do any damage
          // by partially applying transactions.
          msg = "ENOSPC from bluestore, misconfigured cluster";

        if (r == -ENOTEMPTY) {
          msg = "ENOTEMPTY suggests garbage data in osd data dir";
        }

        derr << __func__ << " error " << cpp_strerror(r)
             << " not handled on operation " << op->op
             << " (op " << pos << ", counting from 0)"
             << dendl;
        derr << msg << dendl;
        _dump_transaction(t, 0);
        assert(0 == "unexpected error");
      }
    }
  }
}
// -----------------

int BlueStore::_touch(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  _assign_nid(txc, o);
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_dump_onode(OnodeRef o, int log_level)
{
  if (!cct->_conf->subsys.should_gather(ceph_subsys_bluestore, log_level))
    return;
  dout(log_level) << __func__ << " " << o << " " << o->oid
                  << " nid " << o->onode.nid
                  << " size 0x" << std::hex << o->onode.size
                  << " (" << std::dec << o->onode.size << ")"
                  << " expected_object_size " << o->onode.expected_object_size
                  << " expected_write_size " << o->onode.expected_write_size
                  << " in " << o->onode.extent_map_shards.size() << " shards"
                  << ", " << o->extent_map.spanning_blob_map.size()
                  << " spanning blobs"
                  << dendl;
  for (auto p = o->onode.attrs.begin();
       p != o->onode.attrs.end();
       ++p) {
    dout(log_level) << __func__ << "  attr " << p->first
                    << " len " << p->second.length() << dendl;
  }
  _dump_extent_map(o->extent_map, log_level);
}
void BlueStore::_dump_extent_map(ExtentMap &em, int log_level)
{
  for (auto& s : em.shards) {
    dout(log_level) << __func__ << "  shard " << *s.shard_info
                    << (s.loaded ? " (loaded)" : "")
                    << (s.dirty ? " (dirty)" : "")
                    << dendl;
  }
  uint64_t pos = 0;
  for (auto& e : em.extent_map) {
    dout(log_level) << __func__ << "  " << e << dendl;
    assert(e.logical_offset >= pos);
    pos = e.logical_offset + e.length;
    const bluestore_blob_t& blob = e.blob->get_blob();
    if (blob.has_csum()) {
      vector<uint64_t> v;
      unsigned n = blob.get_csum_count();
      for (unsigned i = 0; i < n; ++i)
        v.push_back(blob.get_csum_item(i));
      dout(log_level) << __func__ << "      csum: " << std::hex << v << std::dec
                      << dendl;
    }
    std::lock_guard<std::recursive_mutex> l(e.blob->shared_blob->get_cache()->lock);
    for (auto& i : e.blob->shared_blob->bc.buffer_map) {
      dout(log_level) << __func__ << "       0x" << std::hex << i.first
                      << "~" << i.second->length << std::dec
                      << " " << *i.second << dendl;
    }
  }
}
void BlueStore::_dump_transaction(Transaction *t, int log_level)
{
  dout(log_level) << " transaction dump:\n";
  JSONFormatter f(true);
  f.open_object_section("transaction");
  t->dump(&f);
  f.close_section();
  f.flush(*_dout);
  *_dout << dendl;
}
void BlueStore::_pad_zeros(
  bufferlist *bl, uint64_t *offset,
  uint64_t chunk_size)
{
  auto length = bl->length();
  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
           << " chunk_size 0x" << chunk_size << std::dec << dendl;
  dout(40) << "before:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  // front
  size_t front_pad = *offset % chunk_size;
  size_t back_pad = 0;
  size_t pad_count = 0;
  if (front_pad) {
    size_t front_copy = MIN(chunk_size - front_pad, length);
    bufferptr z = buffer::create_page_aligned(chunk_size);
    memset(z.c_str(), 0, front_pad);
    pad_count += front_pad;
    memcpy(z.c_str() + front_pad, bl->get_contiguous(0, front_copy), front_copy);
    if (front_copy + front_pad < chunk_size) {
      back_pad = chunk_size - (length + front_pad);
      memset(z.c_str() + front_pad + length, 0, back_pad);
      pad_count += back_pad;
    }
    bufferlist old, t;
    old.swap(*bl);
    t.substr_of(old, front_copy, length - front_copy);
    bl->append(z);
    bl->claim_append(t);
    *offset -= front_pad;
    length += front_pad + back_pad;
  }

  // back
  uint64_t end = *offset + length;
  unsigned back_copy = end % chunk_size;
  if (back_copy) {
    assert(back_pad == 0);
    back_pad = chunk_size - back_copy;
    assert(back_copy <= length);
    bufferptr tail(chunk_size);
    memcpy(tail.c_str(), bl->get_contiguous(length - back_copy, back_copy),
           back_copy);
    memset(tail.c_str() + back_copy, 0, back_pad);
    bufferlist old;
    old.swap(*bl);
    bl->substr_of(old, 0, length - back_copy);
    bl->append(tail);
    length += back_pad;
    pad_count += back_pad;
  }
  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
           << back_pad << " on front/back, now 0x" << *offset << "~"
           << length << std::dec << dendl;
  dout(40) << "after:\n";
  bl->hexdump(*_dout);
  *_dout << dendl;
  if (pad_count)
    logger->inc(l_bluestore_write_pad_bytes, pad_count);
  assert(bl->length() == length);
}
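// Worked example (added comment, not in the original source): for a
// 0x800-byte write at *offset = 0x1a00 with chunk_size = 0x1000,
// front_pad = 0xa00 shifts the buffer to start at 0x1000; the data then
// ends at 0x2200, so back_copy = 0x200 and back_pad = 0xe00, yielding a
// single chunk-aligned span 0x1000~0x2000 with 0xa00 + 0xe00 zero bytes
// counted in pad_count.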
void BlueStore::_do_write_small(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << std::dec << dendl;
  assert(length < min_alloc_size);
  uint64_t end_offs = offset + length;

  logger->inc(l_bluestore_write_small);
  logger->inc(l_bluestore_write_small_bytes, length);

  bufferlist bl;
  blp.copy(length, bl);

  // Look for an existing mutable blob we can use.
  auto begin = o->extent_map.extent_map.begin();
  auto end = o->extent_map.extent_map.end();
  auto ep = o->extent_map.seek_lextent(offset);
  if (ep != begin) {
    --ep;
    if (ep->blob_end() <= offset) {
      ++ep;
    }
  }
  auto prev_ep = ep;
  if (prev_ep != begin) {
    --prev_ep;
  } else {
    prev_ep = end; // to avoid this extent check as it's a duplicate
  }

  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
  uint32_t alloc_len = min_alloc_size;
  auto offset0 = P2ALIGN(offset, alloc_len);

  bool any_change;
  // search suitable extent in both forward and reverse direction in
  // [offset - target_max_blob_size, offset + target_max_blob_size] range
  // then check if blob can be reused via try_reuse_blob func or apply
  // direct/deferred write (the latter for extents including or higher
  // than 'offset' only).
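  // Illustrative example (added comment, not in the original source):
  // with target_blob_size = 0x80000 and min_alloc_size = 0x10000, a
  // small write at offset 0x123456 considers blobs reachable in
  // [0x0a3456, 0x1a3456) -- min_off = offset - max_bsize -- and falls
  // back to offset0 = P2ALIGN(0x123456, 0x10000) = 0x120000 as the
  // allocation-unit-aligned start if nothing can be reused.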
  do {
    any_change = false;

    if (ep != end && ep->logical_offset < offset + max_bsize) {
      BlobRef b = ep->blob;
      auto bstart = ep->blob_start();
      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (bstart >= end_offs) {
        dout(20) << __func__ << " ignoring distant " << *b << dendl;
      } else if (!b->get_blob().is_mutable()) {
        dout(20) << __func__ << " ignoring immutable " << *b << dendl;
      } else if (ep->logical_offset % min_alloc_size !=
                 ep->blob_offset % min_alloc_size) {
        dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
      } else {
        uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
        // can we pad our head/tail out with zeros?
        uint64_t head_pad, tail_pad;
        head_pad = P2PHASE(offset, chunk_size);
        tail_pad = P2NPHASE(end_offs, chunk_size);
        if (head_pad || tail_pad) {
          o->extent_map.fault_range(db, offset - head_pad,
                                    end_offs - offset + head_pad + tail_pad);
        }
        if (head_pad &&
            o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) {
          head_pad = 0;
        }
        if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
          tail_pad = 0;
        }

        uint64_t b_off = offset - head_pad - bstart;
        uint64_t b_len = length + head_pad + tail_pad;

        // direct write into unused blocks of an existing mutable blob?
        if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
            b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b->get_blob().is_unused(b_off, b_len) &&
            b->get_blob().is_allocated(b_off, b_len)) {
          bufferlist padded;
          _apply_padding(head_pad, tail_pad, bl, padded);

          dout(20) << __func__ << " write to unused 0x" << std::hex
                   << b_off << "~" << b_len
                   << " pad 0x" << head_pad << " + 0x" << tail_pad
                   << std::dec << " of mutable " << *b << dendl;
          _buffer_cache_write(txc, b, b_off, padded,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          if (!g_conf->bluestore_debug_omit_block_device_write) {
            if (b_len <= prefer_deferred_size) {
              dout(20) << __func__ << " deferring small 0x" << std::hex
                       << b_len << std::dec << " unused write via deferred"
                       << dendl;
              bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
              op->op = bluestore_deferred_op_t::OP_WRITE;
              b->get_blob().map(
                b_off, b_len,
                [&](uint64_t offset, uint64_t length) {
                  op->extents.emplace_back(bluestore_pextent_t(offset, length));
                  return 0;
                });
              op->data = padded;
            } else {
              b->get_blob().map_bl(
                b_off, padded,
                [&](uint64_t offset, bufferlist& t) {
                  bdev->aio_write(offset, t,
                                  &txc->ioc, wctx->buffered);
                });
            }
          }
          b->dirty_blob().calc_csum(b_off, padded);
          dout(20) << __func__ << " lex old " << *ep << dendl;
          Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad,
                                                 length, b,
                                                 &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
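        // Writes at or below prefer_deferred_size take the deferred (WAL)
        // branch above: the payload is journaled in RocksDB and flushed to
        // its final location later, turning a small random write into a log
        // append. Larger payloads go straight to the device via aio_write().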
        // read some data to fill out the chunk?
        uint64_t head_read = P2PHASE(b_off, chunk_size);
        uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
        if ((head_read || tail_read) &&
            (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
            head_read + tail_read < min_alloc_size) {
          b_off -= head_read;
          b_len += head_read + tail_read;
        } else {
          head_read = tail_read = 0;
        }

        // chunk-aligned deferred overwrite?
        if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
            b_off % chunk_size == 0 &&
            b_len % chunk_size == 0 &&
            b->get_blob().is_allocated(b_off, b_len)) {
          bufferlist padded;
          _apply_padding(head_pad, tail_pad, bl, padded);

          dout(20) << __func__ << " reading head 0x" << std::hex << head_read
                   << " and tail 0x" << tail_read << std::dec << dendl;
          if (head_read) {
            bufferlist head_bl;
            int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
                             head_bl, 0);
            assert(r >= 0 && r <= (int)head_read);
            size_t zlen = head_read - r;
            if (zlen) {
              head_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            head_bl.claim_append(padded);
            padded.swap(head_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          if (tail_read) {
            bufferlist tail_bl;
            int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
                             tail_bl, 0);
            assert(r >= 0 && r <= (int)tail_read);
            size_t zlen = tail_read - r;
            if (zlen) {
              tail_bl.append_zero(zlen);
              logger->inc(l_bluestore_write_pad_bytes, zlen);
            }
            padded.claim_append(tail_bl);
            logger->inc(l_bluestore_write_penalty_read_ops);
          }
          logger->inc(l_bluestore_write_small_pre_read);

          bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
          op->op = bluestore_deferred_op_t::OP_WRITE;
          _buffer_cache_write(txc, b, b_off, padded,
                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

          int r = b->get_blob().map(
            b_off, b_len,
            [&](uint64_t offset, uint64_t length) {
              op->extents.emplace_back(bluestore_pextent_t(offset, length));
              return 0;
            });
          assert(r == 0);
          if (b->get_blob().csum_type) {
            b->dirty_blob().calc_csum(b_off, padded);
          }
          op->data.claim(padded);
          dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
                   << b_len << std::dec << " of mutable " << *b
                   << " at " << op->extents << dendl;
          Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart,
                                                 length, b, &wctx->old_extents);
          b->dirty_blob().mark_used(le->blob_offset, le->length);
          txc->statfs_delta.stored() += le->length;
          dout(20) << __func__ << " lex " << *le << dendl;
          logger->inc(l_bluestore_write_small_deferred);
          return;
        }
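        // Illustrative numbers for the pre-read above: with chunk_size =
        // 0x1000, an overwrite at b_off 0x1a00~0x400 needs head_read = 0xa00
        // and tail_read = 0x200 so the deferred write covers whole checksum
        // chunks; each such read is counted in
        // l_bluestore_write_penalty_read_ops.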
        if (b->try_reuse_blob(min_alloc_size,
                              max_bsize,
                              offset0 - bstart,
                              &alloc_len)) {
          assert(alloc_len == min_alloc_size); // expecting data always
                                               // fit into reused blob
          // Need to check for pending writes desiring to
          // reuse the same pextent. The rationale is that during GC two chunks
          // from garbage blobs (compressed?) can share logical space within
          // the same AU. That in turn might be caused by unaligned len in
          // clone_range2. Hence the second write will fail in an attempt to
          // reuse blob at do_alloc_write().
          if (!wctx->has_conflict(b,
                                  offset0,
                                  offset0 + alloc_len,
                                  min_alloc_size)) {

            // we can't reuse pad_head/pad_tail since they might be truncated
            // due to existing extents
            uint64_t b_off = offset - bstart;
            uint64_t b_off0 = b_off;
            _pad_zeros(&bl, &b_off0, chunk_size);

            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (" << b_off0 << "~" << bl.length() << ")"
                     << " (" << b_off << "~" << length << ")"
                     << std::dec << dendl;

            o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
            wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                        false, false);
            logger->inc(l_bluestore_write_small_unused);
            return;
          }
        }
      }
      ++ep;
      any_change = true;
    } // if (ep != end && ep->logical_offset < offset + max_bsize)
    // check extent for reuse in reverse order
    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
      BlobRef b = prev_ep->blob;
      auto bstart = prev_ep->blob_start();
      dout(20) << __func__ << " considering " << *b
               << " bstart 0x" << std::hex << bstart << std::dec << dendl;
      if (b->try_reuse_blob(min_alloc_size,
                            max_bsize,
                            offset0 - bstart,
                            &alloc_len)) {
        assert(alloc_len == min_alloc_size); // expecting data always
                                             // fit into reused blob
        // Need to check for pending writes desiring to
        // reuse the same pextent. The rationale is that during GC two chunks
        // from garbage blobs (compressed?) can share logical space within
        // the same AU. That in turn might be caused by unaligned len in
        // clone_range2. Hence the second write will fail in an attempt to
        // reuse blob at do_alloc_write().
        if (!wctx->has_conflict(b,
                                offset0,
                                offset0 + alloc_len,
                                min_alloc_size)) {

          uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
          uint64_t b_off = offset - bstart;
          uint64_t b_off0 = b_off;
          _pad_zeros(&bl, &b_off0, chunk_size);

          dout(20) << __func__ << " reuse blob " << *b << std::hex
                   << " (" << b_off0 << "~" << bl.length() << ")"
                   << " (" << b_off << "~" << length << ")"
                   << std::dec << dendl;

          o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
          wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
                      false, false);
          logger->inc(l_bluestore_write_small_unused);
          return;
        }
      }
      if (prev_ep != begin) {
        --prev_ep;
        any_change = true;
      } else {
        prev_ep = end; // to avoid useless first extent re-check
      }
    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
  } while (any_change);
  // new blob.
  BlobRef b = c->new_blob();
  uint64_t b_off = P2PHASE(offset, alloc_len);
  uint64_t b_off0 = b_off;
  _pad_zeros(&bl, &b_off0, block_size);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, true, true);
  logger->inc(l_bluestore_write_small_new);
  return;
}
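
// The trailing booleans passed to WriteContext::write() are (mark_unused,
// new_blob): a freshly allocated blob advertises its not-yet-written tail
// as unused and is flagged as new so _do_alloc_write() initializes its
// flags and checksum metadata.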
void BlueStore::_do_write_big(
    TransContext *txc,
    CollectionRef &c,
    OnodeRef o,
    uint64_t offset, uint64_t length,
    bufferlist::iterator& blp,
    WriteContext *wctx)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
           << " target_blob_size 0x" << wctx->target_blob_size << std::dec
           << " compress " << (int)wctx->compress
           << dendl;
  logger->inc(l_bluestore_write_big);
  logger->inc(l_bluestore_write_big_bytes, length);
  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  while (length > 0) {
    bool new_blob = false;
    uint32_t l = MIN(max_bsize, length);
    BlobRef b;
    uint64_t b_off = 0;

    // attempt to reuse an existing blob
    if (!wctx->compress) {
      // look for an existing mutable blob we can reuse
      auto begin = o->extent_map.extent_map.begin();
      auto end = o->extent_map.extent_map.end();
      auto ep = o->extent_map.seek_lextent(offset);
      auto prev_ep = ep;
      if (prev_ep != begin) {
        --prev_ep;
      } else {
        prev_ep = end; // to avoid this extent check as it's a duplicate
      }
      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
      // search suitable extent in both forward and reverse direction in
      // [offset - target_max_blob_size, offset + target_max_blob_size] range
      // then check if blob can be reused via try_reuse_blob func.
      bool any_change;
      do {
        any_change = false;
        if (ep != end && ep->logical_offset < offset + max_bsize) {
          if (offset >= ep->blob_start() &&
              ep->blob->try_reuse_blob(min_alloc_size, max_bsize,
                                       offset - ep->blob_start(),
                                       &l)) {
            b = ep->blob;
            b_off = offset - ep->blob_start();
            prev_ep = end; // to avoid check below
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (" << b_off << "~" << l << ")" << std::dec << dendl;
          } else {
            ++ep;
            any_change = true;
          }
        }

        if (prev_ep != end && prev_ep->logical_offset >= min_off) {
          if (prev_ep->blob->try_reuse_blob(min_alloc_size, max_bsize,
                                            offset - prev_ep->blob_start(),
                                            &l)) {
            b = prev_ep->blob;
            b_off = offset - prev_ep->blob_start();
            dout(20) << __func__ << " reuse blob " << *b << std::hex
                     << " (" << b_off << "~" << l << ")" << std::dec << dendl;
          } else if (prev_ep != begin) {
            --prev_ep;
            any_change = true;
          } else {
            prev_ep = end; // to avoid useless first extent re-check
          }
        }
      } while (b == nullptr && any_change);
    }
    if (b == nullptr) {
      b = c->new_blob();
      b_off = 0;
      new_blob = true;
    }

    bufferlist t;
    blp.copy(l, t);
    wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
    offset += l;
    length -= l;
    logger->inc(l_bluestore_write_big_blobs);
  }
}
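
// Example (illustrative): with target_blob_size = 0x80000, a 0x140000 big
// write is carved into blobs of 0x80000 + 0x80000 + 0x40000 by the loop
// above, bumping l_bluestore_write_big_blobs once per blob.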
int BlueStore::_do_alloc_write(
    TransContext *txc,
    CollectionRef coll,
    OnodeRef o,
    WriteContext *wctx)
{
  dout(20) << __func__ << " txc " << txc
           << " " << wctx->writes.size() << " blobs"
           << dendl;

  uint64_t need = 0;
  auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size);
  for (auto &wi : wctx->writes) {
    need += wi.blob_length;
  }
  int r = alloc->reserve(need);
  if (r < 0) {
    derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec
         << dendl;
    return r;
  }

  uint64_t hint = 0;
  CompressorRef c;
  double crr = 0;
  if (wctx->compress) {
    c = select_option(
      "compression_algorithm",
      compressor,
      [&]() {
        string val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
          CompressorRef cp = compressor;
          if (!cp || cp->get_type_name() != val) {
            cp = Compressor::create(cct, val);
          }
          return boost::optional<CompressorRef>(cp);
        }
        return boost::optional<CompressorRef>();
      }
    );

    crr = select_option(
      "compression_required_ratio",
      cct->_conf->bluestore_compression_required_ratio,
      [&]() {
        double val;
        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
          return boost::optional<double>(val);
        }
        return boost::optional<double>();
      }
    );
  }

  int csum = csum_type.load();
  csum = select_option(
    "csum_type",
    csum,
    [&]() {
      int val;
      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
        return boost::optional<int>(val);
      }
      return boost::optional<int>();
    }
  );
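
  // select_option() prefers a per-pool override when one is set and falls
  // back to the store-wide default otherwise; e.g. a pool with
  // compression_required_ratio = 0.7 overrides
  // bluestore_compression_required_ratio for writes to that pool only.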
  for (auto& wi : wctx->writes) {
    BlobRef b = wi.b;
    bluestore_blob_t& dblob = b->dirty_blob();
    uint64_t b_off = wi.b_off;
    bufferlist *l = &wi.bl;
    uint64_t final_length = wi.blob_length;
    uint64_t csum_length = wi.blob_length;
    unsigned csum_order = block_size_order;
    bufferlist compressed_bl;
    bool compressed = false;
    if (c && wi.blob_length > min_alloc_size) {
      utime_t start = ceph_clock_now();

      // compress
      assert(b_off == 0);
      assert(wi.blob_length == l->length());
      bluestore_compression_header_t chdr;
      chdr.type = c->get_type();
      // FIXME: memory alignment here is bad
      bufferlist t;

      r = c->compress(*l, t);
      assert(r == 0);

      chdr.length = t.length();
      ::encode(chdr, compressed_bl);
      compressed_bl.claim_append(t);
      uint64_t rawlen = compressed_bl.length();
      uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
      uint64_t want_len_raw = final_length * crr;
      uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size);
      if (newlen <= want_len && newlen < final_length) {
        // Cool. We compressed at least as much as we were hoping to.
        // pad out to min_alloc_size
        compressed_bl.append_zero(newlen - rawlen);
        logger->inc(l_bluestore_write_pad_bytes, newlen - rawlen);
        dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
                 << " -> 0x" << rawlen << " => 0x" << newlen
                 << " with " << c->get_type()
                 << std::dec << dendl;
        txc->statfs_delta.compressed() += rawlen;
        txc->statfs_delta.compressed_original() += l->length();
        txc->statfs_delta.compressed_allocated() += newlen;
        l = &compressed_bl;
        final_length = newlen;
        csum_length = newlen;
        csum_order = ctz(newlen);
        dblob.set_compressed(wi.blob_length, rawlen);
        compressed = true;
        logger->inc(l_bluestore_compress_success_count);
      } else {
        dout(20) << __func__ << std::hex << " 0x" << l->length()
                 << " compressed to 0x" << rawlen << " -> 0x" << newlen
                 << " with " << c->get_type()
                 << ", which is more than required 0x" << want_len_raw
                 << " -> 0x" << want_len
                 << ", leaving uncompressed"
                 << std::dec << dendl;
        logger->inc(l_bluestore_compress_rejected_count);
      }
      logger->tinc(l_bluestore_compress_lat,
                   ceph_clock_now() - start);
    }
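    // Illustrative numbers for the acceptance test above: with
    // min_alloc_size = 0x10000 and crr = 0.875, a 0x40000 blob gives
    // want_len = P2ROUNDUP(0x38000, 0x10000) = 0x40000; a compressed
    // payload+header of 0x2b000 rounds up to newlen = 0x30000, which is
    // <= want_len and < final_length, so the compressed copy is kept.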
    if (!compressed && wi.new_blob) {
      // initialize newly created blob only
      assert(!dblob.has_flag(bluestore_blob_t::FLAG_MUTABLE));
      dblob.set_flag(bluestore_blob_t::FLAG_MUTABLE);

      if (l->length() != wi.blob_length) {
        // hrm, maybe we could do better here, but let's not bother.
        dout(20) << __func__ << " forcing csum_order to block_size_order "
                 << block_size_order << dendl;
        csum_order = block_size_order;
      } else {
        csum_order = std::min(wctx->csum_order, ctz(l->length()));
      }
      // try to align blob with max_blob_size to improve
      // its reuse ratio, e.g. in case of reverse write
      uint32_t suggested_boff =
        (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
      if ((suggested_boff % (1 << csum_order)) == 0 &&
          suggested_boff + final_length <= max_bsize &&
          suggested_boff > b_off) {
        dout(20) << __func__ << " forcing blob_offset to "
                 << std::hex << suggested_boff << std::dec << dendl;
        assert(suggested_boff >= b_off);
        csum_length += suggested_boff - b_off;
        b_off = suggested_boff;
      }
    }
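    // Illustrative numbers for suggested_boff: a 0x10000 write at logical
    // offset 0x70000 landing at b_off 0 of a fresh blob with max_bsize
    // 0x80000 yields suggested_boff = 0x70000; shifting the data there
    // keeps the blob layout congruent with the logical address space, so a
    // later write at 0x60000 can reuse the same blob's lower range.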
    AllocExtentVector extents;
    extents.reserve(4); // 4 should be (more than) enough for most allocations
    int64_t got = alloc->allocate(final_length, min_alloc_size,
                                  max_alloc_size.load(),
                                  hint, &extents);
    assert(got == (int64_t)final_length);
    need -= got;
    txc->statfs_delta.allocated() += got;
    for (auto& p : extents) {
      bluestore_pextent_t e = bluestore_pextent_t(p);
      txc->allocated.insert(e.offset, e.length);
      hint = p.end();
    }
    dblob.allocated(P2ALIGN(b_off, min_alloc_size), final_length, extents);

    dout(20) << __func__ << " blob " << *b
             << " csum_type " << Checksummer::get_csum_type_string(csum)
             << " csum_order " << csum_order
             << " csum_length 0x" << std::hex << csum_length << std::dec
             << dendl;

    if (csum != Checksummer::CSUM_NONE) {
      if (!dblob.has_csum()) {
        dblob.init_csum(csum, csum_order, csum_length);
      }
      dblob.calc_csum(b_off, *l);
    }
    if (wi.mark_unused) {
      auto b_end = b_off + wi.bl.length();
      if (b_off) {
        dblob.add_unused(0, b_off);
      }
      if (b_end < wi.blob_length) {
        dblob.add_unused(b_end, wi.blob_length - b_end);
      }
    }

    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
                                           b_off + (wi.b_off0 - wi.b_off),
                                           wi.length0,
                                           wi.b,
                                           nullptr);
    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
    txc->statfs_delta.stored() += le->length;
    dout(20) << __func__ << " lex " << *le << dendl;
    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
                        wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);

    // queue io
    if (!g_conf->bluestore_debug_omit_block_device_write) {
      if (l->length() <= prefer_deferred_size.load()) {
        dout(20) << __func__ << " deferring small 0x" << std::hex
                 << l->length() << std::dec << " write via deferred" << dendl;
        bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
        op->op = bluestore_deferred_op_t::OP_WRITE;
        int r = b->get_blob().map(
          b_off, l->length(),
          [&](uint64_t offset, uint64_t length) {
            op->extents.emplace_back(bluestore_pextent_t(offset, length));
            return 0;
          });
        assert(r == 0);
        op->data = *l;
      } else {
        b->get_blob().map_bl(
          b_off, *l,
          [&](uint64_t offset, bufferlist& t) {
            bdev->aio_write(offset, t, &txc->ioc, false);
          });
      }
    }
  }
  alloc->unreserve(need);

  return 0;
}
void BlueStore::_wctx_finish(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef o,
    WriteContext *wctx)
{
  auto oep = wctx->old_extents.begin();
  while (oep != wctx->old_extents.end()) {
    auto &lo = *oep;
    oep = wctx->old_extents.erase(oep);
    dout(20) << __func__ << " lex_old " << lo.e << dendl;
    BlobRef b = lo.e.blob;
    const bluestore_blob_t& blob = b->get_blob();
    if (blob.is_compressed()) {
      if (lo.blob_empty) {
        txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
      }
      txc->statfs_delta.compressed_original() -= lo.e.length;
    }
    auto& r = lo.r;
    txc->statfs_delta.stored() -= lo.e.length;
    if (!r.empty()) {
      dout(20) << __func__ << " blob release " << r << dendl;
      if (blob.is_shared()) {
        PExtentVector final;
        c->load_shared_blob(b->shared_blob);
        for (auto e : r) {
          b->shared_blob->put_ref(e.offset, e.length, &final);
        }
        dout(20) << __func__ << " shared_blob release " << final
                 << " from " << *b->shared_blob << dendl;
        txc->write_shared_blob(b->shared_blob);
        r.clear();
        r.swap(final);
      }
    }
    // we can't invalidate our logical extents as we drop them because
    // other lextents (either in our onode or others) may still
    // reference them. but we can throw out anything that is no
    // longer allocated. Note that this will leave behind edge bits
    // that are no longer referenced but not deallocated (until they
    // age out of the cache naturally).
    b->discard_unallocated(c.get());
    for (auto e : r) {
      dout(20) << __func__ << " release " << e << dendl;
      txc->released.insert(e.offset, e.length);
      txc->statfs_delta.allocated() -= e.length;
      if (blob.is_compressed()) {
        txc->statfs_delta.compressed_allocated() -= e.length;
      }
    }

    if (b->is_spanning() && !b->is_referenced()) {
      dout(20) << __func__ << " spanning_blob_map removing empty " << *b
               << dendl;
      o->extent_map.spanning_blob_map.erase(b->id);
    }
  }
}
void BlueStore::_do_write_data(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef o,
    uint64_t offset,
    uint64_t length,
    bufferlist& bl,
    WriteContext *wctx)
{
  uint64_t end = offset + length;
  bufferlist::iterator p = bl.begin();

  if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
      (length != min_alloc_size)) {
    // we fall within the same block
    _do_write_small(txc, c, o, offset, length, p, wctx);
  } else {
    uint64_t head_offset, head_length;
    uint64_t middle_offset, middle_length;
    uint64_t tail_offset, tail_length;

    head_offset = offset;
    head_length = P2NPHASE(offset, min_alloc_size);

    tail_offset = P2ALIGN(end, min_alloc_size);
    tail_length = P2PHASE(end, min_alloc_size);

    middle_offset = head_offset + head_length;
    middle_length = length - head_length - tail_length;

    if (head_length) {
      _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
    }

    if (middle_length) {
      _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
    }

    if (tail_length) {
      _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
    }
  }
}
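
// Example (illustrative): with min_alloc_size = 0x1000, a write of
// 0x1800~0x2000 splits into head 0x1800~0x800 (small), middle
// 0x2000~0x1000 (big) and tail 0x3000~0x800 (small); a write confined to
// a single min_alloc_size block takes the _do_write_small path alone.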
int BlueStore::_do_write(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef o,
    uint64_t offset,
    uint64_t length,
    bufferlist& bl,
    uint32_t fadvise_flags)
{
  int r = 0;

  dout(20) << __func__
           << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length
           << " - have 0x" << o->onode.size
           << " (" << std::dec << o->onode.size << ")"
           << " bytes"
           << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec
           << dendl;

  if (length == 0) {
    return 0;
  }

  uint64_t end = offset + length;
  bool was_gc = false;
  GarbageCollector gc(c->store->cct);
  int64_t benefit;
  auto dirty_start = offset;
  auto dirty_end = offset + length;

  WriteContext wctx, wctx_gc;
  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
    dout(20) << __func__ << " will do buffered write" << dendl;
    wctx.buffered = true;
  } else if (cct->_conf->bluestore_default_buffered_write &&
             (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
                               CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
    dout(20) << __func__ << " defaulting to buffered write" << dendl;
    wctx.buffered = true;
  }

  // FIXME: Using the MAX of the block_size_order and preferred_csum_order
  // results in poor small random read performance when data was initially
  // written out in large chunks. Reverting to previous behavior for now.
  wctx.csum_order = block_size_order;

  // compression parameters
  unsigned alloc_hints = o->onode.alloc_hint_flags;
  auto cm = select_option(
    "compression_mode",
    comp_mode.load(),
    [&]() {
      string val;
      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
        return boost::optional<Compressor::CompressionMode>(
          Compressor::get_comp_mode_type(val));
      }
      return boost::optional<Compressor::CompressionMode>();
    }
  );
  wctx.compress = (cm != Compressor::COMP_NONE) &&
    ((cm == Compressor::COMP_FORCE) ||
     (cm == Compressor::COMP_AGGRESSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
     (cm == Compressor::COMP_PASSIVE &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
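  // In other words: 'force' always compresses, 'aggressive' compresses
  // unless the client hinted INCOMPRESSIBLE, 'passive' compresses only if
  // the client hinted COMPRESSIBLE, and 'none' never compresses.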
  if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
      (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
                      CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
      (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
    auto order = min_alloc_size_order.load();
    if (o->onode.expected_write_size) {
      wctx.csum_order = std::max(order,
                                 (uint8_t)ctz(o->onode.expected_write_size));
    } else {
      wctx.csum_order = order;
    }

    if (wctx.compress) {
      wctx.target_blob_size = select_option(
        "compression_max_blob_size",
        comp_max_blob_size.load(),
        [&]() {
          int val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
            return boost::optional<uint64_t>((uint64_t)val);
          }
          return boost::optional<uint64_t>();
        }
      );
    }
  } else {
    if (wctx.compress) {
      wctx.target_blob_size = select_option(
        "compression_min_blob_size",
        comp_min_blob_size.load(),
        [&]() {
          int val;
          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
            return boost::optional<uint64_t>((uint64_t)val);
          }
          return boost::optional<uint64_t>();
        }
      );
    }
  }

  uint64_t max_bsize = max_blob_size.load();
  if (wctx.target_blob_size == 0 || wctx.target_blob_size > max_bsize) {
    wctx.target_blob_size = max_bsize;
  }
  // set the min blob size floor at 2x the min_alloc_size, or else we
  // won't be able to allocate a smaller extent for the compressed
  // data.
  if (wctx.compress &&
      wctx.target_blob_size < min_alloc_size * 2) {
    wctx.target_blob_size = min_alloc_size * 2;
  }
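  // Example (illustrative): with min_alloc_size = 0x10000 the floor keeps
  // target_blob_size at 0x20000 or more, so a blob that compresses 2:1 can
  // shrink from two allocation units to one; a single-AU blob could never
  // occupy less than one AU, making compression pure overhead.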
  wctx_gc.fork(wctx); // make a clone for garbage collection
  dout(20) << __func__ << " prefer csum_order " << wctx.csum_order
           << " target_blob_size 0x" << std::hex << wctx.target_blob_size
           << std::dec << dendl;

  o->extent_map.fault_range(db, offset, length);
  _do_write_data(txc, c, o, offset, length, bl, &wctx);

  r = _do_alloc_write(txc, c, o, &wctx);
  if (r < 0) {
    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
         << dendl;
    goto out;
  }

  benefit = gc.estimate(offset,
                        length,
                        o->extent_map,
                        wctx.old_extents,
                        min_alloc_size);

  _wctx_finish(txc, c, o, &wctx);
  if (end > o->onode.size) {
    dout(20) << __func__ << " extending size to 0x" << std::hex << end
             << std::dec << dendl;
    o->onode.size = end;
  }

  if (benefit >= g_conf->bluestore_gc_enable_total_threshold) {
    dout(20) << __func__ << " perform garbage collection, expected benefit = "
             << benefit << " AUs" << dendl;
    auto& extents_to_collect = gc.get_extents_to_collect();
    for (auto it = extents_to_collect.begin();
         it != extents_to_collect.end();
         ++it) {
      bufferlist bl;
      int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
      assert(r == (int)it->length);
      o->extent_map.fault_range(db, it->offset, it->length);
      _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
      logger->inc(l_bluestore_gc_merged, it->length);
      was_gc = true;

      if (dirty_start > it->offset) {
        dirty_start = it->offset;
      }
      if (dirty_end < it->offset + it->length) {
        dirty_end = it->offset + it->length;
      }
    }
  }
  if (was_gc) {
    dout(30) << __func__ << " alloc write for GC" << dendl;
    r = _do_alloc_write(txc, c, o, &wctx_gc);
    if (r < 0) {
      derr << __func__ << " _do_alloc_write(gc) failed with " << cpp_strerror(r)
           << dendl;
      goto out;
    }
    _wctx_finish(txc, c, o, &wctx_gc);
  }

  o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
  o->extent_map.dirty_range(txc->t, dirty_start, dirty_end - dirty_start);
  r = 0;

 out:
  return r;
}
int BlueStore::_write(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& o,
                      uint64_t offset, size_t length,
                      bufferlist& bl,
                      uint32_t fadvise_flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  o->exists = true;
  _assign_nid(txc, o);
  int r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_zero(TransContext *txc,
                     CollectionRef& c,
                     OnodeRef& o,
                     uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  o->exists = true;
  _assign_nid(txc, o);
  int r = _do_zero(txc, c, o, offset, length);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_do_zero(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        uint64_t offset, size_t length)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << dendl;
  int r = 0;

  WriteContext wctx;
  o->extent_map.fault_range(db, offset, length);
  o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
  o->extent_map.dirty_range(txc->t, offset, length);
  _wctx_finish(txc, c, o, &wctx);

  if (offset + length > o->onode.size) {
    o->onode.size = offset + length;
    dout(20) << __func__ << " extending size to " << offset + length
             << dendl;
  }
  txc->write_onode(o);

  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << "~" << length << std::dec
           << " = " << r << dendl;
  return r;
}
void BlueStore::_do_truncate(
    TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec << dendl;

  if (offset == o->onode.size)
    return;

  if (offset < o->onode.size) {
    WriteContext wctx;
    uint64_t length = o->onode.size - offset;
    o->extent_map.fault_range(db, offset, length);
    o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
    o->extent_map.dirty_range(txc->t, offset, length);
    _wctx_finish(txc, c, o, &wctx);

    // if we have shards past EOF, ask for a reshard
    if (!o->onode.extent_map_shards.empty() &&
        o->onode.extent_map_shards.back().offset >= offset) {
      dout(10) << __func__ << " request reshard past EOF" << dendl;
      if (offset) {
        o->extent_map.request_reshard(offset - 1, offset + length);
      } else {
        o->extent_map.request_reshard(0, length);
      }
    }
  }

  o->onode.size = offset;

  txc->write_onode(o);
}
void BlueStore::_truncate(TransContext *txc,
                          CollectionRef& c,
                          OnodeRef& o,
                          uint64_t offset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " 0x" << std::hex << offset << std::dec
           << dendl;
  _do_truncate(txc, c, o, offset);
}
int BlueStore::_do_remove(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef o)
{
  _do_truncate(txc, c, o, 0);
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o->onode.nid);
  }
  o->exists = false;
  string key;
  for (auto &s : o->extent_map.shards) {
    dout(20) << __func__ << " removing shard 0x" << std::hex
             << s.shard_info->offset << std::dec << dendl;
    generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
      [&](const string& final_key) {
        txc->t->rmkey(PREFIX_OBJ, final_key);
      }
    );
  }
  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
  o->extent_map.clear();
  o->onode = bluestore_onode_t();
  _debug_obj_on_delete(o->oid);
  return 0;
}
int BlueStore::_remove(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef &o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = _do_remove(txc, c, o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_setattr(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o,
                        const string& name,
                        bufferptr& val)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << dendl;
  int r = 0;

  if (val.is_partial())
    o->onode.attrs[name.c_str()] = bufferptr(val.c_str(), val.length());
  else
    o->onode.attrs[name.c_str()] = val;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " (" << val.length() << " bytes)"
           << " = " << r << dendl;
  return r;
}
int BlueStore::_setattrs(TransContext *txc,
                         CollectionRef& c,
                         OnodeRef& o,
                         const map<string,bufferptr>& aset)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << dendl;
  int r = 0;

  for (map<string,bufferptr>::const_iterator p = aset.begin();
       p != aset.end(); ++p) {
    if (p->second.is_partial())
      o->onode.attrs[p->first.c_str()] =
        bufferptr(p->second.c_str(), p->second.length());
    else
      o->onode.attrs[p->first.c_str()] = p->second;
  }
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << aset.size() << " keys"
           << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattr(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& o,
                       const string& name)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << dendl;
  int r = 0;

  auto it = o->onode.attrs.find(name.c_str());
  if (it == o->onode.attrs.end())
    goto out;

  o->onode.attrs.erase(it);
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " " << name << " = " << r << dendl;
  return r;
}
int BlueStore::_rmattrs(TransContext *txc,
                        CollectionRef& c,
                        OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;

  if (o->onode.attrs.empty())
    goto out;

  o->onode.attrs.clear();
  txc->write_onode(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
void BlueStore::_do_omap_clear(TransContext *txc, uint64_t id)
{
  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
  string prefix, tail;
  get_omap_header(id, &prefix);
  get_omap_tail(id, &tail);
  it->lower_bound(prefix);
  while (it->valid()) {
    if (it->key() >= tail) {
      dout(30) << __func__ << " stop at " << pretty_binary_string(tail)
               << dendl;
      break;
    }
    txc->t->rmkey(PREFIX_OMAP, it->key());
    dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
    it->next();
  }
}
int BlueStore::_omap_clear(TransContext *txc,
                           CollectionRef& c,
                           OnodeRef& o)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  if (o->onode.has_omap()) {
    o->flush();
    _do_omap_clear(txc, o->onode.nid);
    o->onode.clear_omap_flag();
    txc->write_onode(o);
  }
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_setkeys(TransContext *txc,
                             CollectionRef& c,
                             OnodeRef& o,
                             bufferlist &bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  bufferlist::iterator p = bl.begin();
  __u32 num;
  if (!o->onode.has_omap()) {
    o->onode.set_omap_flag();
    txc->write_onode(o);
  } else {
    txc->note_modified_object(o);
  }
  string final_key;
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  ::decode(num, p);
  while (num--) {
    string key;
    bufferlist value;
    ::decode(key, p);
    ::decode(value, p);
    final_key.resize(9); // keep prefix
    final_key += key;
    dout(30) << __func__ << " " << pretty_binary_string(final_key)
             << " <- " << key << dendl;
    txc->t->set(PREFIX_OMAP, final_key, value);
  }
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
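
// Omap keys are laid out as "<nid, 8-byte encoded><'.'><user key>", so the
// final_key.resize(9) above rewinds to the 9-byte prefix before appending
// the next user key; all keys of one object therefore sort contiguously
// under PREFIX_OMAP.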
int BlueStore::_omap_setheader(TransContext *txc,
                               CollectionRef& c,
                               OnodeRef &o,
                               bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r;
  string key;
  if (!o->onode.has_omap()) {
    o->onode.set_omap_flag();
    txc->write_onode(o);
  } else {
    txc->note_modified_object(o);
  }
  get_omap_header(o->onode.nid, &key);
  txc->t->set(PREFIX_OMAP, key, bl);
  r = 0;
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkeys(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& o,
                            bufferlist& bl)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  int r = 0;
  bufferlist::iterator p = bl.begin();
  __u32 num;
  string final_key;

  if (!o->onode.has_omap()) {
    goto out;
  }
  _key_encode_u64(o->onode.nid, &final_key);
  final_key.push_back('.');
  ::decode(num, p);
  while (num--) {
    string key;
    ::decode(key, p);
    final_key.resize(9); // keep prefix
    final_key += key;
    dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
             << " <- " << key << dendl;
    txc->t->rmkey(PREFIX_OMAP, final_key);
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_omap_rmkey_range(TransContext *txc,
                                 CollectionRef& c,
                                 OnodeRef& o,
                                 const string& first, const string& last)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
  KeyValueDB::Iterator it;
  string key_first, key_last;
  int r = 0;
  if (!o->onode.has_omap()) {
    goto out;
  }
  it = db->get_iterator(PREFIX_OMAP);
  get_omap_key(o->onode.nid, first, &key_first);
  get_omap_key(o->onode.nid, last, &key_last);
  it->lower_bound(key_first);
  while (it->valid()) {
    if (it->key() >= key_last) {
      dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
               << dendl;
      break;
    }
    txc->t->rmkey(PREFIX_OMAP, it->key());
    dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
    it->next();
  }
  txc->note_modified_object(o);

 out:
  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_set_alloc_hint(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& o,
    uint64_t expected_object_size,
    uint64_t expected_write_size,
    uint32_t flags)
{
  dout(15) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << dendl;
  int r = 0;
  o->onode.expected_object_size = expected_object_size;
  o->onode.expected_write_size = expected_write_size;
  o->onode.alloc_hint_flags = flags;
  txc->write_onode(o);
  dout(10) << __func__ << " " << c->cid << " " << o->oid
           << " object_size " << expected_object_size
           << " write_size " << expected_write_size
           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
           << " = " << r << dendl;
  return r;
}
int BlueStore::_clone(TransContext *txc,
                      CollectionRef& c,
                      OnodeRef& oldo,
                      OnodeRef& newo)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << dendl;
  int r = 0;
  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
    derr << __func__ << " mismatched hash on " << oldo->oid
         << " and " << newo->oid << dendl;
    return -EINVAL;
  }

  newo->exists = true;
  _assign_nid(txc, newo);

  // clone data
  oldo->flush();
  _do_truncate(txc, c, newo, 0);
  if (cct->_conf->bluestore_clone_cow) {
    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
    if (r < 0)
      goto out;
  }

  // clone attrs
  newo->onode.attrs = oldo->onode.attrs;

  // clone omap
  if (newo->onode.has_omap()) {
    dout(20) << __func__ << " clearing old omap data" << dendl;
    newo->flush();
    _do_omap_clear(txc, newo->onode.nid);
  }
  if (oldo->onode.has_omap()) {
    dout(20) << __func__ << " copying omap data" << dendl;
    if (!newo->onode.has_omap()) {
      newo->onode.set_omap_flag();
    }
    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
    string head, tail;
    get_omap_header(oldo->onode.nid, &head);
    get_omap_tail(oldo->onode.nid, &tail);
    it->lower_bound(head);
    while (it->valid()) {
      if (it->key() >= tail) {
        dout(30) << __func__ << " reached tail" << dendl;
        break;
      } else {
        dout(30) << __func__ << " got header/data "
                 << pretty_binary_string(it->key()) << dendl;
        string key;
        rewrite_omap_key(newo->onode.nid, it->key(), &key);
        txc->t->set(PREFIX_OMAP, key, it->value());
      }
      it->next();
    }
  } else {
    newo->onode.clear_omap_flag();
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " = " << r << dendl;
  return r;
}
int BlueStore::_do_clone_range(
    TransContext *txc,
    CollectionRef& c,
    OnodeRef& oldo,
    OnodeRef& newo,
    uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid
           << " 0x" << std::hex << srcoff << "~" << length << " -> "
           << " 0x" << dstoff << "~" << length << std::dec << dendl;
  oldo->extent_map.fault_range(db, srcoff, length);
  newo->extent_map.fault_range(db, dstoff, length);

  // hmm, this could go into an ExtentMap::dup() method.
  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
  for (auto &e : oldo->extent_map.extent_map) {
    e.blob->last_encoded_id = -1;
  }
  int n = 0;
  bool dirtied_oldo = false;
  uint64_t end = srcoff + length;
  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
       ep != oldo->extent_map.extent_map.end();
       ++ep) {
    auto& e = *ep;
    if (e.logical_offset >= end) {
      break;
    }
    dout(20) << __func__ << " src " << e << dendl;
    BlobRef cb;
    bool blob_duped = true;
    if (e.blob->last_encoded_id >= 0) {
      // blob is already duped
      cb = id_to_blob[e.blob->last_encoded_id];
      blob_duped = false;
    } else {
      // dup the blob
      const bluestore_blob_t& blob = e.blob->get_blob();
      // make sure it is shared
      if (!blob.is_shared()) {
        c->make_blob_shared(_assign_blobid(txc), e.blob);
        dirtied_oldo = true; // fixme: overkill
      } else if (!e.blob->shared_blob->is_loaded()) {
        c->load_shared_blob(e.blob->shared_blob);
      }
      cb = new Blob();
      e.blob->last_encoded_id = n;
      id_to_blob[n] = cb;
      e.blob->dup(*cb);
      // bump the extent refs on the copied blob's extents
      for (auto p : blob.get_extents()) {
        if (p.is_valid()) {
          e.blob->shared_blob->get_ref(p.offset, p.length);
        }
      }
      txc->write_shared_blob(e.blob->shared_blob);
      dout(20) << __func__ << " new " << *cb << dendl;
      ++n;
    }

    int skip_front, skip_back;
    if (e.logical_offset < srcoff) {
      skip_front = srcoff - e.logical_offset;
    } else {
      skip_front = 0;
    }
    if (e.logical_end() > end) {
      skip_back = e.logical_end() - end;
    } else {
      skip_back = 0;
    }

    Extent *ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
                            e.blob_offset + skip_front,
                            e.length - skip_front - skip_back, cb);
    newo->extent_map.extent_map.insert(*ne);
    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
    // fixme: we may leave parts of new blob unreferenced that could
    // be freed (relative to the shared_blob).
    txc->statfs_delta.stored() += ne->length;
    if (e.blob->get_blob().is_compressed()) {
      txc->statfs_delta.compressed_original() += ne->length;
      if (blob_duped) {
        txc->statfs_delta.compressed() +=
          cb->get_blob().get_compressed_payload_length();
      }
    }
    dout(20) << __func__ << " dst " << *ne << dendl;
  }
  if (dirtied_oldo) {
    oldo->extent_map.dirty_range(txc->t, srcoff, length); // overkill
    txc->write_onode(oldo);
  }
  txc->write_onode(newo);

  if (dstoff + length > newo->onode.size) {
    newo->onode.size = dstoff + length;
  }
  newo->extent_map.dirty_range(txc->t, dstoff, length);
  return 0;
}
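
// The id_to_blob vector plus Blob::last_encoded_id act as a per-call memo:
// the first lextent touching a blob marks it shared and dups it once;
// later lextents over the same blob see last_encoded_id >= 0 and reuse the
// copy, so each source blob is shared and duplicated exactly once per
// clone.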
int BlueStore::_clone_range(TransContext *txc,
                            CollectionRef& c,
                            OnodeRef& oldo,
                            OnodeRef& newo,
                            uint64_t srcoff, uint64_t length, uint64_t dstoff)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec << dendl;
  int r = 0;

  if (srcoff + length > oldo->onode.size) {
    r = -EINVAL;
    goto out;
  }

  newo->exists = true;
  _assign_nid(txc, newo);

  if (cct->_conf->bluestore_clone_cow) {
    _do_zero(txc, c, newo, dstoff, length);
    _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
  } else {
    bufferlist bl;
    r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
    if (r < 0)
      goto out;
    r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
    if (r < 0)
      goto out;
  }

  txc->write_onode(newo);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
           << " to offset 0x" << dstoff << std::dec
           << " = " << r << dendl;
  return r;
}
int BlueStore::_rename(TransContext *txc,
                       CollectionRef& c,
                       OnodeRef& oldo,
                       OnodeRef& newo,
                       const ghobject_t& new_oid)
{
  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
           << new_oid << dendl;
  int r;
  ghobject_t old_oid = oldo->oid;
  mempool::bluestore_meta_other::string new_okey;

  if (newo->exists) {
    r = -EEXIST;
    goto out;
  }
  assert(txc->onodes.count(newo) == 0);

  txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());

  // rewrite shards
  {
    oldo->extent_map.fault_range(db, 0, oldo->onode.size);
    get_object_key(cct, new_oid, &new_okey);
    string key;
    for (auto &s : oldo->extent_map.shards) {
      generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
        [&](const string& final_key) {
          txc->t->rmkey(PREFIX_OBJ, final_key);
        }
      );
      s.dirty = true;
    }
  }

  newo = oldo;
  txc->write_onode(newo);

  // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
  // Onode in the old slot
  c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
  r = 0;

 out:
  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
           << new_oid << " = " << r << dendl;
  return r;
}
int BlueStore::_create_collection(
    TransContext *txc,
    const coll_t &cid,
    unsigned bits,
    CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
  int r;
  bufferlist bl;

  {
    RWLock::WLocker l(coll_lock);
    if (*c) {
      r = -EEXIST;
      goto out;
    }
    c->reset(
      new Collection(
        this,
        cache_shards[cid.hash_to_shard(cache_shards.size())],
        cid));
    (*c)->cnode.bits = bits;
    coll_map[cid] = *c;
  }
  ::encode((*c)->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(cid), bl);
  r = 0;

 out:
  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
  return r;
}
int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
                                  CollectionRef *c)
{
  dout(15) << __func__ << " " << cid << dendl;
  int r;

  {
    RWLock::WLocker l(coll_lock);
    if (!*c) {
      r = -ENOENT;
      goto out;
    }
    size_t nonexistent_count = 0;
    assert((*c)->exists);
    if ((*c)->onode_map.map_any([&](OnodeRef o) {
          if (o->exists) {
            dout(10) << __func__ << " " << o->oid << " " << o
                     << " exists in onode_map" << dendl;
            return true;
          }
          ++nonexistent_count;
          return false;
        })) {
      r = -ENOTEMPTY;
      goto out;
    }

    vector<ghobject_t> ls;
    ghobject_t next;
    // Enumerate onodes in db, up to nonexistent_count + 1
    // then check if all of them are marked as non-existent.
    // Bypass the check if returned number is greater than nonexistent_count
    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
                         nonexistent_count + 1, &ls, &next);
    if (r >= 0) {
      bool exists = false; //ls.size() > nonexistent_count;
      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
        dout(10) << __func__ << " oid " << *it << dendl;
        auto onode = (*c)->onode_map.lookup(*it);
        exists = !onode || onode->exists;
        if (exists) {
          dout(10) << __func__ << " " << *it
                   << " exists in db" << dendl;
        }
      }
      if (!exists) {
        coll_map.erase(cid);
        txc->removed_collections.push_back(*c);
        (*c)->exists = false;
        c->reset();
        txc->t->rmkey(PREFIX_COLL, stringify(cid));
        r = 0;
      } else {
        dout(10) << __func__ << " " << cid
                 << " is non-empty" << dendl;
        r = -ENOTEMPTY;
      }
    }
  }

 out:
  dout(10) << __func__ << " " << cid << " = " << r << dendl;
  return r;
}
int BlueStore::_split_collection(TransContext *txc,
                                 CollectionRef& c,
                                 CollectionRef& d,
                                 unsigned bits, int rem)
{
  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << dendl;
  RWLock::WLocker l(c->lock);
  RWLock::WLocker l2(d->lock);
  int r;

  // flush all previous deferred writes on this sequencer. this is a bit
  // heavyweight, but we need to make sure all deferred writes complete
  // before we split as the new collection's sequencer may need to order
  // this after those writes, and we don't bother with the complexity of
  // moving those TransContexts over to the new osr.
  _osr_drain_preceding(txc);

  // move any cached items (onodes and referenced shared blobs) that will
  // belong to the child collection post-split. leave everything else behind.
  // this may include things that don't strictly belong to the now-smaller
  // parent split, but the OSD will always send us a split for every new
  // child.
  spg_t pgid, dest_pgid;
  bool is_pg = c->cid.is_pg(&pgid);
  assert(is_pg);
  is_pg = d->cid.is_pg(&dest_pgid);
  assert(is_pg);

  // the destination should initially be empty.
  assert(d->onode_map.empty());
  assert(d->shared_blob_set.empty());
  assert(d->cnode.bits == bits);

  c->split_cache(d.get());

  // adjust bits. note that this will be redundant for all but the first
  // split call for this parent (first child).
  c->cnode.bits = bits;
  assert(d->cnode.bits == bits);
  r = 0;

  bufferlist bl;
  ::encode(c->cnode, bl);
  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);

  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
           << " bits " << bits << " = " << r << dendl;
  return r;
}
// DB key value Histogram
#define KEY_SLAB 32
#define VALUE_SLAB 64

const string prefix_onode = "o";
const string prefix_onode_shard = "x";
const string prefix_other = "Z";

int BlueStore::DBHistogram::get_key_slab(size_t sz)
{
  return (sz / KEY_SLAB);
}

string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
{
  int lower_bound = slab * KEY_SLAB;
  int upper_bound = (slab + 1) * KEY_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}

int BlueStore::DBHistogram::get_value_slab(size_t sz)
{
  return (sz / VALUE_SLAB);
}

string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
{
  int lower_bound = slab * VALUE_SLAB;
  int upper_bound = (slab + 1) * VALUE_SLAB;
  string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
  return ret;
}
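
// Example (illustrative): a 45-byte key lands in key slab 45/32 = 1,
// reported as "[32,64)"; a 100-byte value lands in value slab 100/64 = 1,
// reported as "[64,128)".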
void BlueStore::DBHistogram::update_hist_entry(
  map<string, map<int, struct key_dist> > &key_hist,
  const string &prefix, size_t key_size, size_t value_size)
{
  uint32_t key_slab = get_key_slab(key_size);
  uint32_t value_slab = get_value_slab(value_size);
  key_hist[prefix][key_slab].count++;
  key_hist[prefix][key_slab].max_len = MAX(key_size,
                                           key_hist[prefix][key_slab].max_len);
  key_hist[prefix][key_slab].val_map[value_slab].count++;
  key_hist[prefix][key_slab].val_map[value_slab].max_len =
    MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len);
}

void BlueStore::DBHistogram::dump(Formatter *f)
{
  f->open_object_section("rocksdb_value_distribution");
  for (auto i : value_hist) {
    f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
  }
  f->close_section();

  f->open_object_section("rocksdb_key_value_histogram");
  for (auto i : key_hist) {
    f->dump_string("prefix", i.first);
    f->open_object_section("key_hist");
    for (auto k : i.second) {
      f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
      f->dump_unsigned("max_len", k.second.max_len);
      f->open_object_section("value_hist");
      for (auto j : k.second.val_map) {
        f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
        f->dump_unsigned("max_len", j.second.max_len);
      }
      f->close_section();
    }
    f->close_section();
  }
  f->close_section();
}
// Iterates through the db and collects the stats
void BlueStore::generate_db_histogram(Formatter *f)
{
  uint64_t num_onodes = 0;
  uint64_t num_shards = 0;
  uint64_t num_super = 0;
  uint64_t num_coll = 0;
  uint64_t num_omap = 0;
  uint64_t num_deferred = 0;
  uint64_t num_alloc = 0;
  uint64_t num_stat = 0;
  uint64_t num_others = 0;
  uint64_t num_shared_shards = 0;
  size_t max_key_size = 0, max_value_size = 0;
  uint64_t total_key_size = 0, total_value_size = 0;
  size_t key_size = 0, value_size = 0;
  DBHistogram hist;

  utime_t start = ceph_clock_now();

  KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
  iter->seek_to_first();
  while (iter->valid()) {
    dout(30) << __func__ << " Key: " << iter->key() << dendl;
    key_size = iter->key_size();
    value_size = iter->value_size();
    hist.value_hist[hist.get_value_slab(value_size)]++;
    max_key_size = MAX(max_key_size, key_size);
    max_value_size = MAX(max_value_size, value_size);
    total_key_size += key_size;
    total_value_size += value_size;

    pair<string,string> key(iter->raw_key());

    if (key.first == PREFIX_SUPER) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
      num_super++;
    } else if (key.first == PREFIX_STAT) {
      hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
      num_stat++;
    } else if (key.first == PREFIX_COLL) {
      hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
      num_coll++;
    } else if (key.first == PREFIX_OBJ) {
      if (key.second.back() == ONODE_KEY_SUFFIX) {
        hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
        num_onodes++;
      } else {
        hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size,
                               value_size);
        num_shards++;
      }
    } else if (key.first == PREFIX_OMAP) {
      hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
      num_omap++;
    } else if (key.first == PREFIX_DEFERRED) {
      hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
      num_deferred++;
    } else if (key.first == PREFIX_ALLOC || key.first == "b") {
      hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
      num_alloc++;
    } else if (key.first == PREFIX_SHARED_BLOB) {
      hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size,
                             value_size);
      num_shared_shards++;
    } else {
      hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
      num_others++;
    }
    iter->next();
  }

  utime_t duration = ceph_clock_now() - start;
  f->open_object_section("rocksdb_key_value_stats");
  f->dump_unsigned("num_onodes", num_onodes);
  f->dump_unsigned("num_shards", num_shards);
  f->dump_unsigned("num_super", num_super);
  f->dump_unsigned("num_coll", num_coll);
  f->dump_unsigned("num_omap", num_omap);
  f->dump_unsigned("num_deferred", num_deferred);
  f->dump_unsigned("num_alloc", num_alloc);
  f->dump_unsigned("num_stat", num_stat);
  f->dump_unsigned("num_shared_shards", num_shared_shards);
  f->dump_unsigned("num_others", num_others);
  f->dump_unsigned("max_key_size", max_key_size);
  f->dump_unsigned("max_value_size", max_value_size);
  f->dump_unsigned("total_key_size", total_key_size);
  f->dump_unsigned("total_value_size", total_value_size);
  f->close_section();

  hist.dump(f);

  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
}
void BlueStore::flush_cache()
{
  dout(10) << __func__ << dendl;
  for (auto i : cache_shards) {
    i->trim_all();
  }
  for (auto& p : coll_map) {
    assert(p.second->onode_map.empty());
    assert(p.second->shared_blob_set.empty());
  }
}
10922 void BlueStore::_apply_padding(uint64_t head_pad
,
10925 bufferlist
& padded
)
10930 z
.append_zero(head_pad
);
10931 z
.claim_append(padded
);
10935 padded
.append_zero(tail_pad
);
10937 if (head_pad
|| tail_pad
) {
10938 dout(20) << __func__
<< " can pad head 0x" << std::hex
<< head_pad
10939 << " tail 0x" << tail_pad
<< std::dec
<< dendl
;
10940 logger
->inc(l_bluestore_write_pad_bytes
, head_pad
+ tail_pad
);
10944 // ===========================================