1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "BitmapFreelistManager.h"
7 #include "kv/KeyValueDB.h"
9 #include "include/stringify.h"
11 #include "common/debug.h"
13 #define dout_context cct
14 #define dout_subsys ceph_subsys_bluestore
16 #define dout_prefix *_dout << "freelist "
20 using ceph::bufferlist
;
21 using ceph::bufferptr
;
25 void make_offset_key(uint64_t offset
, std::string
*key
)
28 _key_encode_u64(offset
, key
);
31 struct XorMergeOperator
: public KeyValueDB::MergeOperator
{
32 void merge_nonexistent(
33 const char *rdata
, size_t rlen
, std::string
*new_value
) override
{
34 *new_value
= std::string(rdata
, rlen
);
37 const char *ldata
, size_t llen
,
38 const char *rdata
, size_t rlen
,
39 std::string
*new_value
) override
{
40 ceph_assert(llen
== rlen
);
41 *new_value
= std::string(ldata
, llen
);
42 for (size_t i
= 0; i
< rlen
; ++i
) {
43 (*new_value
)[i
] ^= rdata
[i
];
46 // We use each operator name and each prefix to construct the
47 // overall RocksDB operator name for consistency check at open time.
48 const char *name() const override
{
53 void BitmapFreelistManager::setup_merge_operator(KeyValueDB
*db
, string prefix
)
55 std::shared_ptr
<XorMergeOperator
> merge_op(new XorMergeOperator
);
56 db
->set_merge_operator(prefix
, merge_op
);
59 BitmapFreelistManager::BitmapFreelistManager(CephContext
* cct
,
62 : FreelistManager(cct
),
63 meta_prefix(meta_prefix
),
64 bitmap_prefix(bitmap_prefix
),
69 int BitmapFreelistManager::create(uint64_t new_size
, uint64_t granularity
,
70 uint64_t zone_size
, uint64_t first_sequential_zone
,
71 KeyValueDB::Transaction txn
)
73 bytes_per_block
= granularity
;
74 ceph_assert(std::has_single_bit(bytes_per_block
));
75 size
= p2align(new_size
, bytes_per_block
);
76 blocks_per_key
= cct
->_conf
->bluestore_freelist_blocks_per_key
;
80 blocks
= size_2_block_count(size
);
81 if (blocks
* bytes_per_block
> size
) {
82 dout(10) << __func__
<< " rounding blocks up from 0x" << std::hex
<< size
83 << " to 0x" << (blocks
* bytes_per_block
)
84 << " (0x" << blocks
<< " blocks)" << std::dec
<< dendl
;
85 // set past-eof blocks as allocated
86 _xor(size
, blocks
* bytes_per_block
- size
, txn
);
89 << " size 0x" << std::hex
<< size
90 << " bytes_per_block 0x" << bytes_per_block
91 << " blocks 0x" << blocks
92 << " blocks_per_key 0x" << blocks_per_key
96 encode(bytes_per_block
, bl
);
97 txn
->set(meta_prefix
, "bytes_per_block", bl
);
101 encode(blocks_per_key
, bl
);
102 txn
->set(meta_prefix
, "blocks_per_key", bl
);
107 txn
->set(meta_prefix
, "blocks", bl
);
112 txn
->set(meta_prefix
, "size", bl
);
117 int BitmapFreelistManager::_expand(uint64_t old_size
, KeyValueDB
* db
)
119 assert(old_size
< size
);
120 ceph_assert(std::has_single_bit(bytes_per_block
));
122 KeyValueDB::Transaction txn
;
123 txn
= db
->get_transaction();
125 auto blocks0
= size_2_block_count(old_size
);
126 if (blocks0
* bytes_per_block
> old_size
) {
127 dout(10) << __func__
<< " rounding1 blocks up from 0x" << std::hex
128 << old_size
<< " to 0x" << (blocks0
* bytes_per_block
)
129 << " (0x" << blocks0
<< " blocks)" << std::dec
<< dendl
;
130 // reset past-eof blocks to unallocated
131 _xor(old_size
, blocks0
* bytes_per_block
- old_size
, txn
);
134 size
= p2align(size
, bytes_per_block
);
135 blocks
= size_2_block_count(size
);
137 if (blocks
* bytes_per_block
> size
) {
138 dout(10) << __func__
<< " rounding2 blocks up from 0x" << std::hex
139 << size
<< " to 0x" << (blocks
* bytes_per_block
)
140 << " (0x" << blocks
<< " blocks)" << std::dec
<< dendl
;
141 // set past-eof blocks as allocated
142 _xor(size
, blocks
* bytes_per_block
- size
, txn
);
146 << " size 0x" << std::hex
<< size
147 << " bytes_per_block 0x" << bytes_per_block
148 << " blocks 0x" << blocks
149 << " blocks_per_key 0x" << blocks_per_key
150 << std::dec
<< dendl
;
154 txn
->set(meta_prefix
, "blocks", bl
);
159 txn
->set(meta_prefix
, "size", bl
);
161 db
->submit_transaction_sync(txn
);
166 int BitmapFreelistManager::read_size_meta_from_db(KeyValueDB
* kvdb
,
170 int r
= kvdb
->get(meta_prefix
, "size", &v
);
172 derr
<< __func__
<< " missing size meta in DB" << dendl
;
182 void BitmapFreelistManager::_load_from_db(KeyValueDB
* kvdb
)
184 KeyValueDB::Iterator it
= kvdb
->get_iterator(meta_prefix
);
185 it
->lower_bound(string());
188 while (it
->valid()) {
189 string k
= it
->key();
190 if (k
== "bytes_per_block") {
191 bufferlist bl
= it
->value();
192 auto p
= bl
.cbegin();
193 decode(bytes_per_block
, p
);
194 dout(10) << __func__
<< " bytes_per_block 0x" << std::hex
195 << bytes_per_block
<< std::dec
<< dendl
;
196 } else if (k
== "blocks") {
197 bufferlist bl
= it
->value();
198 auto p
= bl
.cbegin();
200 dout(10) << __func__
<< " blocks 0x" << std::hex
<< blocks
<< std::dec
202 } else if (k
== "size") {
203 bufferlist bl
= it
->value();
204 auto p
= bl
.cbegin();
206 dout(10) << __func__
<< " size 0x" << std::hex
<< size
<< std::dec
208 } else if (k
== "blocks_per_key") {
209 bufferlist bl
= it
->value();
210 auto p
= bl
.cbegin();
211 decode(blocks_per_key
, p
);
212 dout(10) << __func__
<< " blocks_per_key 0x" << std::hex
<< blocks_per_key
213 << std::dec
<< dendl
;
215 derr
<< __func__
<< " unrecognized meta " << k
<< dendl
;
222 int BitmapFreelistManager::init(KeyValueDB
*kvdb
, bool db_in_read_only
,
223 std::function
<int(const std::string
&, std::string
*)> cfg_reader
)
225 dout(1) << __func__
<< dendl
;
226 int r
= _read_cfg(cfg_reader
);
228 dout(1) << __func__
<< " fall back to legacy meta repo" << dendl
;
231 _sync(kvdb
, db_in_read_only
);
233 dout(10) << __func__
<< std::hex
234 << " size 0x" << size
235 << " bytes_per_block 0x" << bytes_per_block
236 << " blocks 0x" << blocks
237 << " blocks_per_key 0x" << blocks_per_key
238 << std::dec
<< dendl
;
243 int BitmapFreelistManager::_read_cfg(
244 std::function
<int(const std::string
&, std::string
*)> cfg_reader
)
246 dout(1) << __func__
<< dendl
;
250 const size_t key_count
= 4;
251 string keys
[key_count
] = {
254 "bfm_bytes_per_block",
255 "bfm_blocks_per_key"};
256 uint64_t* vals
[key_count
] = {
262 for (size_t i
= 0; i
< key_count
; i
++) {
264 int r
= cfg_reader(keys
[i
], &val
);
266 *(vals
[i
]) = strict_iecstrtoll(val
, &err
);
268 derr
<< __func__
<< " Failed to parse - "
269 << keys
[i
] << ":" << val
270 << ", error: " << err
<< dendl
;
274 // this is expected for legacy deployed OSDs
275 dout(0) << __func__
<< " " << keys
[i
] << " not found in bdev meta" << dendl
;
283 void BitmapFreelistManager::_init_misc()
285 bufferptr
z(blocks_per_key
>> 3);
286 memset(z
.c_str(), 0xff, z
.length());
288 all_set_bl
.append(z
);
290 block_mask
= ~(bytes_per_block
- 1);
292 bytes_per_key
= bytes_per_block
* blocks_per_key
;
293 key_mask
= ~(bytes_per_key
- 1);
294 dout(10) << __func__
<< std::hex
<< " bytes_per_key 0x" << bytes_per_key
295 << ", key_mask 0x" << key_mask
<< std::dec
299 void BitmapFreelistManager::sync(KeyValueDB
* kvdb
)
304 void BitmapFreelistManager::_sync(KeyValueDB
* kvdb
, bool read_only
)
306 dout(10) << __func__
<< " checks if size sync is needed" << dendl
;
307 uint64_t size_db
= 0;
308 int r
= read_size_meta_from_db(kvdb
, &size_db
);
310 if (!read_only
&& size_db
< size
) {
311 dout(1) << __func__
<< " committing new size 0x" << std::hex
<< size
312 << std::dec
<< dendl
;
313 r
= _expand(size_db
, kvdb
);
315 } else if (size_db
> size
) {
316 // this might hapen when OSD passed the following sequence:
317 // upgrade -> downgrade -> expand -> upgrade
318 // One needs to run expand once again to syncup
319 dout(1) << __func__
<< " fall back to legacy meta repo" << dendl
;
324 void BitmapFreelistManager::shutdown()
326 dout(1) << __func__
<< dendl
;
329 void BitmapFreelistManager::enumerate_reset()
331 std::lock_guard
l(lock
);
332 enumerate_offset
= 0;
333 enumerate_bl_pos
= 0;
334 enumerate_bl
.clear();
338 int get_next_clear_bit(bufferlist
& bl
, int start
)
340 const char *p
= bl
.c_str();
341 int bits
= bl
.length() << 3;
342 while (start
< bits
) {
343 // byte = start / 8 (or start >> 3)
344 // bit = start % 8 (or start & 7)
345 unsigned char byte_mask
= 1 << (start
& 7);
346 if ((p
[start
>> 3] & byte_mask
) == 0) {
351 return -1; // not found
354 int get_next_set_bit(bufferlist
& bl
, int start
)
356 const char *p
= bl
.c_str();
357 int bits
= bl
.length() << 3;
358 while (start
< bits
) {
359 int which_byte
= start
/ 8;
360 int which_bit
= start
% 8;
361 unsigned char byte_mask
= 1 << which_bit
;
362 if (p
[which_byte
] & byte_mask
) {
367 return -1; // not found
370 bool BitmapFreelistManager::enumerate_next(KeyValueDB
*kvdb
, uint64_t *offset
, uint64_t *length
)
372 std::lock_guard
l(lock
);
374 // initial base case is a bit awkward
375 if (enumerate_offset
== 0 && enumerate_bl_pos
== 0) {
376 dout(10) << __func__
<< " start" << dendl
;
377 enumerate_p
= kvdb
->get_iterator(bitmap_prefix
);
378 enumerate_p
->lower_bound(string());
379 // we assert that the first block is always allocated; it's true,
380 // and it simplifies our lives a bit.
381 ceph_assert(enumerate_p
->valid());
382 string k
= enumerate_p
->key();
383 const char *p
= k
.c_str();
384 _key_decode_u64(p
, &enumerate_offset
);
385 enumerate_bl
= enumerate_p
->value();
386 ceph_assert(enumerate_offset
== 0);
387 ceph_assert(get_next_set_bit(enumerate_bl
, 0) == 0);
390 if (enumerate_offset
>= size
) {
391 dout(10) << __func__
<< " end" << dendl
;
395 // skip set bits to find offset
397 enumerate_bl_pos
= get_next_clear_bit(enumerate_bl
, enumerate_bl_pos
);
398 if (enumerate_bl_pos
>= 0) {
399 *offset
= _get_offset(enumerate_offset
, enumerate_bl_pos
);
400 dout(30) << __func__
<< " found clear bit, key 0x" << std::hex
401 << enumerate_offset
<< " bit 0x" << enumerate_bl_pos
402 << " offset 0x" << *offset
403 << std::dec
<< dendl
;
406 dout(30) << " no more clear bits in 0x" << std::hex
<< enumerate_offset
407 << std::dec
<< dendl
;
409 enumerate_bl
.clear();
410 if (!enumerate_p
->valid()) {
411 enumerate_offset
+= bytes_per_key
;
412 enumerate_bl_pos
= 0;
413 *offset
= _get_offset(enumerate_offset
, enumerate_bl_pos
);
416 string k
= enumerate_p
->key();
417 const char *p
= k
.c_str();
418 uint64_t next
= enumerate_offset
+ bytes_per_key
;
419 _key_decode_u64(p
, &enumerate_offset
);
420 enumerate_bl
= enumerate_p
->value();
421 enumerate_bl_pos
= 0;
422 if (enumerate_offset
> next
) {
423 dout(30) << " no key at 0x" << std::hex
<< next
<< ", got 0x"
424 << enumerate_offset
<< std::dec
<< dendl
;
430 // skip clear bits to find the end
432 if (enumerate_p
->valid()) {
434 enumerate_bl_pos
= get_next_set_bit(enumerate_bl
, enumerate_bl_pos
);
435 if (enumerate_bl_pos
>= 0) {
436 end
= _get_offset(enumerate_offset
, enumerate_bl_pos
);
437 dout(30) << __func__
<< " found set bit, key 0x" << std::hex
438 << enumerate_offset
<< " bit 0x" << enumerate_bl_pos
439 << " offset 0x" << end
<< std::dec
441 end
= std::min(get_alloc_units() * bytes_per_block
, end
);
442 *length
= end
- *offset
;
443 dout(10) << __func__
<< std::hex
<< " 0x" << *offset
<< "~" << *length
444 << std::dec
<< dendl
;
447 dout(30) << " no more set bits in 0x" << std::hex
<< enumerate_offset
448 << std::dec
<< dendl
;
450 enumerate_bl
.clear();
451 enumerate_bl_pos
= 0;
452 if (!enumerate_p
->valid()) {
455 string k
= enumerate_p
->key();
456 const char *p
= k
.c_str();
457 _key_decode_u64(p
, &enumerate_offset
);
458 enumerate_bl
= enumerate_p
->value();
462 if (enumerate_offset
< size
) {
463 end
= get_alloc_units() * bytes_per_block
;
464 *length
= end
- *offset
;
465 dout(10) << __func__
<< std::hex
<< " 0x" << *offset
<< "~" << *length
466 << std::dec
<< dendl
;
467 enumerate_offset
= size
;
468 enumerate_bl_pos
= blocks_per_key
;
472 dout(10) << __func__
<< " end" << dendl
;
476 void BitmapFreelistManager::dump(KeyValueDB
*kvdb
)
479 uint64_t offset
, length
;
480 while (enumerate_next(kvdb
, &offset
, &length
)) {
481 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
482 << std::dec
<< dendl
;
486 void BitmapFreelistManager::allocate(
487 uint64_t offset
, uint64_t length
,
488 KeyValueDB::Transaction txn
)
490 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
491 << std::dec
<< dendl
;
492 if (!is_null_manager()) {
493 _xor(offset
, length
, txn
);
497 void BitmapFreelistManager::release(
498 uint64_t offset
, uint64_t length
,
499 KeyValueDB::Transaction txn
)
501 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
502 << std::dec
<< dendl
;
503 if (!is_null_manager()) {
504 _xor(offset
, length
, txn
);
508 void BitmapFreelistManager::_xor(
509 uint64_t offset
, uint64_t length
,
510 KeyValueDB::Transaction txn
)
512 // must be block aligned
513 ceph_assert((offset
& block_mask
) == offset
);
514 ceph_assert((length
& block_mask
) == length
);
516 uint64_t first_key
= offset
& key_mask
;
517 uint64_t last_key
= (offset
+ length
- 1) & key_mask
;
518 dout(20) << __func__
<< " first_key 0x" << std::hex
<< first_key
519 << " last_key 0x" << last_key
<< std::dec
<< dendl
;
521 if (first_key
== last_key
) {
522 bufferptr
p(blocks_per_key
>> 3);
524 unsigned s
= (offset
& ~key_mask
) / bytes_per_block
;
525 unsigned e
= ((offset
+ length
- 1) & ~key_mask
) / bytes_per_block
;
526 for (unsigned i
= s
; i
<= e
; ++i
) {
527 p
[i
>> 3] ^= 1ull << (i
& 7);
530 make_offset_key(first_key
, &k
);
533 dout(30) << __func__
<< " 0x" << std::hex
<< first_key
<< std::dec
<< ": ";
534 bl
.hexdump(*_dout
, false);
536 txn
->merge(bitmap_prefix
, k
, bl
);
540 bufferptr
p(blocks_per_key
>> 3);
542 unsigned s
= (offset
& ~key_mask
) / bytes_per_block
;
543 unsigned e
= blocks_per_key
;
544 for (unsigned i
= s
; i
< e
; ++i
) {
545 p
[i
>> 3] ^= 1ull << (i
& 7);
548 make_offset_key(first_key
, &k
);
551 dout(30) << __func__
<< " 0x" << std::hex
<< first_key
<< std::dec
<< ": ";
552 bl
.hexdump(*_dout
, false);
554 txn
->merge(bitmap_prefix
, k
, bl
);
555 first_key
+= bytes_per_key
;
558 while (first_key
< last_key
) {
560 make_offset_key(first_key
, &k
);
561 dout(30) << __func__
<< " 0x" << std::hex
<< first_key
<< std::dec
563 all_set_bl
.hexdump(*_dout
, false);
565 txn
->merge(bitmap_prefix
, k
, all_set_bl
);
566 first_key
+= bytes_per_key
;
568 ceph_assert(first_key
== last_key
);
570 bufferptr
p(blocks_per_key
>> 3);
572 unsigned e
= ((offset
+ length
- 1) & ~key_mask
) / bytes_per_block
;
573 for (unsigned i
= 0; i
<= e
; ++i
) {
574 p
[i
>> 3] ^= 1ull << (i
& 7);
577 make_offset_key(first_key
, &k
);
580 dout(30) << __func__
<< " 0x" << std::hex
<< first_key
<< std::dec
<< ": ";
581 bl
.hexdump(*_dout
, false);
583 txn
->merge(bitmap_prefix
, k
, bl
);
588 uint64_t BitmapFreelistManager::size_2_block_count(uint64_t target_size
) const
590 auto target_blocks
= target_size
/ bytes_per_block
;
591 if (target_blocks
/ blocks_per_key
* blocks_per_key
!= target_blocks
) {
592 target_blocks
= (target_blocks
/ blocks_per_key
+ 1) * blocks_per_key
;
594 return target_blocks
;
597 void BitmapFreelistManager::get_meta(
598 uint64_t target_size
,
599 std::vector
<std::pair
<string
, string
>>* res
) const
601 if (target_size
== 0) {
602 res
->emplace_back("bfm_blocks", stringify(blocks
));
603 res
->emplace_back("bfm_size", stringify(size
));
605 target_size
= p2align(target_size
, bytes_per_block
);
606 auto target_blocks
= size_2_block_count(target_size
);
608 res
->emplace_back("bfm_blocks", stringify(target_blocks
));
609 res
->emplace_back("bfm_size", stringify(target_size
));
611 res
->emplace_back("bfm_bytes_per_block", stringify(bytes_per_block
));
612 res
->emplace_back("bfm_blocks_per_key", stringify(blocks_per_key
));