1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "BitmapFreelistManager.h"
5 #include "kv/KeyValueDB.h"
8 #include "common/debug.h"
10 #define dout_context cct
11 #define dout_subsys ceph_subsys_bluestore
13 #define dout_prefix *_dout << "freelist "
15 void make_offset_key(uint64_t offset
, std::string
*key
)
18 _key_encode_u64(offset
, key
);
21 struct XorMergeOperator
: public KeyValueDB::MergeOperator
{
22 void merge_nonexistent(
23 const char *rdata
, size_t rlen
, std::string
*new_value
) override
{
24 *new_value
= std::string(rdata
, rlen
);
27 const char *ldata
, size_t llen
,
28 const char *rdata
, size_t rlen
,
29 std::string
*new_value
) override
{
31 *new_value
= std::string(ldata
, llen
);
32 for (size_t i
= 0; i
< rlen
; ++i
) {
33 (*new_value
)[i
] ^= rdata
[i
];
36 // We use each operator name and each prefix to construct the
37 // overall RocksDB operator name for consistency check at open time.
38 string
name() const override
{
43 void BitmapFreelistManager::setup_merge_operator(KeyValueDB
*db
, string prefix
)
45 ceph::shared_ptr
<XorMergeOperator
> merge_op(new XorMergeOperator
);
46 db
->set_merge_operator(prefix
, merge_op
);
49 BitmapFreelistManager::BitmapFreelistManager(CephContext
* cct
,
53 : FreelistManager(cct
),
54 meta_prefix(meta_prefix
),
55 bitmap_prefix(bitmap_prefix
),
61 int BitmapFreelistManager::create(uint64_t new_size
, uint64_t granularity
,
62 KeyValueDB::Transaction txn
)
64 bytes_per_block
= granularity
;
65 assert(ISP2(bytes_per_block
));
66 size
= P2ALIGN(new_size
, bytes_per_block
);
67 blocks_per_key
= cct
->_conf
->bluestore_freelist_blocks_per_key
;
71 blocks
= size
/ bytes_per_block
;
72 if (blocks
/ blocks_per_key
* blocks_per_key
!= blocks
) {
73 blocks
= (blocks
/ blocks_per_key
+ 1) * blocks_per_key
;
74 dout(10) << __func__
<< " rounding blocks up from 0x" << std::hex
<< size
75 << " to 0x" << (blocks
* bytes_per_block
)
76 << " (0x" << blocks
<< " blocks)" << std::dec
<< dendl
;
77 // set past-eof blocks as allocated
78 _xor(size
, blocks
* bytes_per_block
- size
, txn
);
81 << " size 0x" << std::hex
<< size
82 << " bytes_per_block 0x" << bytes_per_block
83 << " blocks 0x" << blocks
84 << " blocks_per_key 0x" << blocks_per_key
88 ::encode(bytes_per_block
, bl
);
89 txn
->set(meta_prefix
, "bytes_per_block", bl
);
93 ::encode(blocks_per_key
, bl
);
94 txn
->set(meta_prefix
, "blocks_per_key", bl
);
99 txn
->set(meta_prefix
, "blocks", bl
);
104 txn
->set(meta_prefix
, "size", bl
);
109 int BitmapFreelistManager::init(uint64_t dev_size
)
111 dout(1) << __func__
<< dendl
;
113 KeyValueDB::Iterator it
= kvdb
->get_iterator(meta_prefix
);
114 it
->lower_bound(string());
117 while (it
->valid()) {
118 string k
= it
->key();
119 if (k
== "bytes_per_block") {
120 bufferlist bl
= it
->value();
121 bufferlist::iterator p
= bl
.begin();
122 ::decode(bytes_per_block
, p
);
123 dout(10) << __func__
<< " bytes_per_block 0x" << std::hex
124 << bytes_per_block
<< std::dec
<< dendl
;
125 } else if (k
== "blocks") {
126 bufferlist bl
= it
->value();
127 bufferlist::iterator p
= bl
.begin();
129 dout(10) << __func__
<< " blocks 0x" << std::hex
<< blocks
<< std::dec
131 } else if (k
== "size") {
132 bufferlist bl
= it
->value();
133 bufferlist::iterator p
= bl
.begin();
135 dout(10) << __func__
<< " size 0x" << std::hex
<< size
<< std::dec
137 } else if (k
== "blocks_per_key") {
138 bufferlist bl
= it
->value();
139 bufferlist::iterator p
= bl
.begin();
140 ::decode(blocks_per_key
, p
);
141 dout(10) << __func__
<< " blocks_per_key 0x" << std::hex
<< blocks_per_key
142 << std::dec
<< dendl
;
144 derr
<< __func__
<< " unrecognized meta " << k
<< dendl
;
150 dout(10) << __func__
<< std::hex
151 << " size 0x" << size
152 << " bytes_per_block 0x" << bytes_per_block
153 << " blocks 0x" << blocks
154 << " blocks_per_key 0x" << blocks_per_key
155 << std::dec
<< dendl
;
158 // check for http://tracker.ceph.com/issues/21089 inconsistency
160 uint64_t new_size
= P2ALIGN(dev_size
, bytes_per_block
);
161 if (new_size
!= size
) {
162 uint64_t bad_size
= new_size
& ~bytes_per_block
;
163 if (size
== bad_size
) {
164 derr
<< __func__
<< " size is 0x" << std::hex
<< size
<< " should be 0x"
165 << new_size
<< " and appears to be due to #21089" << std::dec
168 uint64_t new_blocks
= new_size
/ bytes_per_block
;
169 if (new_blocks
/ blocks_per_key
* blocks_per_key
!= new_blocks
) {
170 new_blocks
= (new_blocks
/ blocks_per_key
+ 1) *
174 KeyValueDB::Transaction t
= kvdb
->get_transaction();
177 ::encode(new_size
, sizebl
);
178 t
->set(meta_prefix
, "size", sizebl
);
180 if (new_blocks
!= blocks
) {
181 derr
<< "blocks is 0x" << std::hex
<< blocks
<< " should be 0x"
182 << new_blocks
<< std::dec
<< dendl
;
184 ::encode(new_blocks
, bl
);
185 t
->set(meta_prefix
, "blocks", bl
);
186 _xor(new_size
, new_blocks
* bytes_per_block
- new_size
, t
);
188 derr
<< "blocks are ok" << dendl
;
189 _xor(bad_size
, bytes_per_block
, t
);
191 int r
= kvdb
->submit_transaction_sync(t
);
195 derr
<< __func__
<< " fixed inconsistency, size now 0x" << std::hex
196 << size
<< " blocks 0x" << blocks
<< std::dec
<< dendl
;
203 void BitmapFreelistManager::_init_misc()
205 bufferptr
z(blocks_per_key
>> 3);
206 memset(z
.c_str(), 0xff, z
.length());
208 all_set_bl
.append(z
);
210 block_mask
= ~(bytes_per_block
- 1);
212 bytes_per_key
= bytes_per_block
* blocks_per_key
;
213 key_mask
= ~(bytes_per_key
- 1);
214 dout(10) << __func__
<< std::hex
<< " bytes_per_key 0x" << bytes_per_key
215 << ", key_mask 0x" << key_mask
<< std::dec
219 void BitmapFreelistManager::shutdown()
221 dout(1) << __func__
<< dendl
;
224 void BitmapFreelistManager::enumerate_reset()
226 std::lock_guard
<std::mutex
> l(lock
);
227 enumerate_offset
= 0;
228 enumerate_bl_pos
= 0;
229 enumerate_bl
.clear();
233 int get_next_clear_bit(bufferlist
& bl
, int start
)
235 const char *p
= bl
.c_str();
236 int bits
= bl
.length() << 3;
237 while (start
< bits
) {
238 // byte = start / 8 (or start >> 3)
239 // bit = start % 8 (or start & 7)
240 unsigned char byte_mask
= 1 << (start
& 7);
241 if ((p
[start
>> 3] & byte_mask
) == 0) {
246 return -1; // not found
249 int get_next_set_bit(bufferlist
& bl
, int start
)
251 const char *p
= bl
.c_str();
252 int bits
= bl
.length() << 3;
253 while (start
< bits
) {
254 int which_byte
= start
/ 8;
255 int which_bit
= start
% 8;
256 unsigned char byte_mask
= 1 << which_bit
;
257 if (p
[which_byte
] & byte_mask
) {
262 return -1; // not found
265 bool BitmapFreelistManager::enumerate_next(uint64_t *offset
, uint64_t *length
)
267 std::lock_guard
<std::mutex
> l(lock
);
269 // initial base case is a bit awkward
270 if (enumerate_offset
== 0 && enumerate_bl_pos
== 0) {
271 dout(10) << __func__
<< " start" << dendl
;
272 enumerate_p
= kvdb
->get_iterator(bitmap_prefix
);
273 enumerate_p
->lower_bound(string());
274 // we assert that the first block is always allocated; it's true,
275 // and it simplifies our lives a bit.
276 assert(enumerate_p
->valid());
277 string k
= enumerate_p
->key();
278 const char *p
= k
.c_str();
279 _key_decode_u64(p
, &enumerate_offset
);
280 enumerate_bl
= enumerate_p
->value();
281 assert(enumerate_offset
== 0);
282 assert(get_next_set_bit(enumerate_bl
, 0) == 0);
285 if (enumerate_offset
>= size
) {
286 dout(10) << __func__
<< " end" << dendl
;
290 // skip set bits to find offset
292 enumerate_bl_pos
= get_next_clear_bit(enumerate_bl
, enumerate_bl_pos
);
293 if (enumerate_bl_pos
>= 0) {
294 *offset
= _get_offset(enumerate_offset
, enumerate_bl_pos
);
295 dout(30) << __func__
<< " found clear bit, key 0x" << std::hex
296 << enumerate_offset
<< " bit 0x" << enumerate_bl_pos
297 << " offset 0x" << *offset
298 << std::dec
<< dendl
;
301 dout(30) << " no more clear bits in 0x" << std::hex
<< enumerate_offset
302 << std::dec
<< dendl
;
304 enumerate_bl
.clear();
305 if (!enumerate_p
->valid()) {
306 enumerate_offset
+= bytes_per_key
;
307 enumerate_bl_pos
= 0;
308 *offset
= _get_offset(enumerate_offset
, enumerate_bl_pos
);
311 string k
= enumerate_p
->key();
312 const char *p
= k
.c_str();
313 uint64_t next
= enumerate_offset
+ bytes_per_key
;
314 _key_decode_u64(p
, &enumerate_offset
);
315 enumerate_bl
= enumerate_p
->value();
316 enumerate_bl_pos
= 0;
317 if (enumerate_offset
> next
) {
318 dout(30) << " no key at 0x" << std::hex
<< next
<< ", got 0x"
319 << enumerate_offset
<< std::dec
<< dendl
;
325 // skip clear bits to find the end
327 if (enumerate_p
->valid()) {
329 enumerate_bl_pos
= get_next_set_bit(enumerate_bl
, enumerate_bl_pos
);
330 if (enumerate_bl_pos
>= 0) {
331 end
= _get_offset(enumerate_offset
, enumerate_bl_pos
);
332 dout(30) << __func__
<< " found set bit, key 0x" << std::hex
333 << enumerate_offset
<< " bit 0x" << enumerate_bl_pos
334 << " offset 0x" << end
<< std::dec
336 end
= std::min(get_alloc_units() * bytes_per_block
, end
);
337 *length
= end
- *offset
;
338 dout(10) << __func__
<< std::hex
<< " 0x" << *offset
<< "~" << *length
339 << std::dec
<< dendl
;
342 dout(30) << " no more set bits in 0x" << std::hex
<< enumerate_offset
343 << std::dec
<< dendl
;
345 enumerate_bl
.clear();
346 enumerate_bl_pos
= 0;
347 if (!enumerate_p
->valid()) {
350 string k
= enumerate_p
->key();
351 const char *p
= k
.c_str();
352 _key_decode_u64(p
, &enumerate_offset
);
353 enumerate_bl
= enumerate_p
->value();
357 if (enumerate_offset
< size
) {
358 end
= get_alloc_units() * bytes_per_block
;
359 *length
= end
- *offset
;
360 dout(10) << __func__
<< std::hex
<< " 0x" << *offset
<< "~" << *length
361 << std::dec
<< dendl
;
362 enumerate_offset
= size
;
363 enumerate_bl_pos
= blocks_per_key
;
367 dout(10) << __func__
<< " end" << dendl
;
371 void BitmapFreelistManager::dump()
374 uint64_t offset
, length
;
375 while (enumerate_next(&offset
, &length
)) {
376 dout(20) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
377 << std::dec
<< dendl
;
381 void BitmapFreelistManager::_verify_range(uint64_t offset
, uint64_t length
,
385 uint64_t first_key
= offset
& key_mask
;
386 uint64_t last_key
= (offset
+ length
- 1) & key_mask
;
387 if (first_key
== last_key
) {
389 make_offset_key(first_key
, &k
);
391 kvdb
->get(bitmap_prefix
, k
, &bl
);
392 if (bl
.length() > 0) {
393 const char *p
= bl
.c_str();
394 unsigned s
= (offset
& ~key_mask
) / bytes_per_block
;
395 unsigned e
= ((offset
+ length
- 1) & ~key_mask
) / bytes_per_block
;
396 for (unsigned i
= s
; i
<= e
; ++i
) {
397 int has
= !!(p
[i
>> 3] & (1ull << (i
& 7)));
399 derr
<< __func__
<< " key 0x" << std::hex
<< first_key
<< " bit 0x"
400 << i
<< " has 0x" << has
<< " expected 0x" << val
401 << std::dec
<< dendl
;
407 derr
<< __func__
<< " key 0x" << std::hex
<< first_key
408 << " not present, expected 0x" << val
<< std::dec
<< dendl
;
416 make_offset_key(first_key
, &k
);
418 kvdb
->get(bitmap_prefix
, k
, &bl
);
420 const char *p
= bl
.c_str();
421 unsigned s
= (offset
& ~key_mask
) / bytes_per_block
;
422 unsigned e
= blocks_per_key
;
423 for (unsigned i
= s
; i
< e
; ++i
) {
424 int has
= !!(p
[i
>> 3] & (1ull << (i
& 7)));
426 derr
<< __func__
<< " key 0x" << std::hex
<< first_key
<< " bit 0x"
427 << i
<< " has 0x" << has
<< " expected 0x" << val
<< std::dec
434 derr
<< __func__
<< " key 0x" << std::hex
<< first_key
435 << " not present, expected 0x" << val
<< std::dec
<< dendl
;
439 first_key
+= bytes_per_key
;
442 if (first_key
< last_key
) {
443 while (first_key
< last_key
) {
445 make_offset_key(first_key
, &k
);
447 kvdb
->get(bitmap_prefix
, k
, &bl
);
448 if (bl
.length() > 0) {
449 const char *p
= bl
.c_str();
450 for (unsigned i
= 0; i
< blocks_per_key
; ++i
) {
451 int has
= !!(p
[i
>> 3] & (1ull << (i
& 7)));
453 derr
<< __func__
<< " key 0x" << std::hex
<< first_key
<< " bit 0x"
454 << i
<< " has 0x" << has
<< " expected 0x" << val
455 << std::dec
<< dendl
;
461 derr
<< __func__
<< " key 0x" << std::hex
<< first_key
462 << " not present, expected 0x" << val
<< std::dec
<< dendl
;
466 first_key
+= bytes_per_key
;
469 assert(first_key
== last_key
);
472 make_offset_key(first_key
, &k
);
474 kvdb
->get(bitmap_prefix
, k
, &bl
);
475 if (bl
.length() > 0) {
476 const char *p
= bl
.c_str();
477 unsigned e
= ((offset
+ length
- 1) & ~key_mask
) / bytes_per_block
;
478 for (unsigned i
= 0; i
< e
; ++i
) {
479 int has
= !!(p
[i
>> 3] & (1ull << (i
& 7)));
481 derr
<< __func__
<< " key 0x" << std::hex
<< first_key
<< " bit 0x"
482 << i
<< " has 0x" << has
<< " expected 0x" << val
<< std::dec
489 derr
<< __func__
<< " key 0x" << std::hex
<< first_key
490 << " not present, expected 0x" << val
<< std::dec
<< dendl
;
497 derr
<< __func__
<< " saw " << errors
<< " errors" << dendl
;
498 assert(0 == "bitmap freelist errors");
502 void BitmapFreelistManager::allocate(
503 uint64_t offset
, uint64_t length
,
504 KeyValueDB::Transaction txn
)
506 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
507 << std::dec
<< dendl
;
508 if (cct
->_conf
->bluestore_debug_freelist
)
509 _verify_range(offset
, length
, 0);
510 _xor(offset
, length
, txn
);
513 void BitmapFreelistManager::release(
514 uint64_t offset
, uint64_t length
,
515 KeyValueDB::Transaction txn
)
517 dout(10) << __func__
<< " 0x" << std::hex
<< offset
<< "~" << length
518 << std::dec
<< dendl
;
519 if (cct
->_conf
->bluestore_debug_freelist
)
520 _verify_range(offset
, length
, 1);
521 _xor(offset
, length
, txn
);
524 void BitmapFreelistManager::_xor(
525 uint64_t offset
, uint64_t length
,
526 KeyValueDB::Transaction txn
)
528 // must be block aligned
529 assert((offset
& block_mask
) == offset
);
530 assert((length
& block_mask
) == length
);
532 uint64_t first_key
= offset
& key_mask
;
533 uint64_t last_key
= (offset
+ length
- 1) & key_mask
;
534 dout(20) << __func__
<< " first_key 0x" << std::hex
<< first_key
535 << " last_key 0x" << last_key
<< std::dec
<< dendl
;
537 if (first_key
== last_key
) {
538 bufferptr
p(blocks_per_key
>> 3);
540 unsigned s
= (offset
& ~key_mask
) / bytes_per_block
;
541 unsigned e
= ((offset
+ length
- 1) & ~key_mask
) / bytes_per_block
;
542 for (unsigned i
= s
; i
<= e
; ++i
) {
543 p
[i
>> 3] ^= 1ull << (i
& 7);
546 make_offset_key(first_key
, &k
);
549 dout(30) << __func__
<< " 0x" << std::hex
<< first_key
<< std::dec
<< ": ";
550 bl
.hexdump(*_dout
, false);
552 txn
->merge(bitmap_prefix
, k
, bl
);
556 bufferptr
p(blocks_per_key
>> 3);
558 unsigned s
= (offset
& ~key_mask
) / bytes_per_block
;
559 unsigned e
= blocks_per_key
;
560 for (unsigned i
= s
; i
< e
; ++i
) {
561 p
[i
>> 3] ^= 1ull << (i
& 7);
564 make_offset_key(first_key
, &k
);
567 dout(30) << __func__
<< " 0x" << std::hex
<< first_key
<< std::dec
<< ": ";
568 bl
.hexdump(*_dout
, false);
570 txn
->merge(bitmap_prefix
, k
, bl
);
571 first_key
+= bytes_per_key
;
574 while (first_key
< last_key
) {
576 make_offset_key(first_key
, &k
);
577 dout(30) << __func__
<< " 0x" << std::hex
<< first_key
<< std::dec
579 all_set_bl
.hexdump(*_dout
, false);
581 txn
->merge(bitmap_prefix
, k
, all_set_bl
);
582 first_key
+= bytes_per_key
;
584 assert(first_key
== last_key
);
586 bufferptr
p(blocks_per_key
>> 3);
588 unsigned e
= ((offset
+ length
- 1) & ~key_mask
) / bytes_per_block
;
589 for (unsigned i
= 0; i
<= e
; ++i
) {
590 p
[i
>> 3] ^= 1ull << (i
& 7);
593 make_offset_key(first_key
, &k
);
596 dout(30) << __func__
<< " 0x" << std::hex
<< first_key
<< std::dec
<< ": ";
597 bl
.hexdump(*_dout
, false);
599 txn
->merge(bitmap_prefix
, k
, bl
);