]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BitmapFreelistManager.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / os / bluestore / BitmapFreelistManager.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "BitmapFreelistManager.h"
5#include "kv/KeyValueDB.h"
6#include "os/kv.h"
1911f103 7#include "include/stringify.h"
7c673cae
FG
8
9#include "common/debug.h"
10
11#define dout_context cct
12#define dout_subsys ceph_subsys_bluestore
13#undef dout_prefix
14#define dout_prefix *_dout << "freelist "
15
f67539c2
TL
16using std::string;
17
18using ceph::bufferlist;
19using ceph::bufferptr;
20using ceph::decode;
21using ceph::encode;
22
7c673cae
FG
23void make_offset_key(uint64_t offset, std::string *key)
24{
25 key->reserve(10);
26 _key_encode_u64(offset, key);
27}
28
29struct XorMergeOperator : public KeyValueDB::MergeOperator {
30 void merge_nonexistent(
31 const char *rdata, size_t rlen, std::string *new_value) override {
32 *new_value = std::string(rdata, rlen);
33 }
34 void merge(
35 const char *ldata, size_t llen,
36 const char *rdata, size_t rlen,
37 std::string *new_value) override {
11fdf7f2 38 ceph_assert(llen == rlen);
7c673cae
FG
39 *new_value = std::string(ldata, llen);
40 for (size_t i = 0; i < rlen; ++i) {
41 (*new_value)[i] ^= rdata[i];
42 }
43 }
44 // We use each operator name and each prefix to construct the
45 // overall RocksDB operator name for consistency check at open time.
91327a77 46 const char *name() const override {
7c673cae
FG
47 return "bitwise_xor";
48 }
49};
50
51void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
52{
11fdf7f2 53 std::shared_ptr<XorMergeOperator> merge_op(new XorMergeOperator);
7c673cae
FG
54 db->set_merge_operator(prefix, merge_op);
55}
56
57BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
7c673cae
FG
58 string meta_prefix,
59 string bitmap_prefix)
60 : FreelistManager(cct),
61 meta_prefix(meta_prefix),
62 bitmap_prefix(bitmap_prefix),
7c673cae
FG
63 enumerate_bl_pos(0)
64{
65}
66
b32b8144 67int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity,
20effc67 68 uint64_t zone_size, uint64_t first_sequential_zone,
3efd9988 69 KeyValueDB::Transaction txn)
7c673cae 70{
b32b8144 71 bytes_per_block = granularity;
11fdf7f2
TL
72 ceph_assert(isp2(bytes_per_block));
73 size = p2align(new_size, bytes_per_block);
7c673cae
FG
74 blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
75
76 _init_misc();
77
1911f103
TL
78 blocks = size_2_block_count(size);
79 if (blocks * bytes_per_block > size) {
7c673cae
FG
80 dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
81 << " to 0x" << (blocks * bytes_per_block)
82 << " (0x" << blocks << " blocks)" << std::dec << dendl;
83 // set past-eof blocks as allocated
84 _xor(size, blocks * bytes_per_block - size, txn);
85 }
f67539c2 86 dout(1) << __func__
7c673cae
FG
87 << " size 0x" << std::hex << size
88 << " bytes_per_block 0x" << bytes_per_block
89 << " blocks 0x" << blocks
90 << " blocks_per_key 0x" << blocks_per_key
91 << std::dec << dendl;
92 {
93 bufferlist bl;
11fdf7f2 94 encode(bytes_per_block, bl);
7c673cae
FG
95 txn->set(meta_prefix, "bytes_per_block", bl);
96 }
97 {
98 bufferlist bl;
11fdf7f2 99 encode(blocks_per_key, bl);
7c673cae
FG
100 txn->set(meta_prefix, "blocks_per_key", bl);
101 }
102 {
103 bufferlist bl;
11fdf7f2 104 encode(blocks, bl);
7c673cae
FG
105 txn->set(meta_prefix, "blocks", bl);
106 }
107 {
108 bufferlist bl;
11fdf7f2 109 encode(size, bl);
7c673cae
FG
110 txn->set(meta_prefix, "size", bl);
111 }
112 return 0;
113}
114
1911f103 115int BitmapFreelistManager::_expand(uint64_t old_size, KeyValueDB* db)
11fdf7f2 116{
1911f103 117 assert(old_size < size);
11fdf7f2
TL
118 ceph_assert(isp2(bytes_per_block));
119
1911f103
TL
120 KeyValueDB::Transaction txn;
121 txn = db->get_transaction();
122
123 auto blocks0 = size_2_block_count(old_size);
124 if (blocks0 * bytes_per_block > old_size) {
125 dout(10) << __func__ << " rounding1 blocks up from 0x" << std::hex
126 << old_size << " to 0x" << (blocks0 * bytes_per_block)
11fdf7f2
TL
127 << " (0x" << blocks0 << " blocks)" << std::dec << dendl;
128 // reset past-eof blocks to unallocated
1911f103 129 _xor(old_size, blocks0 * bytes_per_block - old_size, txn);
11fdf7f2
TL
130 }
131
1911f103
TL
132 size = p2align(size, bytes_per_block);
133 blocks = size_2_block_count(size);
11fdf7f2 134
1911f103
TL
135 if (blocks * bytes_per_block > size) {
136 dout(10) << __func__ << " rounding2 blocks up from 0x" << std::hex
137 << size << " to 0x" << (blocks * bytes_per_block)
11fdf7f2
TL
138 << " (0x" << blocks << " blocks)" << std::dec << dendl;
139 // set past-eof blocks as allocated
140 _xor(size, blocks * bytes_per_block - size, txn);
141 }
142
143 dout(10) << __func__
144 << " size 0x" << std::hex << size
145 << " bytes_per_block 0x" << bytes_per_block
146 << " blocks 0x" << blocks
147 << " blocks_per_key 0x" << blocks_per_key
148 << std::dec << dendl;
149 {
150 bufferlist bl;
151 encode(blocks, bl);
152 txn->set(meta_prefix, "blocks", bl);
153 }
154 {
155 bufferlist bl;
156 encode(size, bl);
157 txn->set(meta_prefix, "size", bl);
158 }
1911f103
TL
159 db->submit_transaction_sync(txn);
160
11fdf7f2
TL
161 return 0;
162}
163
1911f103
TL
164int BitmapFreelistManager::read_size_meta_from_db(KeyValueDB* kvdb,
165 uint64_t* res)
7c673cae 166{
1911f103
TL
167 bufferlist v;
168 int r = kvdb->get(meta_prefix, "size", &v);
169 if (r < 0) {
170 derr << __func__ << " missing size meta in DB" << dendl;
f67539c2 171 return -ENOENT;
1911f103
TL
172 } else {
173 auto p = v.cbegin();
174 decode(*res, p);
175 r = 0;
176 }
177 return r;
178}
7c673cae 179
1911f103
TL
180void BitmapFreelistManager::_load_from_db(KeyValueDB* kvdb)
181{
7c673cae
FG
182 KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix);
183 it->lower_bound(string());
184
185 // load meta
186 while (it->valid()) {
187 string k = it->key();
188 if (k == "bytes_per_block") {
189 bufferlist bl = it->value();
11fdf7f2
TL
190 auto p = bl.cbegin();
191 decode(bytes_per_block, p);
7c673cae 192 dout(10) << __func__ << " bytes_per_block 0x" << std::hex
1911f103 193 << bytes_per_block << std::dec << dendl;
7c673cae
FG
194 } else if (k == "blocks") {
195 bufferlist bl = it->value();
11fdf7f2
TL
196 auto p = bl.cbegin();
197 decode(blocks, p);
7c673cae 198 dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec
1911f103 199 << dendl;
7c673cae
FG
200 } else if (k == "size") {
201 bufferlist bl = it->value();
11fdf7f2
TL
202 auto p = bl.cbegin();
203 decode(size, p);
7c673cae 204 dout(10) << __func__ << " size 0x" << std::hex << size << std::dec
1911f103 205 << dendl;
7c673cae
FG
206 } else if (k == "blocks_per_key") {
207 bufferlist bl = it->value();
11fdf7f2
TL
208 auto p = bl.cbegin();
209 decode(blocks_per_key, p);
7c673cae 210 dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key
1911f103 211 << std::dec << dendl;
7c673cae
FG
212 } else {
213 derr << __func__ << " unrecognized meta " << k << dendl;
7c673cae
FG
214 }
215 it->next();
216 }
1911f103
TL
217}
218
219
f67539c2
TL
220int BitmapFreelistManager::init(KeyValueDB *kvdb, bool db_in_read_only,
221 std::function<int(const std::string&, std::string*)> cfg_reader)
1911f103
TL
222{
223 dout(1) << __func__ << dendl;
f67539c2 224 int r = _read_cfg(cfg_reader);
1911f103
TL
225 if (r != 0) {
226 dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
227 _load_from_db(kvdb);
228 }
229 _sync(kvdb, db_in_read_only);
7c673cae
FG
230
231 dout(10) << __func__ << std::hex
232 << " size 0x" << size
233 << " bytes_per_block 0x" << bytes_per_block
234 << " blocks 0x" << blocks
235 << " blocks_per_key 0x" << blocks_per_key
236 << std::dec << dendl;
237 _init_misc();
238 return 0;
239}
240
f67539c2
TL
241int BitmapFreelistManager::_read_cfg(
242 std::function<int(const std::string&, std::string*)> cfg_reader)
1911f103
TL
243{
244 dout(1) << __func__ << dendl;
245
1911f103
TL
246 string err;
247
f67539c2
TL
248 const size_t key_count = 4;
249 string keys[key_count] = {
250 "bfm_size",
251 "bfm_blocks",
252 "bfm_bytes_per_block",
253 "bfm_blocks_per_key"};
254 uint64_t* vals[key_count] = {
255 &size,
256 &blocks,
257 &bytes_per_block,
258 &blocks_per_key};
259
260 for (size_t i = 0; i < key_count; i++) {
261 string val;
262 int r = cfg_reader(keys[i], &val);
263 if (r == 0) {
20effc67 264 *(vals[i]) = strict_iecstrtoll(val, &err);
f67539c2
TL
265 if (!err.empty()) {
266 derr << __func__ << " Failed to parse - "
267 << keys[i] << ":" << val
268 << ", error: " << err << dendl;
269 return -EINVAL;
270 }
271 } else {
272 // this is expected for legacy deployed OSDs
273 dout(0) << __func__ << " " << keys[i] << " not found in bdev meta" << dendl;
1911f103
TL
274 return r;
275 }
1911f103
TL
276 }
277
1911f103
TL
278 return 0;
279}
280
7c673cae
FG
281void BitmapFreelistManager::_init_misc()
282{
283 bufferptr z(blocks_per_key >> 3);
284 memset(z.c_str(), 0xff, z.length());
285 all_set_bl.clear();
286 all_set_bl.append(z);
287
288 block_mask = ~(bytes_per_block - 1);
289
290 bytes_per_key = bytes_per_block * blocks_per_key;
291 key_mask = ~(bytes_per_key - 1);
292 dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key
293 << ", key_mask 0x" << key_mask << std::dec
294 << dendl;
295}
296
1911f103
TL
297void BitmapFreelistManager::sync(KeyValueDB* kvdb)
298{
299 _sync(kvdb, true);
300}
301
302void BitmapFreelistManager::_sync(KeyValueDB* kvdb, bool read_only)
303{
304 dout(10) << __func__ << " checks if size sync is needed" << dendl;
305 uint64_t size_db = 0;
306 int r = read_size_meta_from_db(kvdb, &size_db);
307 ceph_assert(r >= 0);
308 if (!read_only && size_db < size) {
309 dout(1) << __func__ << " committing new size 0x" << std::hex << size
310 << std::dec << dendl;
311 r = _expand(size_db, kvdb);
312 ceph_assert(r == 0);
313 } else if (size_db > size) {
314 // this might hapen when OSD passed the following sequence:
315 // upgrade -> downgrade -> expand -> upgrade
316 // One needs to run expand once again to syncup
317 dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
318 _load_from_db(kvdb);
319 }
320}
321
7c673cae
FG
322void BitmapFreelistManager::shutdown()
323{
324 dout(1) << __func__ << dendl;
325}
326
327void BitmapFreelistManager::enumerate_reset()
328{
11fdf7f2 329 std::lock_guard l(lock);
7c673cae
FG
330 enumerate_offset = 0;
331 enumerate_bl_pos = 0;
332 enumerate_bl.clear();
224ce89b 333 enumerate_p.reset();
7c673cae
FG
334}
335
336int get_next_clear_bit(bufferlist& bl, int start)
337{
338 const char *p = bl.c_str();
339 int bits = bl.length() << 3;
340 while (start < bits) {
d2e6a577
FG
341 // byte = start / 8 (or start >> 3)
342 // bit = start % 8 (or start & 7)
343 unsigned char byte_mask = 1 << (start & 7);
344 if ((p[start >> 3] & byte_mask) == 0) {
7c673cae
FG
345 return start;
346 }
347 ++start;
348 }
349 return -1; // not found
350}
351
352int get_next_set_bit(bufferlist& bl, int start)
353{
354 const char *p = bl.c_str();
355 int bits = bl.length() << 3;
356 while (start < bits) {
357 int which_byte = start / 8;
358 int which_bit = start % 8;
359 unsigned char byte_mask = 1 << which_bit;
360 if (p[which_byte] & byte_mask) {
361 return start;
362 }
363 ++start;
364 }
365 return -1; // not found
366}
367
11fdf7f2 368bool BitmapFreelistManager::enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length)
7c673cae 369{
11fdf7f2 370 std::lock_guard l(lock);
7c673cae
FG
371
372 // initial base case is a bit awkward
373 if (enumerate_offset == 0 && enumerate_bl_pos == 0) {
374 dout(10) << __func__ << " start" << dendl;
375 enumerate_p = kvdb->get_iterator(bitmap_prefix);
376 enumerate_p->lower_bound(string());
377 // we assert that the first block is always allocated; it's true,
378 // and it simplifies our lives a bit.
11fdf7f2 379 ceph_assert(enumerate_p->valid());
7c673cae
FG
380 string k = enumerate_p->key();
381 const char *p = k.c_str();
382 _key_decode_u64(p, &enumerate_offset);
383 enumerate_bl = enumerate_p->value();
11fdf7f2
TL
384 ceph_assert(enumerate_offset == 0);
385 ceph_assert(get_next_set_bit(enumerate_bl, 0) == 0);
7c673cae
FG
386 }
387
388 if (enumerate_offset >= size) {
389 dout(10) << __func__ << " end" << dendl;
390 return false;
391 }
392
393 // skip set bits to find offset
394 while (true) {
395 enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos);
396 if (enumerate_bl_pos >= 0) {
397 *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
398 dout(30) << __func__ << " found clear bit, key 0x" << std::hex
399 << enumerate_offset << " bit 0x" << enumerate_bl_pos
400 << " offset 0x" << *offset
401 << std::dec << dendl;
402 break;
403 }
404 dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset
405 << std::dec << dendl;
406 enumerate_p->next();
407 enumerate_bl.clear();
408 if (!enumerate_p->valid()) {
409 enumerate_offset += bytes_per_key;
410 enumerate_bl_pos = 0;
411 *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
412 break;
413 }
414 string k = enumerate_p->key();
415 const char *p = k.c_str();
416 uint64_t next = enumerate_offset + bytes_per_key;
417 _key_decode_u64(p, &enumerate_offset);
418 enumerate_bl = enumerate_p->value();
419 enumerate_bl_pos = 0;
420 if (enumerate_offset > next) {
421 dout(30) << " no key at 0x" << std::hex << next << ", got 0x"
422 << enumerate_offset << std::dec << dendl;
423 *offset = next;
424 break;
425 }
426 }
427
428 // skip clear bits to find the end
429 uint64_t end = 0;
430 if (enumerate_p->valid()) {
431 while (true) {
432 enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos);
433 if (enumerate_bl_pos >= 0) {
434 end = _get_offset(enumerate_offset, enumerate_bl_pos);
435 dout(30) << __func__ << " found set bit, key 0x" << std::hex
436 << enumerate_offset << " bit 0x" << enumerate_bl_pos
437 << " offset 0x" << end << std::dec
438 << dendl;
b32b8144 439 end = std::min(get_alloc_units() * bytes_per_block, end);
7c673cae 440 *length = end - *offset;
7c673cae
FG
441 dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
442 << std::dec << dendl;
443 return true;
444 }
445 dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset
446 << std::dec << dendl;
447 enumerate_p->next();
448 enumerate_bl.clear();
449 enumerate_bl_pos = 0;
450 if (!enumerate_p->valid()) {
451 break;
452 }
453 string k = enumerate_p->key();
454 const char *p = k.c_str();
455 _key_decode_u64(p, &enumerate_offset);
456 enumerate_bl = enumerate_p->value();
457 }
458 }
459
b32b8144
FG
460 if (enumerate_offset < size) {
461 end = get_alloc_units() * bytes_per_block;
7c673cae
FG
462 *length = end - *offset;
463 dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
464 << std::dec << dendl;
b32b8144 465 enumerate_offset = size;
7c673cae 466 enumerate_bl_pos = blocks_per_key;
7c673cae
FG
467 return true;
468 }
469
470 dout(10) << __func__ << " end" << dendl;
471 return false;
472}
473
11fdf7f2 474void BitmapFreelistManager::dump(KeyValueDB *kvdb)
7c673cae
FG
475{
476 enumerate_reset();
477 uint64_t offset, length;
11fdf7f2 478 while (enumerate_next(kvdb, &offset, &length)) {
7c673cae
FG
479 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
480 << std::dec << dendl;
481 }
482}
483
7c673cae
FG
484void BitmapFreelistManager::allocate(
485 uint64_t offset, uint64_t length,
486 KeyValueDB::Transaction txn)
487{
488 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
489 << std::dec << dendl;
20effc67
TL
490 if (!is_null_manager()) {
491 _xor(offset, length, txn);
492 }
7c673cae
FG
493}
494
495void BitmapFreelistManager::release(
496 uint64_t offset, uint64_t length,
497 KeyValueDB::Transaction txn)
498{
499 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
500 << std::dec << dendl;
20effc67
TL
501 if (!is_null_manager()) {
502 _xor(offset, length, txn);
503 }
7c673cae
FG
504}
505
506void BitmapFreelistManager::_xor(
507 uint64_t offset, uint64_t length,
508 KeyValueDB::Transaction txn)
509{
510 // must be block aligned
11fdf7f2
TL
511 ceph_assert((offset & block_mask) == offset);
512 ceph_assert((length & block_mask) == length);
7c673cae
FG
513
514 uint64_t first_key = offset & key_mask;
515 uint64_t last_key = (offset + length - 1) & key_mask;
516 dout(20) << __func__ << " first_key 0x" << std::hex << first_key
517 << " last_key 0x" << last_key << std::dec << dendl;
518
519 if (first_key == last_key) {
520 bufferptr p(blocks_per_key >> 3);
521 p.zero();
522 unsigned s = (offset & ~key_mask) / bytes_per_block;
523 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
524 for (unsigned i = s; i <= e; ++i) {
525 p[i >> 3] ^= 1ull << (i & 7);
526 }
527 string k;
528 make_offset_key(first_key, &k);
529 bufferlist bl;
530 bl.append(p);
531 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
532 bl.hexdump(*_dout, false);
533 *_dout << dendl;
534 txn->merge(bitmap_prefix, k, bl);
535 } else {
536 // first key
537 {
538 bufferptr p(blocks_per_key >> 3);
539 p.zero();
540 unsigned s = (offset & ~key_mask) / bytes_per_block;
541 unsigned e = blocks_per_key;
542 for (unsigned i = s; i < e; ++i) {
543 p[i >> 3] ^= 1ull << (i & 7);
544 }
545 string k;
546 make_offset_key(first_key, &k);
547 bufferlist bl;
548 bl.append(p);
549 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
550 bl.hexdump(*_dout, false);
551 *_dout << dendl;
552 txn->merge(bitmap_prefix, k, bl);
553 first_key += bytes_per_key;
554 }
555 // middle keys
556 while (first_key < last_key) {
557 string k;
558 make_offset_key(first_key, &k);
559 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec
560 << ": ";
561 all_set_bl.hexdump(*_dout, false);
562 *_dout << dendl;
563 txn->merge(bitmap_prefix, k, all_set_bl);
564 first_key += bytes_per_key;
565 }
11fdf7f2 566 ceph_assert(first_key == last_key);
7c673cae
FG
567 {
568 bufferptr p(blocks_per_key >> 3);
569 p.zero();
570 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
571 for (unsigned i = 0; i <= e; ++i) {
572 p[i >> 3] ^= 1ull << (i & 7);
573 }
574 string k;
575 make_offset_key(first_key, &k);
576 bufferlist bl;
577 bl.append(p);
578 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
579 bl.hexdump(*_dout, false);
580 *_dout << dendl;
581 txn->merge(bitmap_prefix, k, bl);
582 }
583 }
584}
1911f103
TL
585
586uint64_t BitmapFreelistManager::size_2_block_count(uint64_t target_size) const
587{
588 auto target_blocks = target_size / bytes_per_block;
589 if (target_blocks / blocks_per_key * blocks_per_key != target_blocks) {
590 target_blocks = (target_blocks / blocks_per_key + 1) * blocks_per_key;
591 }
592 return target_blocks;
593}
594
595void BitmapFreelistManager::get_meta(
596 uint64_t target_size,
597 std::vector<std::pair<string, string>>* res) const
598{
599 if (target_size == 0) {
600 res->emplace_back("bfm_blocks", stringify(blocks));
601 res->emplace_back("bfm_size", stringify(size));
602 } else {
603 target_size = p2align(target_size, bytes_per_block);
604 auto target_blocks = size_2_block_count(target_size);
605
606 res->emplace_back("bfm_blocks", stringify(target_blocks));
607 res->emplace_back("bfm_size", stringify(target_size));
608 }
609 res->emplace_back("bfm_bytes_per_block", stringify(bytes_per_block));
610 res->emplace_back("bfm_blocks_per_key", stringify(blocks_per_key));
611}