]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BitmapFreelistManager.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / os / bluestore / BitmapFreelistManager.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "BitmapFreelistManager.h"
5
6 #include <bit>
7 #include "kv/KeyValueDB.h"
8 #include "os/kv.h"
9 #include "include/stringify.h"
10
11 #include "common/debug.h"
12
13 #define dout_context cct
14 #define dout_subsys ceph_subsys_bluestore
15 #undef dout_prefix
16 #define dout_prefix *_dout << "freelist "
17
18 using std::string;
19
20 using ceph::bufferlist;
21 using ceph::bufferptr;
22 using ceph::decode;
23 using ceph::encode;
24
25 void make_offset_key(uint64_t offset, std::string *key)
26 {
27 key->reserve(10);
28 _key_encode_u64(offset, key);
29 }
30
31 struct XorMergeOperator : public KeyValueDB::MergeOperator {
32 void merge_nonexistent(
33 const char *rdata, size_t rlen, std::string *new_value) override {
34 *new_value = std::string(rdata, rlen);
35 }
36 void merge(
37 const char *ldata, size_t llen,
38 const char *rdata, size_t rlen,
39 std::string *new_value) override {
40 ceph_assert(llen == rlen);
41 *new_value = std::string(ldata, llen);
42 for (size_t i = 0; i < rlen; ++i) {
43 (*new_value)[i] ^= rdata[i];
44 }
45 }
46 // We use each operator name and each prefix to construct the
47 // overall RocksDB operator name for consistency check at open time.
48 const char *name() const override {
49 return "bitwise_xor";
50 }
51 };
52
53 void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
54 {
55 std::shared_ptr<XorMergeOperator> merge_op(new XorMergeOperator);
56 db->set_merge_operator(prefix, merge_op);
57 }
58
59 BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
60 string meta_prefix,
61 string bitmap_prefix)
62 : FreelistManager(cct),
63 meta_prefix(meta_prefix),
64 bitmap_prefix(bitmap_prefix),
65 enumerate_bl_pos(0)
66 {
67 }
68
69 int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity,
70 uint64_t zone_size, uint64_t first_sequential_zone,
71 KeyValueDB::Transaction txn)
72 {
73 bytes_per_block = granularity;
74 ceph_assert(std::has_single_bit(bytes_per_block));
75 size = p2align(new_size, bytes_per_block);
76 blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
77
78 _init_misc();
79
80 blocks = size_2_block_count(size);
81 if (blocks * bytes_per_block > size) {
82 dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
83 << " to 0x" << (blocks * bytes_per_block)
84 << " (0x" << blocks << " blocks)" << std::dec << dendl;
85 // set past-eof blocks as allocated
86 _xor(size, blocks * bytes_per_block - size, txn);
87 }
88 dout(1) << __func__
89 << " size 0x" << std::hex << size
90 << " bytes_per_block 0x" << bytes_per_block
91 << " blocks 0x" << blocks
92 << " blocks_per_key 0x" << blocks_per_key
93 << std::dec << dendl;
94 {
95 bufferlist bl;
96 encode(bytes_per_block, bl);
97 txn->set(meta_prefix, "bytes_per_block", bl);
98 }
99 {
100 bufferlist bl;
101 encode(blocks_per_key, bl);
102 txn->set(meta_prefix, "blocks_per_key", bl);
103 }
104 {
105 bufferlist bl;
106 encode(blocks, bl);
107 txn->set(meta_prefix, "blocks", bl);
108 }
109 {
110 bufferlist bl;
111 encode(size, bl);
112 txn->set(meta_prefix, "size", bl);
113 }
114 return 0;
115 }
116
117 int BitmapFreelistManager::_expand(uint64_t old_size, KeyValueDB* db)
118 {
119 assert(old_size < size);
120 ceph_assert(std::has_single_bit(bytes_per_block));
121
122 KeyValueDB::Transaction txn;
123 txn = db->get_transaction();
124
125 auto blocks0 = size_2_block_count(old_size);
126 if (blocks0 * bytes_per_block > old_size) {
127 dout(10) << __func__ << " rounding1 blocks up from 0x" << std::hex
128 << old_size << " to 0x" << (blocks0 * bytes_per_block)
129 << " (0x" << blocks0 << " blocks)" << std::dec << dendl;
130 // reset past-eof blocks to unallocated
131 _xor(old_size, blocks0 * bytes_per_block - old_size, txn);
132 }
133
134 size = p2align(size, bytes_per_block);
135 blocks = size_2_block_count(size);
136
137 if (blocks * bytes_per_block > size) {
138 dout(10) << __func__ << " rounding2 blocks up from 0x" << std::hex
139 << size << " to 0x" << (blocks * bytes_per_block)
140 << " (0x" << blocks << " blocks)" << std::dec << dendl;
141 // set past-eof blocks as allocated
142 _xor(size, blocks * bytes_per_block - size, txn);
143 }
144
145 dout(10) << __func__
146 << " size 0x" << std::hex << size
147 << " bytes_per_block 0x" << bytes_per_block
148 << " blocks 0x" << blocks
149 << " blocks_per_key 0x" << blocks_per_key
150 << std::dec << dendl;
151 {
152 bufferlist bl;
153 encode(blocks, bl);
154 txn->set(meta_prefix, "blocks", bl);
155 }
156 {
157 bufferlist bl;
158 encode(size, bl);
159 txn->set(meta_prefix, "size", bl);
160 }
161 db->submit_transaction_sync(txn);
162
163 return 0;
164 }
165
166 int BitmapFreelistManager::read_size_meta_from_db(KeyValueDB* kvdb,
167 uint64_t* res)
168 {
169 bufferlist v;
170 int r = kvdb->get(meta_prefix, "size", &v);
171 if (r < 0) {
172 derr << __func__ << " missing size meta in DB" << dendl;
173 return -ENOENT;
174 } else {
175 auto p = v.cbegin();
176 decode(*res, p);
177 r = 0;
178 }
179 return r;
180 }
181
182 void BitmapFreelistManager::_load_from_db(KeyValueDB* kvdb)
183 {
184 KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix);
185 it->lower_bound(string());
186
187 // load meta
188 while (it->valid()) {
189 string k = it->key();
190 if (k == "bytes_per_block") {
191 bufferlist bl = it->value();
192 auto p = bl.cbegin();
193 decode(bytes_per_block, p);
194 dout(10) << __func__ << " bytes_per_block 0x" << std::hex
195 << bytes_per_block << std::dec << dendl;
196 } else if (k == "blocks") {
197 bufferlist bl = it->value();
198 auto p = bl.cbegin();
199 decode(blocks, p);
200 dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec
201 << dendl;
202 } else if (k == "size") {
203 bufferlist bl = it->value();
204 auto p = bl.cbegin();
205 decode(size, p);
206 dout(10) << __func__ << " size 0x" << std::hex << size << std::dec
207 << dendl;
208 } else if (k == "blocks_per_key") {
209 bufferlist bl = it->value();
210 auto p = bl.cbegin();
211 decode(blocks_per_key, p);
212 dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key
213 << std::dec << dendl;
214 } else {
215 derr << __func__ << " unrecognized meta " << k << dendl;
216 }
217 it->next();
218 }
219 }
220
221
222 int BitmapFreelistManager::init(KeyValueDB *kvdb, bool db_in_read_only,
223 std::function<int(const std::string&, std::string*)> cfg_reader)
224 {
225 dout(1) << __func__ << dendl;
226 int r = _read_cfg(cfg_reader);
227 if (r != 0) {
228 dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
229 _load_from_db(kvdb);
230 }
231 _sync(kvdb, db_in_read_only);
232
233 dout(10) << __func__ << std::hex
234 << " size 0x" << size
235 << " bytes_per_block 0x" << bytes_per_block
236 << " blocks 0x" << blocks
237 << " blocks_per_key 0x" << blocks_per_key
238 << std::dec << dendl;
239 _init_misc();
240 return 0;
241 }
242
243 int BitmapFreelistManager::_read_cfg(
244 std::function<int(const std::string&, std::string*)> cfg_reader)
245 {
246 dout(1) << __func__ << dendl;
247
248 string err;
249
250 const size_t key_count = 4;
251 string keys[key_count] = {
252 "bfm_size",
253 "bfm_blocks",
254 "bfm_bytes_per_block",
255 "bfm_blocks_per_key"};
256 uint64_t* vals[key_count] = {
257 &size,
258 &blocks,
259 &bytes_per_block,
260 &blocks_per_key};
261
262 for (size_t i = 0; i < key_count; i++) {
263 string val;
264 int r = cfg_reader(keys[i], &val);
265 if (r == 0) {
266 *(vals[i]) = strict_iecstrtoll(val, &err);
267 if (!err.empty()) {
268 derr << __func__ << " Failed to parse - "
269 << keys[i] << ":" << val
270 << ", error: " << err << dendl;
271 return -EINVAL;
272 }
273 } else {
274 // this is expected for legacy deployed OSDs
275 dout(0) << __func__ << " " << keys[i] << " not found in bdev meta" << dendl;
276 return r;
277 }
278 }
279
280 return 0;
281 }
282
283 void BitmapFreelistManager::_init_misc()
284 {
285 bufferptr z(blocks_per_key >> 3);
286 memset(z.c_str(), 0xff, z.length());
287 all_set_bl.clear();
288 all_set_bl.append(z);
289
290 block_mask = ~(bytes_per_block - 1);
291
292 bytes_per_key = bytes_per_block * blocks_per_key;
293 key_mask = ~(bytes_per_key - 1);
294 dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key
295 << ", key_mask 0x" << key_mask << std::dec
296 << dendl;
297 }
298
299 void BitmapFreelistManager::sync(KeyValueDB* kvdb)
300 {
301 _sync(kvdb, true);
302 }
303
304 void BitmapFreelistManager::_sync(KeyValueDB* kvdb, bool read_only)
305 {
306 dout(10) << __func__ << " checks if size sync is needed" << dendl;
307 uint64_t size_db = 0;
308 int r = read_size_meta_from_db(kvdb, &size_db);
309 ceph_assert(r >= 0);
310 if (!read_only && size_db < size) {
311 dout(1) << __func__ << " committing new size 0x" << std::hex << size
312 << std::dec << dendl;
313 r = _expand(size_db, kvdb);
314 ceph_assert(r == 0);
315 } else if (size_db > size) {
316 // this might hapen when OSD passed the following sequence:
317 // upgrade -> downgrade -> expand -> upgrade
318 // One needs to run expand once again to syncup
319 dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
320 _load_from_db(kvdb);
321 }
322 }
323
324 void BitmapFreelistManager::shutdown()
325 {
326 dout(1) << __func__ << dendl;
327 }
328
329 void BitmapFreelistManager::enumerate_reset()
330 {
331 std::lock_guard l(lock);
332 enumerate_offset = 0;
333 enumerate_bl_pos = 0;
334 enumerate_bl.clear();
335 enumerate_p.reset();
336 }
337
338 int get_next_clear_bit(bufferlist& bl, int start)
339 {
340 const char *p = bl.c_str();
341 int bits = bl.length() << 3;
342 while (start < bits) {
343 // byte = start / 8 (or start >> 3)
344 // bit = start % 8 (or start & 7)
345 unsigned char byte_mask = 1 << (start & 7);
346 if ((p[start >> 3] & byte_mask) == 0) {
347 return start;
348 }
349 ++start;
350 }
351 return -1; // not found
352 }
353
354 int get_next_set_bit(bufferlist& bl, int start)
355 {
356 const char *p = bl.c_str();
357 int bits = bl.length() << 3;
358 while (start < bits) {
359 int which_byte = start / 8;
360 int which_bit = start % 8;
361 unsigned char byte_mask = 1 << which_bit;
362 if (p[which_byte] & byte_mask) {
363 return start;
364 }
365 ++start;
366 }
367 return -1; // not found
368 }
369
370 bool BitmapFreelistManager::enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length)
371 {
372 std::lock_guard l(lock);
373
374 // initial base case is a bit awkward
375 if (enumerate_offset == 0 && enumerate_bl_pos == 0) {
376 dout(10) << __func__ << " start" << dendl;
377 enumerate_p = kvdb->get_iterator(bitmap_prefix);
378 enumerate_p->lower_bound(string());
379 // we assert that the first block is always allocated; it's true,
380 // and it simplifies our lives a bit.
381 ceph_assert(enumerate_p->valid());
382 string k = enumerate_p->key();
383 const char *p = k.c_str();
384 _key_decode_u64(p, &enumerate_offset);
385 enumerate_bl = enumerate_p->value();
386 ceph_assert(enumerate_offset == 0);
387 ceph_assert(get_next_set_bit(enumerate_bl, 0) == 0);
388 }
389
390 if (enumerate_offset >= size) {
391 dout(10) << __func__ << " end" << dendl;
392 return false;
393 }
394
395 // skip set bits to find offset
396 while (true) {
397 enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos);
398 if (enumerate_bl_pos >= 0) {
399 *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
400 dout(30) << __func__ << " found clear bit, key 0x" << std::hex
401 << enumerate_offset << " bit 0x" << enumerate_bl_pos
402 << " offset 0x" << *offset
403 << std::dec << dendl;
404 break;
405 }
406 dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset
407 << std::dec << dendl;
408 enumerate_p->next();
409 enumerate_bl.clear();
410 if (!enumerate_p->valid()) {
411 enumerate_offset += bytes_per_key;
412 enumerate_bl_pos = 0;
413 *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
414 break;
415 }
416 string k = enumerate_p->key();
417 const char *p = k.c_str();
418 uint64_t next = enumerate_offset + bytes_per_key;
419 _key_decode_u64(p, &enumerate_offset);
420 enumerate_bl = enumerate_p->value();
421 enumerate_bl_pos = 0;
422 if (enumerate_offset > next) {
423 dout(30) << " no key at 0x" << std::hex << next << ", got 0x"
424 << enumerate_offset << std::dec << dendl;
425 *offset = next;
426 break;
427 }
428 }
429
430 // skip clear bits to find the end
431 uint64_t end = 0;
432 if (enumerate_p->valid()) {
433 while (true) {
434 enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos);
435 if (enumerate_bl_pos >= 0) {
436 end = _get_offset(enumerate_offset, enumerate_bl_pos);
437 dout(30) << __func__ << " found set bit, key 0x" << std::hex
438 << enumerate_offset << " bit 0x" << enumerate_bl_pos
439 << " offset 0x" << end << std::dec
440 << dendl;
441 end = std::min(get_alloc_units() * bytes_per_block, end);
442 *length = end - *offset;
443 dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
444 << std::dec << dendl;
445 return true;
446 }
447 dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset
448 << std::dec << dendl;
449 enumerate_p->next();
450 enumerate_bl.clear();
451 enumerate_bl_pos = 0;
452 if (!enumerate_p->valid()) {
453 break;
454 }
455 string k = enumerate_p->key();
456 const char *p = k.c_str();
457 _key_decode_u64(p, &enumerate_offset);
458 enumerate_bl = enumerate_p->value();
459 }
460 }
461
462 if (enumerate_offset < size) {
463 end = get_alloc_units() * bytes_per_block;
464 *length = end - *offset;
465 dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
466 << std::dec << dendl;
467 enumerate_offset = size;
468 enumerate_bl_pos = blocks_per_key;
469 return true;
470 }
471
472 dout(10) << __func__ << " end" << dendl;
473 return false;
474 }
475
476 void BitmapFreelistManager::dump(KeyValueDB *kvdb)
477 {
478 enumerate_reset();
479 uint64_t offset, length;
480 while (enumerate_next(kvdb, &offset, &length)) {
481 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
482 << std::dec << dendl;
483 }
484 }
485
486 void BitmapFreelistManager::allocate(
487 uint64_t offset, uint64_t length,
488 KeyValueDB::Transaction txn)
489 {
490 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
491 << std::dec << dendl;
492 if (!is_null_manager()) {
493 _xor(offset, length, txn);
494 }
495 }
496
497 void BitmapFreelistManager::release(
498 uint64_t offset, uint64_t length,
499 KeyValueDB::Transaction txn)
500 {
501 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
502 << std::dec << dendl;
503 if (!is_null_manager()) {
504 _xor(offset, length, txn);
505 }
506 }
507
508 void BitmapFreelistManager::_xor(
509 uint64_t offset, uint64_t length,
510 KeyValueDB::Transaction txn)
511 {
512 // must be block aligned
513 ceph_assert((offset & block_mask) == offset);
514 ceph_assert((length & block_mask) == length);
515
516 uint64_t first_key = offset & key_mask;
517 uint64_t last_key = (offset + length - 1) & key_mask;
518 dout(20) << __func__ << " first_key 0x" << std::hex << first_key
519 << " last_key 0x" << last_key << std::dec << dendl;
520
521 if (first_key == last_key) {
522 bufferptr p(blocks_per_key >> 3);
523 p.zero();
524 unsigned s = (offset & ~key_mask) / bytes_per_block;
525 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
526 for (unsigned i = s; i <= e; ++i) {
527 p[i >> 3] ^= 1ull << (i & 7);
528 }
529 string k;
530 make_offset_key(first_key, &k);
531 bufferlist bl;
532 bl.append(p);
533 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
534 bl.hexdump(*_dout, false);
535 *_dout << dendl;
536 txn->merge(bitmap_prefix, k, bl);
537 } else {
538 // first key
539 {
540 bufferptr p(blocks_per_key >> 3);
541 p.zero();
542 unsigned s = (offset & ~key_mask) / bytes_per_block;
543 unsigned e = blocks_per_key;
544 for (unsigned i = s; i < e; ++i) {
545 p[i >> 3] ^= 1ull << (i & 7);
546 }
547 string k;
548 make_offset_key(first_key, &k);
549 bufferlist bl;
550 bl.append(p);
551 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
552 bl.hexdump(*_dout, false);
553 *_dout << dendl;
554 txn->merge(bitmap_prefix, k, bl);
555 first_key += bytes_per_key;
556 }
557 // middle keys
558 while (first_key < last_key) {
559 string k;
560 make_offset_key(first_key, &k);
561 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec
562 << ": ";
563 all_set_bl.hexdump(*_dout, false);
564 *_dout << dendl;
565 txn->merge(bitmap_prefix, k, all_set_bl);
566 first_key += bytes_per_key;
567 }
568 ceph_assert(first_key == last_key);
569 {
570 bufferptr p(blocks_per_key >> 3);
571 p.zero();
572 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
573 for (unsigned i = 0; i <= e; ++i) {
574 p[i >> 3] ^= 1ull << (i & 7);
575 }
576 string k;
577 make_offset_key(first_key, &k);
578 bufferlist bl;
579 bl.append(p);
580 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
581 bl.hexdump(*_dout, false);
582 *_dout << dendl;
583 txn->merge(bitmap_prefix, k, bl);
584 }
585 }
586 }
587
588 uint64_t BitmapFreelistManager::size_2_block_count(uint64_t target_size) const
589 {
590 auto target_blocks = target_size / bytes_per_block;
591 if (target_blocks / blocks_per_key * blocks_per_key != target_blocks) {
592 target_blocks = (target_blocks / blocks_per_key + 1) * blocks_per_key;
593 }
594 return target_blocks;
595 }
596
597 void BitmapFreelistManager::get_meta(
598 uint64_t target_size,
599 std::vector<std::pair<string, string>>* res) const
600 {
601 if (target_size == 0) {
602 res->emplace_back("bfm_blocks", stringify(blocks));
603 res->emplace_back("bfm_size", stringify(size));
604 } else {
605 target_size = p2align(target_size, bytes_per_block);
606 auto target_blocks = size_2_block_count(target_size);
607
608 res->emplace_back("bfm_blocks", stringify(target_blocks));
609 res->emplace_back("bfm_size", stringify(target_size));
610 }
611 res->emplace_back("bfm_bytes_per_block", stringify(bytes_per_block));
612 res->emplace_back("bfm_blocks_per_key", stringify(blocks_per_key));
613 }