]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BitmapFreelistManager.cc
update sources to 12.2.10
[ceph.git] / ceph / src / os / bluestore / BitmapFreelistManager.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "BitmapFreelistManager.h"
5 #include "kv/KeyValueDB.h"
6 #include "os/kv.h"
7
8 #include "common/debug.h"
9
10 #define dout_context cct
11 #define dout_subsys ceph_subsys_bluestore
12 #undef dout_prefix
13 #define dout_prefix *_dout << "freelist "
14
15 void make_offset_key(uint64_t offset, std::string *key)
16 {
17 key->reserve(10);
18 _key_encode_u64(offset, key);
19 }
20
21 struct XorMergeOperator : public KeyValueDB::MergeOperator {
22 void merge_nonexistent(
23 const char *rdata, size_t rlen, std::string *new_value) override {
24 *new_value = std::string(rdata, rlen);
25 }
26 void merge(
27 const char *ldata, size_t llen,
28 const char *rdata, size_t rlen,
29 std::string *new_value) override {
30 assert(llen == rlen);
31 *new_value = std::string(ldata, llen);
32 for (size_t i = 0; i < rlen; ++i) {
33 (*new_value)[i] ^= rdata[i];
34 }
35 }
36 // We use each operator name and each prefix to construct the
37 // overall RocksDB operator name for consistency check at open time.
38 const char *name() const override {
39 return "bitwise_xor";
40 }
41 };
42
43 void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
44 {
45 ceph::shared_ptr<XorMergeOperator> merge_op(new XorMergeOperator);
46 db->set_merge_operator(prefix, merge_op);
47 }
48
49 BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
50 KeyValueDB *db,
51 string meta_prefix,
52 string bitmap_prefix)
53 : FreelistManager(cct),
54 meta_prefix(meta_prefix),
55 bitmap_prefix(bitmap_prefix),
56 kvdb(db),
57 enumerate_bl_pos(0)
58 {
59 }
60
61 int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity,
62 KeyValueDB::Transaction txn)
63 {
64 bytes_per_block = granularity;
65 assert(ISP2(bytes_per_block));
66 size = P2ALIGN(new_size, bytes_per_block);
67 blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
68
69 _init_misc();
70
71 blocks = size / bytes_per_block;
72 if (blocks / blocks_per_key * blocks_per_key != blocks) {
73 blocks = (blocks / blocks_per_key + 1) * blocks_per_key;
74 dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
75 << " to 0x" << (blocks * bytes_per_block)
76 << " (0x" << blocks << " blocks)" << std::dec << dendl;
77 // set past-eof blocks as allocated
78 _xor(size, blocks * bytes_per_block - size, txn);
79 }
80 dout(10) << __func__
81 << " size 0x" << std::hex << size
82 << " bytes_per_block 0x" << bytes_per_block
83 << " blocks 0x" << blocks
84 << " blocks_per_key 0x" << blocks_per_key
85 << std::dec << dendl;
86 {
87 bufferlist bl;
88 ::encode(bytes_per_block, bl);
89 txn->set(meta_prefix, "bytes_per_block", bl);
90 }
91 {
92 bufferlist bl;
93 ::encode(blocks_per_key, bl);
94 txn->set(meta_prefix, "blocks_per_key", bl);
95 }
96 {
97 bufferlist bl;
98 ::encode(blocks, bl);
99 txn->set(meta_prefix, "blocks", bl);
100 }
101 {
102 bufferlist bl;
103 ::encode(size, bl);
104 txn->set(meta_prefix, "size", bl);
105 }
106 return 0;
107 }
108
109 int BitmapFreelistManager::init(uint64_t dev_size)
110 {
111 dout(1) << __func__ << dendl;
112
113 KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix);
114 it->lower_bound(string());
115
116 // load meta
117 while (it->valid()) {
118 string k = it->key();
119 if (k == "bytes_per_block") {
120 bufferlist bl = it->value();
121 bufferlist::iterator p = bl.begin();
122 ::decode(bytes_per_block, p);
123 dout(10) << __func__ << " bytes_per_block 0x" << std::hex
124 << bytes_per_block << std::dec << dendl;
125 } else if (k == "blocks") {
126 bufferlist bl = it->value();
127 bufferlist::iterator p = bl.begin();
128 ::decode(blocks, p);
129 dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec
130 << dendl;
131 } else if (k == "size") {
132 bufferlist bl = it->value();
133 bufferlist::iterator p = bl.begin();
134 ::decode(size, p);
135 dout(10) << __func__ << " size 0x" << std::hex << size << std::dec
136 << dendl;
137 } else if (k == "blocks_per_key") {
138 bufferlist bl = it->value();
139 bufferlist::iterator p = bl.begin();
140 ::decode(blocks_per_key, p);
141 dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key
142 << std::dec << dendl;
143 } else {
144 derr << __func__ << " unrecognized meta " << k << dendl;
145 return -EIO;
146 }
147 it->next();
148 }
149
150 dout(10) << __func__ << std::hex
151 << " size 0x" << size
152 << " bytes_per_block 0x" << bytes_per_block
153 << " blocks 0x" << blocks
154 << " blocks_per_key 0x" << blocks_per_key
155 << std::dec << dendl;
156 _init_misc();
157
158 // check for http://tracker.ceph.com/issues/21089 inconsistency
159 {
160 uint64_t new_size = P2ALIGN(dev_size, bytes_per_block);
161 if (new_size != size) {
162 uint64_t bad_size = new_size & ~bytes_per_block;
163 if (size == bad_size) {
164 derr << __func__ << " size is 0x" << std::hex << size << " should be 0x"
165 << new_size << " and appears to be due to #21089" << std::dec
166 << dendl;
167
168 uint64_t new_blocks = new_size / bytes_per_block;
169 if (new_blocks / blocks_per_key * blocks_per_key != new_blocks) {
170 new_blocks = (new_blocks / blocks_per_key + 1) *
171 blocks_per_key;
172 }
173
174 KeyValueDB::Transaction t = kvdb->get_transaction();
175 {
176 bufferlist sizebl;
177 ::encode(new_size, sizebl);
178 t->set(meta_prefix, "size", sizebl);
179 }
180 if (new_blocks != blocks) {
181 derr << "blocks is 0x" << std::hex << blocks << " should be 0x"
182 << new_blocks << std::dec << dendl;
183 bufferlist bl;
184 ::encode(new_blocks, bl);
185 t->set(meta_prefix, "blocks", bl);
186 _xor(new_size, new_blocks * bytes_per_block - new_size, t);
187 } else {
188 derr << "blocks are ok" << dendl;
189 _xor(bad_size, bytes_per_block, t);
190 }
191 int r = kvdb->submit_transaction_sync(t);
192 assert(r == 0);
193 size = new_size;
194 blocks = new_blocks;
195 derr << __func__ << " fixed inconsistency, size now 0x" << std::hex
196 << size << " blocks 0x" << blocks << std::dec << dendl;
197 }
198 }
199 }
200 return 0;
201 }
202
203 void BitmapFreelistManager::_init_misc()
204 {
205 bufferptr z(blocks_per_key >> 3);
206 memset(z.c_str(), 0xff, z.length());
207 all_set_bl.clear();
208 all_set_bl.append(z);
209
210 block_mask = ~(bytes_per_block - 1);
211
212 bytes_per_key = bytes_per_block * blocks_per_key;
213 key_mask = ~(bytes_per_key - 1);
214 dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key
215 << ", key_mask 0x" << key_mask << std::dec
216 << dendl;
217 }
218
219 void BitmapFreelistManager::shutdown()
220 {
221 dout(1) << __func__ << dendl;
222 }
223
224 void BitmapFreelistManager::enumerate_reset()
225 {
226 std::lock_guard<std::mutex> l(lock);
227 enumerate_offset = 0;
228 enumerate_bl_pos = 0;
229 enumerate_bl.clear();
230 enumerate_p.reset();
231 }
232
233 int get_next_clear_bit(bufferlist& bl, int start)
234 {
235 const char *p = bl.c_str();
236 int bits = bl.length() << 3;
237 while (start < bits) {
238 // byte = start / 8 (or start >> 3)
239 // bit = start % 8 (or start & 7)
240 unsigned char byte_mask = 1 << (start & 7);
241 if ((p[start >> 3] & byte_mask) == 0) {
242 return start;
243 }
244 ++start;
245 }
246 return -1; // not found
247 }
248
249 int get_next_set_bit(bufferlist& bl, int start)
250 {
251 const char *p = bl.c_str();
252 int bits = bl.length() << 3;
253 while (start < bits) {
254 int which_byte = start / 8;
255 int which_bit = start % 8;
256 unsigned char byte_mask = 1 << which_bit;
257 if (p[which_byte] & byte_mask) {
258 return start;
259 }
260 ++start;
261 }
262 return -1; // not found
263 }
264
265 bool BitmapFreelistManager::enumerate_next(uint64_t *offset, uint64_t *length)
266 {
267 std::lock_guard<std::mutex> l(lock);
268
269 // initial base case is a bit awkward
270 if (enumerate_offset == 0 && enumerate_bl_pos == 0) {
271 dout(10) << __func__ << " start" << dendl;
272 enumerate_p = kvdb->get_iterator(bitmap_prefix);
273 enumerate_p->lower_bound(string());
274 // we assert that the first block is always allocated; it's true,
275 // and it simplifies our lives a bit.
276 assert(enumerate_p->valid());
277 string k = enumerate_p->key();
278 const char *p = k.c_str();
279 _key_decode_u64(p, &enumerate_offset);
280 enumerate_bl = enumerate_p->value();
281 assert(enumerate_offset == 0);
282 assert(get_next_set_bit(enumerate_bl, 0) == 0);
283 }
284
285 if (enumerate_offset >= size) {
286 dout(10) << __func__ << " end" << dendl;
287 return false;
288 }
289
290 // skip set bits to find offset
291 while (true) {
292 enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos);
293 if (enumerate_bl_pos >= 0) {
294 *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
295 dout(30) << __func__ << " found clear bit, key 0x" << std::hex
296 << enumerate_offset << " bit 0x" << enumerate_bl_pos
297 << " offset 0x" << *offset
298 << std::dec << dendl;
299 break;
300 }
301 dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset
302 << std::dec << dendl;
303 enumerate_p->next();
304 enumerate_bl.clear();
305 if (!enumerate_p->valid()) {
306 enumerate_offset += bytes_per_key;
307 enumerate_bl_pos = 0;
308 *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
309 break;
310 }
311 string k = enumerate_p->key();
312 const char *p = k.c_str();
313 uint64_t next = enumerate_offset + bytes_per_key;
314 _key_decode_u64(p, &enumerate_offset);
315 enumerate_bl = enumerate_p->value();
316 enumerate_bl_pos = 0;
317 if (enumerate_offset > next) {
318 dout(30) << " no key at 0x" << std::hex << next << ", got 0x"
319 << enumerate_offset << std::dec << dendl;
320 *offset = next;
321 break;
322 }
323 }
324
325 // skip clear bits to find the end
326 uint64_t end = 0;
327 if (enumerate_p->valid()) {
328 while (true) {
329 enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos);
330 if (enumerate_bl_pos >= 0) {
331 end = _get_offset(enumerate_offset, enumerate_bl_pos);
332 dout(30) << __func__ << " found set bit, key 0x" << std::hex
333 << enumerate_offset << " bit 0x" << enumerate_bl_pos
334 << " offset 0x" << end << std::dec
335 << dendl;
336 end = std::min(get_alloc_units() * bytes_per_block, end);
337 *length = end - *offset;
338 dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
339 << std::dec << dendl;
340 return true;
341 }
342 dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset
343 << std::dec << dendl;
344 enumerate_p->next();
345 enumerate_bl.clear();
346 enumerate_bl_pos = 0;
347 if (!enumerate_p->valid()) {
348 break;
349 }
350 string k = enumerate_p->key();
351 const char *p = k.c_str();
352 _key_decode_u64(p, &enumerate_offset);
353 enumerate_bl = enumerate_p->value();
354 }
355 }
356
357 if (enumerate_offset < size) {
358 end = get_alloc_units() * bytes_per_block;
359 *length = end - *offset;
360 dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
361 << std::dec << dendl;
362 enumerate_offset = size;
363 enumerate_bl_pos = blocks_per_key;
364 return true;
365 }
366
367 dout(10) << __func__ << " end" << dendl;
368 return false;
369 }
370
371 void BitmapFreelistManager::dump()
372 {
373 enumerate_reset();
374 uint64_t offset, length;
375 while (enumerate_next(&offset, &length)) {
376 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
377 << std::dec << dendl;
378 }
379 }
380
381 void BitmapFreelistManager::_verify_range(uint64_t offset, uint64_t length,
382 int val)
383 {
384 unsigned errors = 0;
385 uint64_t first_key = offset & key_mask;
386 uint64_t last_key = (offset + length - 1) & key_mask;
387 if (first_key == last_key) {
388 string k;
389 make_offset_key(first_key, &k);
390 bufferlist bl;
391 kvdb->get(bitmap_prefix, k, &bl);
392 if (bl.length() > 0) {
393 const char *p = bl.c_str();
394 unsigned s = (offset & ~key_mask) / bytes_per_block;
395 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
396 for (unsigned i = s; i <= e; ++i) {
397 int has = !!(p[i >> 3] & (1ull << (i & 7)));
398 if (has != val) {
399 derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
400 << i << " has 0x" << has << " expected 0x" << val
401 << std::dec << dendl;
402 ++errors;
403 }
404 }
405 } else {
406 if (val) {
407 derr << __func__ << " key 0x" << std::hex << first_key
408 << " not present, expected 0x" << val << std::dec << dendl;
409 ++errors;
410 }
411 }
412 } else {
413 // first key
414 {
415 string k;
416 make_offset_key(first_key, &k);
417 bufferlist bl;
418 kvdb->get(bitmap_prefix, k, &bl);
419 if (bl.length()) {
420 const char *p = bl.c_str();
421 unsigned s = (offset & ~key_mask) / bytes_per_block;
422 unsigned e = blocks_per_key;
423 for (unsigned i = s; i < e; ++i) {
424 int has = !!(p[i >> 3] & (1ull << (i & 7)));
425 if (has != val) {
426 derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
427 << i << " has 0x" << has << " expected 0x" << val << std::dec
428 << dendl;
429 ++errors;
430 }
431 }
432 } else {
433 if (val) {
434 derr << __func__ << " key 0x" << std::hex << first_key
435 << " not present, expected 0x" << val << std::dec << dendl;
436 ++errors;
437 }
438 }
439 first_key += bytes_per_key;
440 }
441 // middle keys
442 if (first_key < last_key) {
443 while (first_key < last_key) {
444 string k;
445 make_offset_key(first_key, &k);
446 bufferlist bl;
447 kvdb->get(bitmap_prefix, k, &bl);
448 if (bl.length() > 0) {
449 const char *p = bl.c_str();
450 for (unsigned i = 0; i < blocks_per_key; ++i) {
451 int has = !!(p[i >> 3] & (1ull << (i & 7)));
452 if (has != val) {
453 derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
454 << i << " has 0x" << has << " expected 0x" << val
455 << std::dec << dendl;
456 ++errors;
457 }
458 }
459 } else {
460 if (val) {
461 derr << __func__ << " key 0x" << std::hex << first_key
462 << " not present, expected 0x" << val << std::dec << dendl;
463 ++errors;
464 }
465 }
466 first_key += bytes_per_key;
467 }
468 }
469 assert(first_key == last_key);
470 {
471 string k;
472 make_offset_key(first_key, &k);
473 bufferlist bl;
474 kvdb->get(bitmap_prefix, k, &bl);
475 if (bl.length() > 0) {
476 const char *p = bl.c_str();
477 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
478 for (unsigned i = 0; i < e; ++i) {
479 int has = !!(p[i >> 3] & (1ull << (i & 7)));
480 if (has != val) {
481 derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
482 << i << " has 0x" << has << " expected 0x" << val << std::dec
483 << dendl;
484 ++errors;
485 }
486 }
487 } else {
488 if (val) {
489 derr << __func__ << " key 0x" << std::hex << first_key
490 << " not present, expected 0x" << val << std::dec << dendl;
491 ++errors;
492 }
493 }
494 }
495 }
496 if (errors) {
497 derr << __func__ << " saw " << errors << " errors" << dendl;
498 assert(0 == "bitmap freelist errors");
499 }
500 }
501
502 void BitmapFreelistManager::allocate(
503 uint64_t offset, uint64_t length,
504 KeyValueDB::Transaction txn)
505 {
506 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
507 << std::dec << dendl;
508 if (cct->_conf->bluestore_debug_freelist)
509 _verify_range(offset, length, 0);
510 _xor(offset, length, txn);
511 }
512
513 void BitmapFreelistManager::release(
514 uint64_t offset, uint64_t length,
515 KeyValueDB::Transaction txn)
516 {
517 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
518 << std::dec << dendl;
519 if (cct->_conf->bluestore_debug_freelist)
520 _verify_range(offset, length, 1);
521 _xor(offset, length, txn);
522 }
523
524 void BitmapFreelistManager::_xor(
525 uint64_t offset, uint64_t length,
526 KeyValueDB::Transaction txn)
527 {
528 // must be block aligned
529 assert((offset & block_mask) == offset);
530 assert((length & block_mask) == length);
531
532 uint64_t first_key = offset & key_mask;
533 uint64_t last_key = (offset + length - 1) & key_mask;
534 dout(20) << __func__ << " first_key 0x" << std::hex << first_key
535 << " last_key 0x" << last_key << std::dec << dendl;
536
537 if (first_key == last_key) {
538 bufferptr p(blocks_per_key >> 3);
539 p.zero();
540 unsigned s = (offset & ~key_mask) / bytes_per_block;
541 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
542 for (unsigned i = s; i <= e; ++i) {
543 p[i >> 3] ^= 1ull << (i & 7);
544 }
545 string k;
546 make_offset_key(first_key, &k);
547 bufferlist bl;
548 bl.append(p);
549 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
550 bl.hexdump(*_dout, false);
551 *_dout << dendl;
552 txn->merge(bitmap_prefix, k, bl);
553 } else {
554 // first key
555 {
556 bufferptr p(blocks_per_key >> 3);
557 p.zero();
558 unsigned s = (offset & ~key_mask) / bytes_per_block;
559 unsigned e = blocks_per_key;
560 for (unsigned i = s; i < e; ++i) {
561 p[i >> 3] ^= 1ull << (i & 7);
562 }
563 string k;
564 make_offset_key(first_key, &k);
565 bufferlist bl;
566 bl.append(p);
567 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
568 bl.hexdump(*_dout, false);
569 *_dout << dendl;
570 txn->merge(bitmap_prefix, k, bl);
571 first_key += bytes_per_key;
572 }
573 // middle keys
574 while (first_key < last_key) {
575 string k;
576 make_offset_key(first_key, &k);
577 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec
578 << ": ";
579 all_set_bl.hexdump(*_dout, false);
580 *_dout << dendl;
581 txn->merge(bitmap_prefix, k, all_set_bl);
582 first_key += bytes_per_key;
583 }
584 assert(first_key == last_key);
585 {
586 bufferptr p(blocks_per_key >> 3);
587 p.zero();
588 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
589 for (unsigned i = 0; i <= e; ++i) {
590 p[i >> 3] ^= 1ull << (i & 7);
591 }
592 string k;
593 make_offset_key(first_key, &k);
594 bufferlist bl;
595 bl.append(p);
596 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
597 bl.hexdump(*_dout, false);
598 *_dout << dendl;
599 txn->merge(bitmap_prefix, k, bl);
600 }
601 }
602 }