]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "BitmapFreelistManager.h" | |
5 | #include "kv/KeyValueDB.h" | |
6 | #include "os/kv.h" | |
7 | ||
8 | #include "common/debug.h" | |
9 | ||
10 | #define dout_context cct | |
11 | #define dout_subsys ceph_subsys_bluestore | |
12 | #undef dout_prefix | |
13 | #define dout_prefix *_dout << "freelist " | |
14 | ||
15 | void make_offset_key(uint64_t offset, std::string *key) | |
16 | { | |
17 | key->reserve(10); | |
18 | _key_encode_u64(offset, key); | |
19 | } | |
20 | ||
21 | struct XorMergeOperator : public KeyValueDB::MergeOperator { | |
22 | void merge_nonexistent( | |
23 | const char *rdata, size_t rlen, std::string *new_value) override { | |
24 | *new_value = std::string(rdata, rlen); | |
25 | } | |
26 | void merge( | |
27 | const char *ldata, size_t llen, | |
28 | const char *rdata, size_t rlen, | |
29 | std::string *new_value) override { | |
30 | assert(llen == rlen); | |
31 | *new_value = std::string(ldata, llen); | |
32 | for (size_t i = 0; i < rlen; ++i) { | |
33 | (*new_value)[i] ^= rdata[i]; | |
34 | } | |
35 | } | |
36 | // We use each operator name and each prefix to construct the | |
37 | // overall RocksDB operator name for consistency check at open time. | |
38 | string name() const override { | |
39 | return "bitwise_xor"; | |
40 | } | |
41 | }; | |
42 | ||
43 | void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix) | |
44 | { | |
45 | ceph::shared_ptr<XorMergeOperator> merge_op(new XorMergeOperator); | |
46 | db->set_merge_operator(prefix, merge_op); | |
47 | } | |
48 | ||
49 | BitmapFreelistManager::BitmapFreelistManager(CephContext* cct, | |
50 | KeyValueDB *db, | |
51 | string meta_prefix, | |
52 | string bitmap_prefix) | |
53 | : FreelistManager(cct), | |
54 | meta_prefix(meta_prefix), | |
55 | bitmap_prefix(bitmap_prefix), | |
56 | kvdb(db), | |
57 | enumerate_bl_pos(0) | |
58 | { | |
59 | } | |
60 | ||
61 | int BitmapFreelistManager::create(uint64_t new_size, KeyValueDB::Transaction txn) | |
62 | { | |
63 | bytes_per_block = cct->_conf->bdev_block_size; | |
64 | assert(ISP2(bytes_per_block)); | |
65 | size = P2ALIGN(new_size, bytes_per_block); | |
66 | blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key; | |
67 | ||
68 | _init_misc(); | |
69 | ||
70 | blocks = size / bytes_per_block; | |
71 | if (blocks / blocks_per_key * blocks_per_key != blocks) { | |
72 | blocks = (blocks / blocks_per_key + 1) * blocks_per_key; | |
73 | dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size | |
74 | << " to 0x" << (blocks * bytes_per_block) | |
75 | << " (0x" << blocks << " blocks)" << std::dec << dendl; | |
76 | // set past-eof blocks as allocated | |
77 | _xor(size, blocks * bytes_per_block - size, txn); | |
78 | } | |
79 | dout(10) << __func__ | |
80 | << " size 0x" << std::hex << size | |
81 | << " bytes_per_block 0x" << bytes_per_block | |
82 | << " blocks 0x" << blocks | |
83 | << " blocks_per_key 0x" << blocks_per_key | |
84 | << std::dec << dendl; | |
85 | { | |
86 | bufferlist bl; | |
87 | ::encode(bytes_per_block, bl); | |
88 | txn->set(meta_prefix, "bytes_per_block", bl); | |
89 | } | |
90 | { | |
91 | bufferlist bl; | |
92 | ::encode(blocks_per_key, bl); | |
93 | txn->set(meta_prefix, "blocks_per_key", bl); | |
94 | } | |
95 | { | |
96 | bufferlist bl; | |
97 | ::encode(blocks, bl); | |
98 | txn->set(meta_prefix, "blocks", bl); | |
99 | } | |
100 | { | |
101 | bufferlist bl; | |
102 | ::encode(size, bl); | |
103 | txn->set(meta_prefix, "size", bl); | |
104 | } | |
105 | return 0; | |
106 | } | |
107 | ||
108 | int BitmapFreelistManager::init() | |
109 | { | |
110 | dout(1) << __func__ << dendl; | |
111 | ||
112 | KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix); | |
113 | it->lower_bound(string()); | |
114 | ||
115 | // load meta | |
116 | while (it->valid()) { | |
117 | string k = it->key(); | |
118 | if (k == "bytes_per_block") { | |
119 | bufferlist bl = it->value(); | |
120 | bufferlist::iterator p = bl.begin(); | |
121 | ::decode(bytes_per_block, p); | |
122 | dout(10) << __func__ << " bytes_per_block 0x" << std::hex | |
123 | << bytes_per_block << std::dec << dendl; | |
124 | } else if (k == "blocks") { | |
125 | bufferlist bl = it->value(); | |
126 | bufferlist::iterator p = bl.begin(); | |
127 | ::decode(blocks, p); | |
128 | dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec | |
129 | << dendl; | |
130 | } else if (k == "size") { | |
131 | bufferlist bl = it->value(); | |
132 | bufferlist::iterator p = bl.begin(); | |
133 | ::decode(size, p); | |
134 | dout(10) << __func__ << " size 0x" << std::hex << size << std::dec | |
135 | << dendl; | |
136 | } else if (k == "blocks_per_key") { | |
137 | bufferlist bl = it->value(); | |
138 | bufferlist::iterator p = bl.begin(); | |
139 | ::decode(blocks_per_key, p); | |
140 | dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key | |
141 | << std::dec << dendl; | |
142 | } else { | |
143 | derr << __func__ << " unrecognized meta " << k << dendl; | |
144 | return -EIO; | |
145 | } | |
146 | it->next(); | |
147 | } | |
148 | ||
149 | dout(10) << __func__ << std::hex | |
150 | << " size 0x" << size | |
151 | << " bytes_per_block 0x" << bytes_per_block | |
152 | << " blocks 0x" << blocks | |
153 | << " blocks_per_key 0x" << blocks_per_key | |
154 | << std::dec << dendl; | |
155 | _init_misc(); | |
156 | return 0; | |
157 | } | |
158 | ||
159 | void BitmapFreelistManager::_init_misc() | |
160 | { | |
161 | bufferptr z(blocks_per_key >> 3); | |
162 | memset(z.c_str(), 0xff, z.length()); | |
163 | all_set_bl.clear(); | |
164 | all_set_bl.append(z); | |
165 | ||
166 | block_mask = ~(bytes_per_block - 1); | |
167 | ||
168 | bytes_per_key = bytes_per_block * blocks_per_key; | |
169 | key_mask = ~(bytes_per_key - 1); | |
170 | dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key | |
171 | << ", key_mask 0x" << key_mask << std::dec | |
172 | << dendl; | |
173 | } | |
174 | ||
175 | void BitmapFreelistManager::shutdown() | |
176 | { | |
177 | dout(1) << __func__ << dendl; | |
178 | } | |
179 | ||
180 | void BitmapFreelistManager::enumerate_reset() | |
181 | { | |
182 | std::lock_guard<std::mutex> l(lock); | |
183 | enumerate_offset = 0; | |
184 | enumerate_bl_pos = 0; | |
185 | enumerate_bl.clear(); | |
224ce89b | 186 | enumerate_p.reset(); |
7c673cae FG |
187 | } |
188 | ||
189 | int get_next_clear_bit(bufferlist& bl, int start) | |
190 | { | |
191 | const char *p = bl.c_str(); | |
192 | int bits = bl.length() << 3; | |
193 | while (start < bits) { | |
194 | int which_byte = start / 8; | |
195 | int which_bit = start % 8; | |
196 | unsigned char byte_mask = 1 << which_bit; | |
197 | if ((p[which_byte] & byte_mask) == 0) { | |
198 | return start; | |
199 | } | |
200 | ++start; | |
201 | } | |
202 | return -1; // not found | |
203 | } | |
204 | ||
205 | int get_next_set_bit(bufferlist& bl, int start) | |
206 | { | |
207 | const char *p = bl.c_str(); | |
208 | int bits = bl.length() << 3; | |
209 | while (start < bits) { | |
210 | int which_byte = start / 8; | |
211 | int which_bit = start % 8; | |
212 | unsigned char byte_mask = 1 << which_bit; | |
213 | if (p[which_byte] & byte_mask) { | |
214 | return start; | |
215 | } | |
216 | ++start; | |
217 | } | |
218 | return -1; // not found | |
219 | } | |
220 | ||
221 | bool BitmapFreelistManager::enumerate_next(uint64_t *offset, uint64_t *length) | |
222 | { | |
223 | std::lock_guard<std::mutex> l(lock); | |
224 | ||
225 | // initial base case is a bit awkward | |
226 | if (enumerate_offset == 0 && enumerate_bl_pos == 0) { | |
227 | dout(10) << __func__ << " start" << dendl; | |
228 | enumerate_p = kvdb->get_iterator(bitmap_prefix); | |
229 | enumerate_p->lower_bound(string()); | |
230 | // we assert that the first block is always allocated; it's true, | |
231 | // and it simplifies our lives a bit. | |
232 | assert(enumerate_p->valid()); | |
233 | string k = enumerate_p->key(); | |
234 | const char *p = k.c_str(); | |
235 | _key_decode_u64(p, &enumerate_offset); | |
236 | enumerate_bl = enumerate_p->value(); | |
237 | assert(enumerate_offset == 0); | |
238 | assert(get_next_set_bit(enumerate_bl, 0) == 0); | |
239 | } | |
240 | ||
241 | if (enumerate_offset >= size) { | |
242 | dout(10) << __func__ << " end" << dendl; | |
243 | return false; | |
244 | } | |
245 | ||
246 | // skip set bits to find offset | |
247 | while (true) { | |
248 | enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos); | |
249 | if (enumerate_bl_pos >= 0) { | |
250 | *offset = _get_offset(enumerate_offset, enumerate_bl_pos); | |
251 | dout(30) << __func__ << " found clear bit, key 0x" << std::hex | |
252 | << enumerate_offset << " bit 0x" << enumerate_bl_pos | |
253 | << " offset 0x" << *offset | |
254 | << std::dec << dendl; | |
255 | break; | |
256 | } | |
257 | dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset | |
258 | << std::dec << dendl; | |
259 | enumerate_p->next(); | |
260 | enumerate_bl.clear(); | |
261 | if (!enumerate_p->valid()) { | |
262 | enumerate_offset += bytes_per_key; | |
263 | enumerate_bl_pos = 0; | |
264 | *offset = _get_offset(enumerate_offset, enumerate_bl_pos); | |
265 | break; | |
266 | } | |
267 | string k = enumerate_p->key(); | |
268 | const char *p = k.c_str(); | |
269 | uint64_t next = enumerate_offset + bytes_per_key; | |
270 | _key_decode_u64(p, &enumerate_offset); | |
271 | enumerate_bl = enumerate_p->value(); | |
272 | enumerate_bl_pos = 0; | |
273 | if (enumerate_offset > next) { | |
274 | dout(30) << " no key at 0x" << std::hex << next << ", got 0x" | |
275 | << enumerate_offset << std::dec << dendl; | |
276 | *offset = next; | |
277 | break; | |
278 | } | |
279 | } | |
280 | ||
281 | // skip clear bits to find the end | |
282 | uint64_t end = 0; | |
283 | if (enumerate_p->valid()) { | |
284 | while (true) { | |
285 | enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos); | |
286 | if (enumerate_bl_pos >= 0) { | |
287 | end = _get_offset(enumerate_offset, enumerate_bl_pos); | |
288 | dout(30) << __func__ << " found set bit, key 0x" << std::hex | |
289 | << enumerate_offset << " bit 0x" << enumerate_bl_pos | |
290 | << " offset 0x" << end << std::dec | |
291 | << dendl; | |
292 | *length = end - *offset; | |
293 | assert((*offset + *length) <= size); | |
294 | dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length | |
295 | << std::dec << dendl; | |
296 | return true; | |
297 | } | |
298 | dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset | |
299 | << std::dec << dendl; | |
300 | enumerate_p->next(); | |
301 | enumerate_bl.clear(); | |
302 | enumerate_bl_pos = 0; | |
303 | if (!enumerate_p->valid()) { | |
304 | break; | |
305 | } | |
306 | string k = enumerate_p->key(); | |
307 | const char *p = k.c_str(); | |
308 | _key_decode_u64(p, &enumerate_offset); | |
309 | enumerate_bl = enumerate_p->value(); | |
310 | } | |
311 | } | |
312 | ||
313 | end = size; | |
314 | if (enumerate_offset < end) { | |
315 | *length = end - *offset; | |
316 | dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length | |
317 | << std::dec << dendl; | |
318 | enumerate_offset = end; | |
319 | enumerate_bl_pos = blocks_per_key; | |
320 | assert((*offset + *length) <= size); | |
321 | return true; | |
322 | } | |
323 | ||
324 | dout(10) << __func__ << " end" << dendl; | |
325 | return false; | |
326 | } | |
327 | ||
328 | void BitmapFreelistManager::dump() | |
329 | { | |
330 | enumerate_reset(); | |
331 | uint64_t offset, length; | |
332 | while (enumerate_next(&offset, &length)) { | |
333 | dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length | |
334 | << std::dec << dendl; | |
335 | } | |
336 | } | |
337 | ||
338 | void BitmapFreelistManager::_verify_range(uint64_t offset, uint64_t length, | |
339 | int val) | |
340 | { | |
341 | unsigned errors = 0; | |
342 | uint64_t first_key = offset & key_mask; | |
343 | uint64_t last_key = (offset + length - 1) & key_mask; | |
344 | if (first_key == last_key) { | |
345 | string k; | |
346 | make_offset_key(first_key, &k); | |
347 | bufferlist bl; | |
348 | kvdb->get(bitmap_prefix, k, &bl); | |
349 | if (bl.length() > 0) { | |
350 | const char *p = bl.c_str(); | |
351 | unsigned s = (offset & ~key_mask) / bytes_per_block; | |
352 | unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block; | |
353 | for (unsigned i = s; i <= e; ++i) { | |
354 | int has = !!(p[i >> 3] & (1ull << (i & 7))); | |
355 | if (has != val) { | |
356 | derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x" | |
357 | << i << " has 0x" << has << " expected 0x" << val | |
358 | << std::dec << dendl; | |
359 | ++errors; | |
360 | } | |
361 | } | |
362 | } else { | |
363 | if (val) { | |
364 | derr << __func__ << " key 0x" << std::hex << first_key | |
365 | << " not present, expected 0x" << val << std::dec << dendl; | |
366 | ++errors; | |
367 | } | |
368 | } | |
369 | } else { | |
370 | // first key | |
371 | { | |
372 | string k; | |
373 | make_offset_key(first_key, &k); | |
374 | bufferlist bl; | |
375 | kvdb->get(bitmap_prefix, k, &bl); | |
376 | if (bl.length()) { | |
377 | const char *p = bl.c_str(); | |
378 | unsigned s = (offset & ~key_mask) / bytes_per_block; | |
379 | unsigned e = blocks_per_key; | |
380 | for (unsigned i = s; i < e; ++i) { | |
381 | int has = !!(p[i >> 3] & (1ull << (i & 7))); | |
382 | if (has != val) { | |
383 | derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x" | |
384 | << i << " has 0x" << has << " expected 0x" << val << std::dec | |
385 | << dendl; | |
386 | ++errors; | |
387 | } | |
388 | } | |
389 | } else { | |
390 | if (val) { | |
391 | derr << __func__ << " key 0x" << std::hex << first_key | |
392 | << " not present, expected 0x" << val << std::dec << dendl; | |
393 | ++errors; | |
394 | } | |
395 | } | |
396 | first_key += bytes_per_key; | |
397 | } | |
398 | // middle keys | |
399 | if (first_key < last_key) { | |
400 | while (first_key < last_key) { | |
401 | string k; | |
402 | make_offset_key(first_key, &k); | |
403 | bufferlist bl; | |
404 | kvdb->get(bitmap_prefix, k, &bl); | |
405 | if (bl.length() > 0) { | |
406 | const char *p = bl.c_str(); | |
407 | for (unsigned i = 0; i < blocks_per_key; ++i) { | |
408 | int has = !!(p[i >> 3] & (1ull << (i & 7))); | |
409 | if (has != val) { | |
410 | derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x" | |
411 | << i << " has 0x" << has << " expected 0x" << val | |
412 | << std::dec << dendl; | |
413 | ++errors; | |
414 | } | |
415 | } | |
416 | } else { | |
417 | if (val) { | |
418 | derr << __func__ << " key 0x" << std::hex << first_key | |
419 | << " not present, expected 0x" << val << std::dec << dendl; | |
420 | ++errors; | |
421 | } | |
422 | } | |
423 | first_key += bytes_per_key; | |
424 | } | |
425 | } | |
426 | assert(first_key == last_key); | |
427 | { | |
428 | string k; | |
429 | make_offset_key(first_key, &k); | |
430 | bufferlist bl; | |
431 | kvdb->get(bitmap_prefix, k, &bl); | |
432 | if (bl.length() > 0) { | |
433 | const char *p = bl.c_str(); | |
434 | unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block; | |
435 | for (unsigned i = 0; i < e; ++i) { | |
436 | int has = !!(p[i >> 3] & (1ull << (i & 7))); | |
437 | if (has != val) { | |
438 | derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x" | |
439 | << i << " has 0x" << has << " expected 0x" << val << std::dec | |
440 | << dendl; | |
441 | ++errors; | |
442 | } | |
443 | } | |
444 | } else { | |
445 | if (val) { | |
446 | derr << __func__ << " key 0x" << std::hex << first_key | |
447 | << " not present, expected 0x" << val << std::dec << dendl; | |
448 | ++errors; | |
449 | } | |
450 | } | |
451 | } | |
452 | } | |
453 | if (errors) { | |
454 | derr << __func__ << " saw " << errors << " errors" << dendl; | |
455 | assert(0 == "bitmap freelist errors"); | |
456 | } | |
457 | } | |
458 | ||
459 | void BitmapFreelistManager::allocate( | |
460 | uint64_t offset, uint64_t length, | |
461 | KeyValueDB::Transaction txn) | |
462 | { | |
463 | dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length | |
464 | << std::dec << dendl; | |
465 | if (cct->_conf->bluestore_debug_freelist) | |
466 | _verify_range(offset, length, 0); | |
467 | _xor(offset, length, txn); | |
468 | } | |
469 | ||
470 | void BitmapFreelistManager::release( | |
471 | uint64_t offset, uint64_t length, | |
472 | KeyValueDB::Transaction txn) | |
473 | { | |
474 | dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length | |
475 | << std::dec << dendl; | |
476 | if (cct->_conf->bluestore_debug_freelist) | |
477 | _verify_range(offset, length, 1); | |
478 | _xor(offset, length, txn); | |
479 | } | |
480 | ||
481 | void BitmapFreelistManager::_xor( | |
482 | uint64_t offset, uint64_t length, | |
483 | KeyValueDB::Transaction txn) | |
484 | { | |
485 | // must be block aligned | |
486 | assert((offset & block_mask) == offset); | |
487 | assert((length & block_mask) == length); | |
488 | ||
489 | uint64_t first_key = offset & key_mask; | |
490 | uint64_t last_key = (offset + length - 1) & key_mask; | |
491 | dout(20) << __func__ << " first_key 0x" << std::hex << first_key | |
492 | << " last_key 0x" << last_key << std::dec << dendl; | |
493 | ||
494 | if (first_key == last_key) { | |
495 | bufferptr p(blocks_per_key >> 3); | |
496 | p.zero(); | |
497 | unsigned s = (offset & ~key_mask) / bytes_per_block; | |
498 | unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block; | |
499 | for (unsigned i = s; i <= e; ++i) { | |
500 | p[i >> 3] ^= 1ull << (i & 7); | |
501 | } | |
502 | string k; | |
503 | make_offset_key(first_key, &k); | |
504 | bufferlist bl; | |
505 | bl.append(p); | |
506 | dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": "; | |
507 | bl.hexdump(*_dout, false); | |
508 | *_dout << dendl; | |
509 | txn->merge(bitmap_prefix, k, bl); | |
510 | } else { | |
511 | // first key | |
512 | { | |
513 | bufferptr p(blocks_per_key >> 3); | |
514 | p.zero(); | |
515 | unsigned s = (offset & ~key_mask) / bytes_per_block; | |
516 | unsigned e = blocks_per_key; | |
517 | for (unsigned i = s; i < e; ++i) { | |
518 | p[i >> 3] ^= 1ull << (i & 7); | |
519 | } | |
520 | string k; | |
521 | make_offset_key(first_key, &k); | |
522 | bufferlist bl; | |
523 | bl.append(p); | |
524 | dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": "; | |
525 | bl.hexdump(*_dout, false); | |
526 | *_dout << dendl; | |
527 | txn->merge(bitmap_prefix, k, bl); | |
528 | first_key += bytes_per_key; | |
529 | } | |
530 | // middle keys | |
531 | while (first_key < last_key) { | |
532 | string k; | |
533 | make_offset_key(first_key, &k); | |
534 | dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec | |
535 | << ": "; | |
536 | all_set_bl.hexdump(*_dout, false); | |
537 | *_dout << dendl; | |
538 | txn->merge(bitmap_prefix, k, all_set_bl); | |
539 | first_key += bytes_per_key; | |
540 | } | |
541 | assert(first_key == last_key); | |
542 | { | |
543 | bufferptr p(blocks_per_key >> 3); | |
544 | p.zero(); | |
545 | unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block; | |
546 | for (unsigned i = 0; i <= e; ++i) { | |
547 | p[i >> 3] ^= 1ull << (i & 7); | |
548 | } | |
549 | string k; | |
550 | make_offset_key(first_key, &k); | |
551 | bufferlist bl; | |
552 | bl.append(p); | |
553 | dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": "; | |
554 | bl.hexdump(*_dout, false); | |
555 | *_dout << dendl; | |
556 | txn->merge(bitmap_prefix, k, bl); | |
557 | } | |
558 | } | |
559 | } |