]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueStore.h
Add patch for failing prerm scripts
[ceph.git] / ceph / src / os / bluestore / BlueStore.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef CEPH_OSD_BLUESTORE_H
16#define CEPH_OSD_BLUESTORE_H
17
18#include "acconfig.h"
19
20#include <unistd.h>
21
22#include <atomic>
23#include <mutex>
24#include <condition_variable>
25
26#include <boost/intrusive/list.hpp>
27#include <boost/intrusive/unordered_set.hpp>
28#include <boost/intrusive/set.hpp>
29#include <boost/functional/hash.hpp>
30#include <boost/dynamic_bitset.hpp>
31
eafe8130
TL
32#include "include/cpp-btree/btree_set.h"
33
11fdf7f2 34#include "include/ceph_assert.h"
7c673cae 35#include "include/unordered_map.h"
7c673cae 36#include "include/mempool.h"
11fdf7f2 37#include "common/bloom_filter.hpp"
7c673cae 38#include "common/Finisher.h"
11fdf7f2 39#include "common/Throttle.h"
7c673cae 40#include "common/perf_counters.h"
91327a77 41#include "common/PriorityCache.h"
7c673cae
FG
42#include "compressor/Compressor.h"
43#include "os/ObjectStore.h"
44
45#include "bluestore_types.h"
46#include "BlockDevice.h"
11fdf7f2 47#include "BlueFS.h"
7c673cae
FG
48#include "common/EventTrace.h"
49
50class Allocator;
51class FreelistManager;
11fdf7f2 52class BlueStoreRepairer;
7c673cae
FG
53
54//#define DEBUG_CACHE
55//#define DEBUG_DEFERRED
56
31f18b77
FG
57
58
59// constants for Buffer::optimize()
60#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N
61
62
7c673cae
FG
// Perf-counter indices for BlueStore.  The range is pinned at 732430 by
// l_bluestore_first; every subsequent enumerator is consecutive, and
// l_bluestore_last is one past the final counter (used to size the
// PerfCounters instance).  Do NOT reorder: the numeric values are the
// counter ids.
enum {
  l_bluestore_first = 732430,
  // kv sync/commit pipeline latencies
  l_bluestore_kv_flush_lat,
  l_bluestore_kv_commit_lat,
  l_bluestore_kv_sync_lat,
  l_bluestore_kv_final_lat,
  // per-state transaction latencies
  l_bluestore_state_prepare_lat,
  l_bluestore_state_aio_wait_lat,
  l_bluestore_state_io_done_lat,
  l_bluestore_state_kv_queued_lat,
  l_bluestore_state_kv_committing_lat,
  l_bluestore_state_kv_done_lat,
  l_bluestore_state_deferred_queued_lat,
  l_bluestore_state_deferred_aio_wait_lat,
  l_bluestore_state_deferred_cleanup_lat,
  l_bluestore_state_finishing_lat,
  l_bluestore_state_done_lat,
  // whole-operation latencies
  l_bluestore_throttle_lat,
  l_bluestore_submit_lat,
  l_bluestore_commit_lat,
  l_bluestore_read_lat,
  l_bluestore_read_onode_meta_lat,
  l_bluestore_read_wait_aio_lat,
  l_bluestore_compress_lat,
  l_bluestore_decompress_lat,
  l_bluestore_csum_lat,
  // compression outcome counters
  l_bluestore_compress_success_count,
  l_bluestore_compress_rejected_count,
  // write accounting
  l_bluestore_write_pad_bytes,
  l_bluestore_deferred_write_ops,
  l_bluestore_deferred_write_bytes,
  l_bluestore_write_penalty_read_ops,
  // space accounting
  l_bluestore_allocated,
  l_bluestore_stored,
  l_bluestore_compressed,
  l_bluestore_compressed_allocated,
  l_bluestore_compressed_original,
  // cache statistics
  l_bluestore_onodes,
  l_bluestore_onode_hits,
  l_bluestore_onode_misses,
  l_bluestore_onode_shard_hits,
  l_bluestore_onode_shard_misses,
  l_bluestore_extents,
  l_bluestore_blobs,
  l_bluestore_buffers,
  l_bluestore_buffer_bytes,
  l_bluestore_buffer_hit_bytes,
  l_bluestore_buffer_miss_bytes,
  // write-path shape counters
  l_bluestore_write_big,
  l_bluestore_write_big_bytes,
  l_bluestore_write_big_blobs,
  l_bluestore_write_small,
  l_bluestore_write_small_bytes,
  l_bluestore_write_small_unused,
  l_bluestore_write_small_deferred,
  l_bluestore_write_small_pre_read,
  l_bluestore_write_small_new,
  // misc
  l_bluestore_txc,
  l_bluestore_onode_reshard,
  l_bluestore_blob_split,
  l_bluestore_extent_compress,
  l_bluestore_gc_merged,
  l_bluestore_read_eio,
  l_bluestore_reads_with_retries,
  l_bluestore_fragmentation,
  // omap iteration latencies
  l_bluestore_omap_seek_to_first_lat,
  l_bluestore_omap_upper_bound_lat,
  l_bluestore_omap_lower_bound_lat,
  l_bluestore_omap_next_lat,
  l_bluestore_clist_lat,
  l_bluestore_last
};
135
11fdf7f2
TL
136#define META_POOL_ID ((uint64_t)-1ull)
137
7c673cae 138class BlueStore : public ObjectStore,
11fdf7f2 139 public BlueFSDeviceExpander,
7c673cae
FG
140 public md_config_obs_t {
141 // -----------------------------------------------------
142 // types
143public:
144 // config observer
145 const char** get_tracked_conf_keys() const override;
11fdf7f2
TL
146 void handle_conf_change(const ConfigProxy& conf,
147 const std::set<std::string> &changed) override;
148
149 //handler for discard event
150 void handle_discard(interval_set<uint64_t>& to_release);
7c673cae
FG
151
152 void _set_csum();
153 void _set_compression();
154 void _set_throttle_params();
31f18b77 155 int _set_cache_sizes();
7c673cae
FG
156
157 class TransContext;
158
159 typedef map<uint64_t, bufferlist> ready_regions_t;
160
eafe8130 161
7c673cae
FG
162 struct BufferSpace;
163 struct Collection;
164 typedef boost::intrusive_ptr<Collection> CollectionRef;
165
166 struct AioContext {
167 virtual void aio_finish(BlueStore *store) = 0;
168 virtual ~AioContext() {}
169 };
170
171 /// cached buffer
172 struct Buffer {
173 MEMPOOL_CLASS_HELPERS();
174
175 enum {
176 STATE_EMPTY, ///< empty buffer -- used for cache history
177 STATE_CLEAN, ///< clean data that is up to date
178 STATE_WRITING, ///< data that is being written (io not yet complete)
179 };
180 static const char *get_state_name(int s) {
181 switch (s) {
182 case STATE_EMPTY: return "empty";
183 case STATE_CLEAN: return "clean";
184 case STATE_WRITING: return "writing";
185 default: return "???";
186 }
187 }
188 enum {
189 FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN)
190 // NOTE: fix operator<< when you define a second flag
191 };
192 static const char *get_flag_name(int s) {
193 switch (s) {
194 case FLAG_NOCACHE: return "nocache";
195 default: return "???";
196 }
197 }
198
199 BufferSpace *space;
200 uint16_t state; ///< STATE_*
201 uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
202 uint32_t flags; ///< FLAG_*
203 uint64_t seq;
204 uint32_t offset, length;
205 bufferlist data;
206
207 boost::intrusive::list_member_hook<> lru_item;
208 boost::intrusive::list_member_hook<> state_item;
209
210 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
211 unsigned f = 0)
212 : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
213 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
214 unsigned f = 0)
215 : space(space), state(s), flags(f), seq(q), offset(o),
216 length(b.length()), data(b) {}
217
218 bool is_empty() const {
219 return state == STATE_EMPTY;
220 }
221 bool is_clean() const {
222 return state == STATE_CLEAN;
223 }
224 bool is_writing() const {
225 return state == STATE_WRITING;
226 }
227
228 uint32_t end() const {
229 return offset + length;
230 }
231
232 void truncate(uint32_t newlen) {
11fdf7f2 233 ceph_assert(newlen < length);
7c673cae
FG
234 if (data.length()) {
235 bufferlist t;
236 t.substr_of(data, 0, newlen);
237 data.claim(t);
238 }
239 length = newlen;
240 }
31f18b77
FG
241 void maybe_rebuild() {
242 if (data.length() &&
243 (data.get_num_buffers() > 1 ||
244 data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
245 data.rebuild();
246 }
247 }
7c673cae
FG
248
249 void dump(Formatter *f) const {
250 f->dump_string("state", get_state_name(state));
251 f->dump_unsigned("seq", seq);
252 f->dump_unsigned("offset", offset);
253 f->dump_unsigned("length", length);
254 f->dump_unsigned("data_length", data.length());
255 }
256 };
257
258 struct Cache;
259
260 /// map logical extent range (object) onto buffers
261 struct BufferSpace {
91327a77
AA
262 enum {
263 BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache
264 };
265
7c673cae
FG
266 typedef boost::intrusive::list<
267 Buffer,
268 boost::intrusive::member_hook<
269 Buffer,
270 boost::intrusive::list_member_hook<>,
271 &Buffer::state_item> > state_list_t;
272
31f18b77 273 mempool::bluestore_cache_other::map<uint32_t, std::unique_ptr<Buffer>>
7c673cae
FG
274 buffer_map;
275
276 // we use a bare intrusive list here instead of std::map because
277 // it uses less memory and we expect this to be very small (very
278 // few IOs in flight to the same Blob at the same time).
279 state_list_t writing; ///< writing buffers, sorted by seq, ascending
280
281 ~BufferSpace() {
11fdf7f2
TL
282 ceph_assert(buffer_map.empty());
283 ceph_assert(writing.empty());
7c673cae
FG
284 }
285
286 void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
287 cache->_audit("_add_buffer start");
288 buffer_map[b->offset].reset(b);
289 if (b->is_writing()) {
31f18b77 290 b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
224ce89b
WB
291 if (writing.empty() || writing.rbegin()->seq <= b->seq) {
292 writing.push_back(*b);
293 } else {
294 auto it = writing.begin();
295 while (it->seq < b->seq) {
296 ++it;
297 }
298
11fdf7f2 299 ceph_assert(it->seq >= b->seq);
224ce89b
WB
300 // note that this will insert b before it
301 // hence the order is maintained
302 writing.insert(it, *b);
303 }
7c673cae 304 } else {
31f18b77 305 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
7c673cae
FG
306 cache->_add_buffer(b, level, near);
307 }
308 cache->_audit("_add_buffer end");
309 }
310 void _rm_buffer(Cache* cache, Buffer *b) {
311 _rm_buffer(cache, buffer_map.find(b->offset));
312 }
31f18b77
FG
313 void _rm_buffer(Cache* cache,
314 map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
11fdf7f2 315 ceph_assert(p != buffer_map.end());
7c673cae
FG
316 cache->_audit("_rm_buffer start");
317 if (p->second->is_writing()) {
318 writing.erase(writing.iterator_to(*p->second));
319 } else {
320 cache->_rm_buffer(p->second.get());
321 }
322 buffer_map.erase(p);
323 cache->_audit("_rm_buffer end");
324 }
325
326 map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
327 uint32_t offset) {
328 auto i = buffer_map.lower_bound(offset);
329 if (i != buffer_map.begin()) {
330 --i;
331 if (i->first + i->second->length <= offset)
332 ++i;
333 }
334 return i;
335 }
336
337 // must be called under protection of the Cache lock
338 void _clear(Cache* cache);
339
340 // return value is the highest cache_private of a trimmed buffer, or 0.
341 int discard(Cache* cache, uint32_t offset, uint32_t length) {
11fdf7f2 342 std::lock_guard l(cache->lock);
7c673cae
FG
343 return _discard(cache, offset, length);
344 }
345 int _discard(Cache* cache, uint32_t offset, uint32_t length);
346
347 void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
348 unsigned flags) {
11fdf7f2 349 std::lock_guard l(cache->lock);
7c673cae
FG
350 Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
351 flags);
352 b->cache_private = _discard(cache, offset, bl.length());
353 _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
354 }
f64942e4 355 void _finish_write(Cache* cache, uint64_t seq);
7c673cae 356 void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
11fdf7f2 357 std::lock_guard l(cache->lock);
7c673cae
FG
358 Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
359 b->cache_private = _discard(cache, offset, bl.length());
360 _add_buffer(cache, b, 1, nullptr);
361 }
362
363 void read(Cache* cache, uint32_t offset, uint32_t length,
364 BlueStore::ready_regions_t& res,
91327a77
AA
365 interval_set<uint32_t>& res_intervals,
366 int flags = 0);
7c673cae
FG
367
368 void truncate(Cache* cache, uint32_t offset) {
369 discard(cache, offset, (uint32_t)-1 - offset);
370 }
371
372 void split(Cache* cache, size_t pos, BufferSpace &r);
373
374 void dump(Cache* cache, Formatter *f) const {
11fdf7f2 375 std::lock_guard l(cache->lock);
7c673cae
FG
376 f->open_array_section("buffers");
377 for (auto& i : buffer_map) {
378 f->open_object_section("buffer");
11fdf7f2 379 ceph_assert(i.first == i.second->offset);
7c673cae
FG
380 i.second->dump(f);
381 f->close_section();
382 }
383 f->close_section();
384 }
385 };
386
387 struct SharedBlobSet;
388
389 /// in-memory shared blob state (incl cached buffers)
390 struct SharedBlob {
391 MEMPOOL_CLASS_HELPERS();
392
393 std::atomic_int nref = {0}; ///< reference count
394 bool loaded = false;
395
396 CollectionRef coll;
397 union {
398 uint64_t sbid_unloaded; ///< sbid if persistent isn't loaded
399 bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any
400 };
401 BufferSpace bc; ///< buffer cache
402
403 SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
404 if (get_cache()) {
405 get_cache()->add_blob();
406 }
407 }
408 SharedBlob(uint64_t i, Collection *_coll);
409 ~SharedBlob();
410
411 uint64_t get_sbid() const {
412 return loaded ? persistent->sbid : sbid_unloaded;
413 }
414
415 friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
416 friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }
417
418 friend ostream& operator<<(ostream& out, const SharedBlob& sb);
419
420 void get() {
421 ++nref;
422 }
423 void put();
424
425 /// get logical references
426 void get_ref(uint64_t offset, uint32_t length);
427
428 /// put logical references, and get back any released extents
429 void put_ref(uint64_t offset, uint32_t length,
11fdf7f2 430 PExtentVector *r, bool *unshare);
7c673cae 431
f64942e4
AA
432 void finish_write(uint64_t seq);
433
7c673cae
FG
434 friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
435 return l.get_sbid() == r.get_sbid();
436 }
437 inline Cache* get_cache() {
438 return coll ? coll->cache : nullptr;
439 }
440 inline SharedBlobSet* get_parent() {
441 return coll ? &(coll->shared_blob_set) : nullptr;
442 }
443 inline bool is_loaded() const {
444 return loaded;
445 }
446
447 };
448 typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
449
450 /// a lookup table of SharedBlobs
451 struct SharedBlobSet {
11fdf7f2
TL
452 /// protect lookup, insertion, removal
453 ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");
7c673cae
FG
454
455 // we use a bare pointer because we don't want to affect the ref
456 // count
31f18b77 457 mempool::bluestore_cache_other::unordered_map<uint64_t,SharedBlob*> sb_map;
7c673cae
FG
458
459 SharedBlobRef lookup(uint64_t sbid) {
11fdf7f2 460 std::lock_guard l(lock);
7c673cae 461 auto p = sb_map.find(sbid);
28e407b8
AA
462 if (p == sb_map.end() ||
463 p->second->nref == 0) {
7c673cae
FG
464 return nullptr;
465 }
466 return p->second;
467 }
468
469 void add(Collection* coll, SharedBlob *sb) {
11fdf7f2 470 std::lock_guard l(lock);
7c673cae
FG
471 sb_map[sb->get_sbid()] = sb;
472 sb->coll = coll;
473 }
474
91327a77 475 bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
11fdf7f2
TL
476 std::lock_guard l(lock);
477 ceph_assert(sb->get_parent() == this);
91327a77
AA
478 if (verify_nref_is_zero && sb->nref != 0) {
479 return false;
480 }
28e407b8
AA
481 // only remove if it still points to us
482 auto p = sb_map.find(sb->get_sbid());
483 if (p != sb_map.end() &&
484 p->second == sb) {
485 sb_map.erase(p);
486 }
91327a77 487 return true;
3efd9988
FG
488 }
489
7c673cae 490 bool empty() {
11fdf7f2 491 std::lock_guard l(lock);
7c673cae
FG
492 return sb_map.empty();
493 }
3efd9988 494
11fdf7f2
TL
495 template <int LogLevelV>
496 void dump(CephContext *cct);
7c673cae
FG
497 };
498
499//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
500
501 /// in-memory blob metadata and associated cached buffers (if any)
502 struct Blob {
503 MEMPOOL_CLASS_HELPERS();
504
505 std::atomic_int nref = {0}; ///< reference count
506 int16_t id = -1; ///< id, for spanning blobs only, >= 0
507 int16_t last_encoded_id = -1; ///< (ephemeral) used during encoding only
508 SharedBlobRef shared_blob; ///< shared blob state (if any)
509
510 private:
511 mutable bluestore_blob_t blob; ///< decoded blob metadata
512#ifdef CACHE_BLOB_BL
513 mutable bufferlist blob_bl; ///< cached encoded blob, blob is dirty if empty
514#endif
515 /// refs from this shard. ephemeral if id<0, persisted if spanning.
516 bluestore_blob_use_tracker_t used_in_blob;
517
518 public:
519
520 friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
521 friend void intrusive_ptr_release(Blob *b) { b->put(); }
522
523 friend ostream& operator<<(ostream& out, const Blob &b);
524
525 const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
526 return used_in_blob;
527 }
528 bool is_referenced() const {
529 return used_in_blob.is_not_empty();
530 }
531 uint32_t get_referenced_bytes() const {
532 return used_in_blob.get_referenced_bytes();
533 }
534
535 bool is_spanning() const {
536 return id >= 0;
537 }
538
539 bool can_split() const {
11fdf7f2 540 std::lock_guard l(shared_blob->get_cache()->lock);
7c673cae
FG
541 // splitting a BufferSpace writing list is too hard; don't try.
542 return shared_blob->bc.writing.empty() &&
543 used_in_blob.can_split() &&
544 get_blob().can_split();
545 }
546
547 bool can_split_at(uint32_t blob_offset) const {
548 return used_in_blob.can_split_at(blob_offset) &&
549 get_blob().can_split_at(blob_offset);
550 }
551
224ce89b 552 bool can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
553 uint32_t target_blob_size,
554 uint32_t b_offset,
555 uint32_t *length0);
556
557 void dup(Blob& o) {
558 o.shared_blob = shared_blob;
559 o.blob = blob;
560#ifdef CACHE_BLOB_BL
561 o.blob_bl = blob_bl;
562#endif
563 }
564
224ce89b 565 inline const bluestore_blob_t& get_blob() const {
7c673cae
FG
566 return blob;
567 }
224ce89b 568 inline bluestore_blob_t& dirty_blob() {
7c673cae
FG
569#ifdef CACHE_BLOB_BL
570 blob_bl.clear();
571#endif
572 return blob;
573 }
574
575 /// discard buffers for unallocated regions
576 void discard_unallocated(Collection *coll);
577
578 /// get logical references
579 void get_ref(Collection *coll, uint32_t offset, uint32_t length);
580 /// put logical references, and get back any released extents
581 bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
582 PExtentVector *r);
583
584 /// split the blob
585 void split(Collection *coll, uint32_t blob_offset, Blob *o);
586
587 void get() {
588 ++nref;
589 }
590 void put() {
591 if (--nref == 0)
592 delete this;
593 }
594
595
596#ifdef CACHE_BLOB_BL
597 void _encode() const {
598 if (blob_bl.length() == 0 ) {
11fdf7f2 599 encode(blob, blob_bl);
7c673cae 600 } else {
11fdf7f2 601 ceph_assert(blob_bl.length());
7c673cae
FG
602 }
603 }
604 void bound_encode(
605 size_t& p,
606 bool include_ref_map) const {
607 _encode();
608 p += blob_bl.length();
609 if (include_ref_map) {
610 used_in_blob.bound_encode(p);
611 }
612 }
613 void encode(
614 bufferlist::contiguous_appender& p,
615 bool include_ref_map) const {
616 _encode();
617 p.append(blob_bl);
618 if (include_ref_map) {
619 used_in_blob.encode(p);
620 }
621 }
622 void decode(
623 Collection */*coll*/,
11fdf7f2 624 bufferptr::const_iterator& p,
7c673cae
FG
625 bool include_ref_map) {
626 const char *start = p.get_pos();
627 denc(blob, p);
628 const char *end = p.get_pos();
629 blob_bl.clear();
630 blob_bl.append(start, end - start);
631 if (include_ref_map) {
632 used_in_blob.decode(p);
633 }
634 }
635#else
636 void bound_encode(
637 size_t& p,
638 uint64_t struct_v,
639 uint64_t sbid,
640 bool include_ref_map) const {
641 denc(blob, p, struct_v);
642 if (blob.is_shared()) {
643 denc(sbid, p);
644 }
645 if (include_ref_map) {
646 used_in_blob.bound_encode(p);
647 }
648 }
649 void encode(
650 bufferlist::contiguous_appender& p,
651 uint64_t struct_v,
652 uint64_t sbid,
653 bool include_ref_map) const {
654 denc(blob, p, struct_v);
655 if (blob.is_shared()) {
656 denc(sbid, p);
657 }
658 if (include_ref_map) {
659 used_in_blob.encode(p);
660 }
661 }
662 void decode(
663 Collection *coll,
11fdf7f2 664 bufferptr::const_iterator& p,
7c673cae
FG
665 uint64_t struct_v,
666 uint64_t* sbid,
667 bool include_ref_map);
668#endif
669 };
670 typedef boost::intrusive_ptr<Blob> BlobRef;
31f18b77 671 typedef mempool::bluestore_cache_other::map<int,BlobRef> blob_map_t;
7c673cae
FG
672
673 /// a logical extent, pointing to (some portion of) a blob
674 typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
675 struct Extent : public ExtentBase {
676 MEMPOOL_CLASS_HELPERS();
677
678 uint32_t logical_offset = 0; ///< logical offset
679 uint32_t blob_offset = 0; ///< blob offset
680 uint32_t length = 0; ///< length
681 BlobRef blob; ///< the blob with our data
682
683 /// ctor for lookup only
684 explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
685 /// ctor for delayed initialization (see decode_some())
686 explicit Extent() : ExtentBase() {
687 }
688 /// ctor for general usage
689 Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
690 : ExtentBase(),
691 logical_offset(lo), blob_offset(o), length(l) {
692 assign_blob(b);
693 }
694 ~Extent() {
695 if (blob) {
696 blob->shared_blob->get_cache()->rm_extent();
697 }
698 }
699
700 void assign_blob(const BlobRef& b) {
11fdf7f2 701 ceph_assert(!blob);
7c673cae
FG
702 blob = b;
703 blob->shared_blob->get_cache()->add_extent();
704 }
705
706 // comparators for intrusive_set
707 friend bool operator<(const Extent &a, const Extent &b) {
708 return a.logical_offset < b.logical_offset;
709 }
710 friend bool operator>(const Extent &a, const Extent &b) {
711 return a.logical_offset > b.logical_offset;
712 }
713 friend bool operator==(const Extent &a, const Extent &b) {
714 return a.logical_offset == b.logical_offset;
715 }
716
717 uint32_t blob_start() const {
718 return logical_offset - blob_offset;
719 }
720
721 uint32_t blob_end() const {
722 return blob_start() + blob->get_blob().get_logical_length();
723 }
724
725 uint32_t logical_end() const {
726 return logical_offset + length;
727 }
728
729 // return true if any piece of the blob is out of
730 // the given range [o, o + l].
731 bool blob_escapes_range(uint32_t o, uint32_t l) const {
732 return blob_start() < o || blob_end() > o + l;
733 }
734 };
735 typedef boost::intrusive::set<Extent> extent_map_t;
736
737
738 friend ostream& operator<<(ostream& out, const Extent& e);
739
740 struct OldExtent {
741 boost::intrusive::list_member_hook<> old_extent_item;
742 Extent e;
743 PExtentVector r;
744 bool blob_empty; // flag to track the last removed extent that makes blob
745 // empty - required to update compression stat properly
746 OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
747 : e(lo, o, l, b), blob_empty(false) {
748 }
749 static OldExtent* create(CollectionRef c,
750 uint32_t lo,
751 uint32_t o,
752 uint32_t l,
753 BlobRef& b);
754 };
755 typedef boost::intrusive::list<
756 OldExtent,
757 boost::intrusive::member_hook<
758 OldExtent,
759 boost::intrusive::list_member_hook<>,
760 &OldExtent::old_extent_item> > old_extent_map_t;
761
762 struct Onode;
763
764 /// a sharded extent map, mapping offsets to lextents to blobs
765 struct ExtentMap {
766 Onode *onode;
767 extent_map_t extent_map; ///< map of Extents to Blobs
768 blob_map_t spanning_blob_map; ///< blobs that span shards
11fdf7f2 769 typedef boost::intrusive_ptr<Onode> OnodeRef;
7c673cae
FG
770
771 struct Shard {
772 bluestore_onode_t::shard_info *shard_info = nullptr;
773 unsigned extents = 0; ///< count extents in this shard
774 bool loaded = false; ///< true if shard is loaded
775 bool dirty = false; ///< true if shard is dirty and needs reencoding
776 };
31f18b77 777 mempool::bluestore_cache_other::vector<Shard> shards; ///< shards
7c673cae
FG
778
779 bufferlist inline_bl; ///< cached encoded map, if unsharded; empty=>dirty
780
781 uint32_t needs_reshard_begin = 0;
782 uint32_t needs_reshard_end = 0;
783
11fdf7f2
TL
784 void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
785 uint64_t&, uint64_t&, uint64_t&);
786
7c673cae
FG
787 bool needs_reshard() const {
788 return needs_reshard_end > needs_reshard_begin;
789 }
790 void clear_needs_reshard() {
791 needs_reshard_begin = needs_reshard_end = 0;
792 }
793 void request_reshard(uint32_t begin, uint32_t end) {
794 if (begin < needs_reshard_begin) {
795 needs_reshard_begin = begin;
796 }
797 if (end > needs_reshard_end) {
798 needs_reshard_end = end;
799 }
800 }
801
802 struct DeleteDisposer {
803 void operator()(Extent *e) { delete e; }
804 };
805
806 ExtentMap(Onode *o);
807 ~ExtentMap() {
808 extent_map.clear_and_dispose(DeleteDisposer());
809 }
810
811 void clear() {
812 extent_map.clear_and_dispose(DeleteDisposer());
813 shards.clear();
814 inline_bl.clear();
815 clear_needs_reshard();
816 }
817
818 bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
819 unsigned *pn);
820 unsigned decode_some(bufferlist& bl);
821
822 void bound_encode_spanning_blobs(size_t& p);
823 void encode_spanning_blobs(bufferlist::contiguous_appender& p);
11fdf7f2 824 void decode_spanning_blobs(bufferptr::const_iterator& p);
7c673cae
FG
825
826 BlobRef get_spanning_blob(int id) {
827 auto p = spanning_blob_map.find(id);
11fdf7f2 828 ceph_assert(p != spanning_blob_map.end());
7c673cae
FG
829 return p->second;
830 }
831
832 void update(KeyValueDB::Transaction t, bool force);
31f18b77 833 decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
7c673cae
FG
834 void reshard(
835 KeyValueDB *db,
836 KeyValueDB::Transaction t);
837
838 /// initialize Shards from the onode
839 void init_shards(bool loaded, bool dirty);
840
841 /// return index of shard containing offset
842 /// or -1 if not found
843 int seek_shard(uint32_t offset) {
844 size_t end = shards.size();
845 size_t mid, left = 0;
846 size_t right = end; // one passed the right end
847
848 while (left < right) {
849 mid = left + (right - left) / 2;
850 if (offset >= shards[mid].shard_info->offset) {
851 size_t next = mid + 1;
852 if (next >= end || offset < shards[next].shard_info->offset)
853 return mid;
854 //continue to search forwards
855 left = next;
856 } else {
857 //continue to search backwards
858 right = mid;
859 }
860 }
861
862 return -1; // not found
863 }
864
865 /// check if a range spans a shard
866 bool spans_shard(uint32_t offset, uint32_t length) {
867 if (shards.empty()) {
868 return false;
869 }
870 int s = seek_shard(offset);
11fdf7f2 871 ceph_assert(s >= 0);
7c673cae
FG
872 if (s == (int)shards.size() - 1) {
873 return false; // last shard
874 }
875 if (offset + length <= shards[s+1].shard_info->offset) {
876 return false;
877 }
878 return true;
879 }
880
881 /// ensure that a range of the map is loaded
882 void fault_range(KeyValueDB *db,
883 uint32_t offset, uint32_t length);
884
885 /// ensure a range of the map is marked dirty
31f18b77 886 void dirty_range(uint32_t offset, uint32_t length);
7c673cae 887
31f18b77 888 /// for seek_lextent test
7c673cae
FG
889 extent_map_t::iterator find(uint64_t offset);
890
7c673cae
FG
891 /// seek to the first lextent including or after offset
892 extent_map_t::iterator seek_lextent(uint64_t offset);
893 extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
894
895 /// add a new Extent
896 void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
897 extent_map.insert(*new Extent(lo, o, l, b));
898 }
899
900 /// remove (and delete) an Extent
901 void rm(extent_map_t::iterator p) {
902 extent_map.erase_and_dispose(p, DeleteDisposer());
903 }
904
905 bool has_any_lextents(uint64_t offset, uint64_t length);
906
907 /// consolidate adjacent lextents in extent_map
908 int compress_extent_map(uint64_t offset, uint64_t length);
909
910 /// punch a logical hole. add lextents to deref to target list.
911 void punch_hole(CollectionRef &c,
912 uint64_t offset, uint64_t length,
913 old_extent_map_t *old_extents);
914
915 /// put new lextent into lextent_map overwriting existing ones if
916 /// any and update references accordingly
917 Extent *set_lextent(CollectionRef &c,
918 uint64_t logical_offset,
919 uint64_t offset, uint64_t length,
920 BlobRef b,
921 old_extent_map_t *old_extents);
922
923 /// split a blob (and referring extents)
924 BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
925 };
926
927 /// Compressed Blob Garbage collector
928 /*
929 The primary idea of the collector is to estimate a difference between
930 allocation units(AU) currently present for compressed blobs and new AUs
931 required to store that data uncompressed.
932 Estimation is performed for protrusive extents within a logical range
933 determined by a concatenation of old_extents collection and specific(current)
934 write request.
935 The root cause for old_extents use is the need to handle blob ref counts
936 properly. Old extents still hold blob refs and hence we need to traverse
937 the collection to determine if blob to be released.
938 Protrusive extents are extents that fit into the blob set in action
939 (ones that are below the logical range from above) but not removed totally
940 due to the current write.
941 E.g. for
942 extent1 <loffs = 100, boffs = 100, len = 100> ->
943 blob1<compressed, len_on_disk=4096, logical_len=8192>
944 extent2 <loffs = 200, boffs = 200, len = 100> ->
945 blob2<raw, len_on_disk=4096, llen=4096>
946 extent3 <loffs = 300, boffs = 300, len = 100> ->
947 blob1<compressed, len_on_disk=4096, llen=8192>
948 extent4 <loffs = 4096, boffs = 0, len = 100> ->
949 blob3<raw, len_on_disk=4096, llen=4096>
950 write(300~100)
951 protrusive extents are within the following ranges <0~300, 400~8192-400>
952 In this case existing AUs that might be removed due to GC (i.e. blob1)
953 use 2x4K bytes.
954 And new AUs expected after GC = 0 since extent1 to be merged into blob2.
955 Hence we should do a collect.
956 */
957 class GarbageCollector
958 {
959 public:
960 /// return amount of allocation units that might be saved due to GC
961 int64_t estimate(
962 uint64_t offset,
963 uint64_t length,
964 const ExtentMap& extent_map,
965 const old_extent_map_t& old_extents,
966 uint64_t min_alloc_size);
967
968 /// return a collection of extents to perform GC on
eafe8130 969 const interval_set<uint64_t>& get_extents_to_collect() const {
7c673cae
FG
970 return extents_to_collect;
971 }
972 GarbageCollector(CephContext* _cct) : cct(_cct) {}
973
974 private:
975 struct BlobInfo {
976 uint64_t referenced_bytes = 0; ///< amount of bytes referenced in blob
977 int64_t expected_allocations = 0; ///< new alloc units required
978 ///< in case of gc fulfilled
979 bool collect_candidate = false; ///< indicate if blob has any extents
980 ///< eligible for GC.
981 extent_map_t::const_iterator first_lextent; ///< points to the first
982 ///< lextent referring to
983 ///< the blob if any.
984 ///< collect_candidate flag
985 ///< determines the validity
986 extent_map_t::const_iterator last_lextent; ///< points to the last
987 ///< lextent referring to
988 ///< the blob if any.
989
990 BlobInfo(uint64_t ref_bytes) :
991 referenced_bytes(ref_bytes) {
992 }
993 };
994 CephContext* cct;
995 map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
996 ///< copies that are affected by the
997 ///< specific write
998
a8e16298 999 ///< protrusive extents that should be collected if GC takes place
eafe8130 1000 interval_set<uint64_t> extents_to_collect;
7c673cae
FG
1001
1002 boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
1003 ///< unit when traversing
1004 ///< protrusive extents.
1005 ///< Other extents mapped to
1006 ///< this AU to be ignored
1007 ///< (except the case where
1008 ///< uncompressed extent follows
1009 ///< compressed one - see below).
1010 BlobInfo* blob_info_counted = nullptr; ///< set if previous allocation unit
1011 ///< caused expected_allocations
1012 ///< counter increment at this blob.
1013 ///< if uncompressed extent follows
1014 ///< a decrement for the
1015 ///< expected_allocations counter
1016 ///< is needed
1017 int64_t expected_allocations = 0; ///< new alloc units required in case
1018 ///< of gc fulfilled
1019 int64_t expected_for_release = 0; ///< alloc units currently used by
1020 ///< compressed blobs that might
1021 ///< gone after GC
7c673cae
FG
1022
1023 protected:
1024 void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
1025 uint64_t start_offset,
1026 uint64_t end_offset,
1027 uint64_t start_touch_offset,
1028 uint64_t end_touch_offset,
1029 uint64_t min_alloc_size);
1030 };
1031
1032 struct OnodeSpace;
1033
  /// an in-memory object: cached decoded onode plus its extent map
  struct Onode {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref;  ///< reference count
    Collection *c;         ///< owning collection (not refcounted here)

    ghobject_t oid;

    /// key under PREFIX_OBJ where we are stored
    mempool::bluestore_cache_other::string key;

    boost::intrusive::list_member_hook<> lru_item;  ///< hook for cache LRU lists

    bluestore_onode_t onode;  ///< metadata stored as value in kv store
    bool exists;              ///< true if object logically exists

    ExtentMap extent_map;     ///< logical offset -> blob mapping for this object

    // track txc's that have not been committed to kv store (and whose
    // effects cannot be read via the kvdb read methods)
    std::atomic<int> flushing_count = {0};
    /// protect flush_txns
    ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
    ceph::condition_variable flush_cond;   ///< wait here for uncommitted txns

    /// construct from an already-built mempool key
    Onode(Collection *c, const ghobject_t& o,
	  const mempool::bluestore_cache_other::string& k)
      : nref(0),
	c(c),
	oid(o),
	key(k),
	exists(false),
	extent_map(this) {
    }
    /// construct from a std::string key (copied into the mempool string)
    Onode(Collection* c, const ghobject_t& o,
      const string& k)
      : nref(0),
	c(c),
	oid(o),
	key(k),
	exists(false),
	extent_map(this) {
    }
    /// construct from a C-string key (copied into the mempool string)
    Onode(Collection* c, const ghobject_t& o,
      const char* k)
      : nref(0),
	c(c),
	oid(o),
	key(k),
	exists(false),
	extent_map(this) {
    }

    /// decode an onode value blob read from the kv store into a new Onode
    static Onode* decode(
      CollectionRef c,
      const ghobject_t& oid,
      const string& key,
      const bufferlist& v);

    /// block until all in-flight txcs touching this onode have committed
    void flush();
    void get() {
      ++nref;
    }
    void put() {
      // intrusive refcount: object deletes itself on last put()
      if (--nref == 0)
	delete this;
    }
  };
1103 typedef boost::intrusive_ptr<Onode> OnodeRef;
1104
1105
  /// a cache (shard) of onodes and buffers
  ///
  /// Abstract base: concrete policies are LRUCache and TwoQCache.  All
  /// _underscore methods assume the caller already holds `lock`.
  struct Cache {
    CephContext* cct;
    PerfCounters *logger;

    /// protect lru and other structures
    ceph::recursive_mutex lock = {
      ceph::make_recursive_mutex("BlueStore::Cache::lock") };

    std::atomic<uint64_t> num_extents = {0};  ///< extents attached to this shard
    std::atomic<uint64_t> num_blobs = {0};    ///< blobs attached to this shard

    /// factory: instantiate the cache policy named by `type`
    static Cache *create(CephContext* cct, string type, PerfCounters *logger);

    Cache(CephContext* cct) : cct(cct), logger(nullptr) {}
    virtual ~Cache() {}

    virtual void _add_onode(OnodeRef& o, int level) = 0;
    virtual void _rm_onode(OnodeRef& o) = 0;
    virtual void _touch_onode(OnodeRef& o) = 0;

    virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0;
    virtual void _rm_buffer(Buffer *b) = 0;
    virtual void _move_buffer(Cache *src, Buffer *b) = 0;
    virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
    virtual void _touch_buffer(Buffer *b) = 0;

    virtual uint64_t _get_num_onodes() = 0;
    virtual uint64_t _get_buffer_bytes() = 0;

    void add_extent() {
      ++num_extents;
    }
    void rm_extent() {
      --num_extents;
    }

    void add_blob() {
      ++num_blobs;
    }
    void rm_blob() {
      --num_blobs;
    }

    /// evict down to the given onode/buffer limits (takes lock)
    void trim(uint64_t onode_max, uint64_t buffer_max);

    /// evict everything (takes lock)
    void trim_all();

    /// policy-specific eviction; caller holds lock
    virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;

    /// accumulate this shard's counters into the provided totals
    virtual void add_stats(uint64_t *onodes, uint64_t *extents,
			   uint64_t *blobs,
			   uint64_t *buffers,
			   uint64_t *bytes) = 0;

    bool empty() {
      std::lock_guard l(lock);
      return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
    }

#ifdef DEBUG_CACHE
    virtual void _audit(const char *s) = 0;
#else
    void _audit(const char *s) { /* no-op */ }
#endif
  };
1172
  /// simple LRU cache for onodes and buffers
  struct LRUCache : public Cache {
  private:
    typedef boost::intrusive::list<
      Onode,
      boost::intrusive::member_hook<
        Onode,
	boost::intrusive::list_member_hook<>,
	&Onode::lru_item> > onode_lru_list_t;
    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
	Buffer,
	boost::intrusive::list_member_hook<>,
	&Buffer::lru_item> > buffer_lru_list_t;

    onode_lru_list_t onode_lru;   ///< front = most recently used

    buffer_lru_list_t buffer_lru; ///< front = most recently used
    uint64_t buffer_size = 0;     ///< total bytes across buffer_lru

  public:
    LRUCache(CephContext* cct) : Cache(cct) {}
    uint64_t _get_num_onodes() override {
      return onode_lru.size();
    }
    void _add_onode(OnodeRef& o, int level) override {
      // level > 0 means "warm": insert at the MRU end
      if (level > 0)
	onode_lru.push_front(*o);
      else
	onode_lru.push_back(*o);
    }
    void _rm_onode(OnodeRef& o) override {
      auto q = onode_lru.iterator_to(*o);
      onode_lru.erase(q);
    }
    void _touch_onode(OnodeRef& o) override;

    uint64_t _get_buffer_bytes() override {
      return buffer_size;
    }
    void _add_buffer(Buffer *b, int level, Buffer *near) override {
      if (near) {
	// keep spatially adjacent buffers adjacent in the list
	auto q = buffer_lru.iterator_to(*near);
	buffer_lru.insert(q, *b);
      } else if (level > 0) {
	buffer_lru.push_front(*b);
      } else {
	buffer_lru.push_back(*b);
      }
      buffer_size += b->length;
    }
    void _rm_buffer(Buffer *b) override {
      ceph_assert(buffer_size >= b->length);
      buffer_size -= b->length;
      auto q = buffer_lru.iterator_to(*b);
      buffer_lru.erase(q);
    }
    void _move_buffer(Cache *src, Buffer *b) override {
      src->_rm_buffer(b);
      _add_buffer(b, 0, nullptr);
    }
    void _adjust_buffer_size(Buffer *b, int64_t delta) override {
      ceph_assert((int64_t)buffer_size + delta >= 0);
      buffer_size += delta;
    }
    void _touch_buffer(Buffer *b) override {
      // move to MRU position
      auto p = buffer_lru.iterator_to(*b);
      buffer_lru.erase(p);
      buffer_lru.push_front(*b);
      _audit("_touch_buffer end");
    }

    void _trim(uint64_t onode_max, uint64_t buffer_max) override;

    void add_stats(uint64_t *onodes, uint64_t *extents,
		   uint64_t *blobs,
		   uint64_t *buffers,
		   uint64_t *bytes) override {
      std::lock_guard l(lock);
      *onodes += onode_lru.size();
      *extents += num_extents;
      *blobs += num_blobs;
      *buffers += buffer_lru.size();
      *bytes += buffer_size;
    }

#ifdef DEBUG_CACHE
    void _audit(const char *s) override;
#endif
  };
1264
  // 2Q cache for buffers, LRU for onodes
  //
  // Buffers live on one of three lists per the classic 2Q scheme:
  // A1in (warm_in, recent first-touches), A1out (warm_out, evicted
  // "ghost" entries with no data), and Am (hot, re-referenced buffers).
  struct TwoQCache : public Cache {
  private:
    // stick with LRU for onodes for now (fixme?)
    typedef boost::intrusive::list<
      Onode,
      boost::intrusive::member_hook<
        Onode,
	boost::intrusive::list_member_hook<>,
	&Onode::lru_item> > onode_lru_list_t;
    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
	Buffer,
	boost::intrusive::list_member_hook<>,
	&Buffer::lru_item> > buffer_list_t;

    onode_lru_list_t onode_lru;

    buffer_list_t buffer_hot;      ///< "Am" hot buffers
    buffer_list_t buffer_warm_in;  ///< "A1in" newly warm buffers
    buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted
    uint64_t buffer_bytes = 0;     ///< bytes

    /// which 2Q list a Buffer::cache_private value refers to
    enum {
      BUFFER_NEW = 0,
      BUFFER_WARM_IN,   ///< in buffer_warm_in
      BUFFER_WARM_OUT,  ///< in buffer_warm_out
      BUFFER_HOT,       ///< in buffer_hot
      BUFFER_TYPE_MAX
    };

    uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0};  ///< bytes per type

  public:
    TwoQCache(CephContext* cct) : Cache(cct) {}
    uint64_t _get_num_onodes() override {
      return onode_lru.size();
    }
    void _add_onode(OnodeRef& o, int level) override {
      // level > 0 means "warm": insert at the MRU end
      if (level > 0)
	onode_lru.push_front(*o);
      else
	onode_lru.push_back(*o);
    }
    void _rm_onode(OnodeRef& o) override {
      auto q = onode_lru.iterator_to(*o);
      onode_lru.erase(q);
    }
    void _touch_onode(OnodeRef& o) override;

    uint64_t _get_buffer_bytes() override {
      return buffer_bytes;
    }
    void _add_buffer(Buffer *b, int level, Buffer *near) override;
    void _rm_buffer(Buffer *b) override;
    void _move_buffer(Cache *src, Buffer *b) override;
    void _adjust_buffer_size(Buffer *b, int64_t delta) override;
    void _touch_buffer(Buffer *b) override {
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
	// do nothing (somewhat counter-intuitively!)
	break;
      case BUFFER_WARM_OUT:
	// move from warm_out to hot LRU
	ceph_abort_msg("this happens via discard hint");
	break;
      case BUFFER_HOT:
	// move to front of hot LRU
	buffer_hot.erase(buffer_hot.iterator_to(*b));
	buffer_hot.push_front(*b);
	break;
      }
      _audit("_touch_buffer end");
    }

    void _trim(uint64_t onode_max, uint64_t buffer_max) override;

    void add_stats(uint64_t *onodes, uint64_t *extents,
		   uint64_t *blobs,
		   uint64_t *buffers,
		   uint64_t *bytes) override {
      std::lock_guard l(lock);
      *onodes += onode_lru.size();
      *extents += num_extents;
      *blobs += num_blobs;
      // warm_out buffers hold no data, so they are not counted here
      *buffers += buffer_hot.size() + buffer_warm_in.size();
      *bytes += buffer_bytes;
    }

#ifdef DEBUG_CACHE
    void _audit(const char *s) override;
#endif
  };
1359
  /// per-collection map of oid -> in-memory Onode, backed by a cache shard
  struct OnodeSpace {
  private:
    Cache *cache;  ///< cache shard that tracks residency of these onodes

    /// forward lookups
    mempool::bluestore_cache_other::unordered_map<ghobject_t,OnodeRef> onode_map;

    friend class Collection; // for split_cache()

  public:
    OnodeSpace(Cache *c) : cache(c) {}
    ~OnodeSpace() {
      clear();
    }

    /// insert o under oid; returns the resident onode (existing one wins)
    OnodeRef add(const ghobject_t& oid, OnodeRef o);
    OnodeRef lookup(const ghobject_t& o);
    void remove(const ghobject_t& oid) {
      onode_map.erase(oid);
    }
    /// re-key o from old_oid to new_oid (new_okey is the new kv key)
    void rename(OnodeRef& o, const ghobject_t& old_oid,
		const ghobject_t& new_oid,
		const mempool::bluestore_cache_other::string& new_okey);
    void clear();
    bool empty();

    template <int LogLevelV>
    void dump(CephContext *cct);

    /// return true if f true for any item
    bool map_any(std::function<bool(OnodeRef)> f);
  };
1392
11fdf7f2
TL
1393 class OpSequencer;
1394 typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
1395
7c673cae
FG
  /// in-memory state for one collection (PG): onode cache, shared blobs,
  /// op sequencer, and pool options
  struct Collection : public CollectionImpl {
    BlueStore *store;
    OpSequencerRef osr;   ///< serializes transactions on this collection
    Cache *cache;         ///< our cache shard
    bluestore_cnode_t cnode;
    RWLock lock;

    bool exists;

    SharedBlobSet shared_blob_set;      ///< open SharedBlobs

    // cache onodes on a per-collection basis to avoid lock
    // contention.
    OnodeSpace onode_map;

    //pool options
    pool_opts_t pool_opts;
    ContextQueue *commit_queue;

    /// look up (or, if create, instantiate) the onode for oid
    OnodeRef get_onode(const ghobject_t& oid, bool create);

    // the terminology is confusing here, sorry!
    //
    //  blob_t     shared_blob_t
    //  !shared    unused                -> open
    //  shared     !loaded               -> open + shared
    //  shared     loaded                -> open + shared + loaded
    //
    // i.e.,
    //  open = SharedBlob is instantiated
    //  shared = blob_t shared flag is set; SharedBlob is hashed.
    //  loaded = SharedBlob::shared_blob_t is loaded from kv store
    void open_shared_blob(uint64_t sbid, BlobRef b);
    void load_shared_blob(SharedBlobRef sb);
    void make_blob_shared(uint64_t sbid, BlobRef b);
    uint64_t make_blob_unshared(SharedBlob *sb);

    /// allocate a fresh blob with its (unshared) SharedBlob attached
    BlobRef new_blob() {
      BlobRef b = new Blob();
      b->shared_blob = new SharedBlob(this);
      return b;
    }

    /// true if oid belongs in this collection (meta pool or matching pg/shard)
    bool contains(const ghobject_t& oid) {
      if (cid.is_meta())
	return oid.hobj.pool == -1;
      spg_t spgid;
      if (cid.is_pg(&spgid))
	return
	  spgid.pgid.contains(cnode.bits, oid) &&
	  oid.shard_id == spgid.shard;
      return false;
    }

    /// move cached onodes/buffers that now belong to dest (after pg split)
    void split_cache(Collection *dest);

    bool flush_commit(Context *c) override;
    void flush() override;
    void flush_all_but_last();

    Collection(BlueStore *ns, Cache *ca, coll_t c);
  };
1458
  /// iterator over one object's omap keys; wraps a kv iterator bounded
  /// by the object's [head, tail) key prefix range
  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
    CollectionRef c;
    OnodeRef o;
    KeyValueDB::Iterator it;
    string head, tail;   ///< kv-key bounds of this object's omap range

    string _stringify() const;

  public:
    OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
    int seek_to_first() override;
    int upper_bound(const string &after) override;
    int lower_bound(const string &to) override;
    bool valid() override;
    int next() override;
    string key() override;
    bufferlist value() override;
    int status() override {
      // kv iteration errors are surfaced via valid(); status is always ok
      return 0;
    }
  };
1480
31f18b77
FG
1481 struct volatile_statfs{
1482 enum {
1483 STATFS_ALLOCATED = 0,
1484 STATFS_STORED,
1485 STATFS_COMPRESSED_ORIGINAL,
1486 STATFS_COMPRESSED,
1487 STATFS_COMPRESSED_ALLOCATED,
1488 STATFS_LAST
1489 };
1490 int64_t values[STATFS_LAST];
1491 volatile_statfs() {
1492 memset(this, 0, sizeof(volatile_statfs));
1493 }
1494 void reset() {
1495 *this = volatile_statfs();
1496 }
11fdf7f2
TL
1497 void publish(store_statfs_t* buf) const {
1498 buf->allocated = allocated();
1499 buf->data_stored = stored();
1500 buf->data_compressed = compressed();
1501 buf->data_compressed_original = compressed_original();
1502 buf->data_compressed_allocated = compressed_allocated();
1503 }
1504
31f18b77
FG
1505 volatile_statfs& operator+=(const volatile_statfs& other) {
1506 for (size_t i = 0; i < STATFS_LAST; ++i) {
1507 values[i] += other.values[i];
1508 }
1509 return *this;
1510 }
1511 int64_t& allocated() {
1512 return values[STATFS_ALLOCATED];
1513 }
1514 int64_t& stored() {
1515 return values[STATFS_STORED];
1516 }
1517 int64_t& compressed_original() {
1518 return values[STATFS_COMPRESSED_ORIGINAL];
1519 }
1520 int64_t& compressed() {
1521 return values[STATFS_COMPRESSED];
1522 }
1523 int64_t& compressed_allocated() {
1524 return values[STATFS_COMPRESSED_ALLOCATED];
1525 }
11fdf7f2
TL
1526 int64_t allocated() const {
1527 return values[STATFS_ALLOCATED];
1528 }
1529 int64_t stored() const {
1530 return values[STATFS_STORED];
1531 }
1532 int64_t compressed_original() const {
1533 return values[STATFS_COMPRESSED_ORIGINAL];
1534 }
1535 int64_t compressed() const {
1536 return values[STATFS_COMPRESSED];
1537 }
1538 int64_t compressed_allocated() const {
1539 return values[STATFS_COMPRESSED_ALLOCATED];
1540 }
1541 volatile_statfs& operator=(const store_statfs_t& st) {
1542 values[STATFS_ALLOCATED] = st.allocated;
1543 values[STATFS_STORED] = st.data_stored;
1544 values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
1545 values[STATFS_COMPRESSED] = st.data_compressed;
1546 values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
1547 return *this;
1548 }
31f18b77
FG
1549 bool is_empty() {
1550 return values[STATFS_ALLOCATED] == 0 &&
1551 values[STATFS_STORED] == 0 &&
1552 values[STATFS_COMPRESSED] == 0 &&
1553 values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
1554 values[STATFS_COMPRESSED_ALLOCATED] == 0;
1555 }
11fdf7f2
TL
1556 void decode(bufferlist::const_iterator& it) {
1557 using ceph::decode;
31f18b77 1558 for (size_t i = 0; i < STATFS_LAST; i++) {
11fdf7f2 1559 decode(values[i], it);
31f18b77
FG
1560 }
1561 }
1562
1563 void encode(bufferlist& bl) {
11fdf7f2 1564 using ceph::encode;
31f18b77 1565 for (size_t i = 0; i < STATFS_LAST; i++) {
11fdf7f2 1566 encode(values[i], bl);
31f18b77
FG
1567 }
1568 }
1569 };
1570
11fdf7f2 1571 struct TransContext final : public AioContext {
31f18b77
FG
1572 MEMPOOL_CLASS_HELPERS();
1573
7c673cae
FG
1574 typedef enum {
1575 STATE_PREPARE,
1576 STATE_AIO_WAIT,
1577 STATE_IO_DONE,
1578 STATE_KV_QUEUED, // queued for kv_sync_thread submission
1579 STATE_KV_SUBMITTED, // submitted to kv; not yet synced
1580 STATE_KV_DONE,
1581 STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
1582 STATE_DEFERRED_CLEANUP, // remove deferred kv record
1583 STATE_DEFERRED_DONE,
1584 STATE_FINISHING,
1585 STATE_DONE,
1586 } state_t;
1587
1588 state_t state = STATE_PREPARE;
1589
1590 const char *get_state_name() {
1591 switch (state) {
1592 case STATE_PREPARE: return "prepare";
1593 case STATE_AIO_WAIT: return "aio_wait";
1594 case STATE_IO_DONE: return "io_done";
1595 case STATE_KV_QUEUED: return "kv_queued";
1596 case STATE_KV_SUBMITTED: return "kv_submitted";
1597 case STATE_KV_DONE: return "kv_done";
1598 case STATE_DEFERRED_QUEUED: return "deferred_queued";
1599 case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
1600 case STATE_DEFERRED_DONE: return "deferred_done";
1601 case STATE_FINISHING: return "finishing";
1602 case STATE_DONE: return "done";
1603 }
1604 return "???";
1605 }
1606
1607#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1608 const char *get_state_latency_name(int state) {
1609 switch (state) {
1610 case l_bluestore_state_prepare_lat: return "prepare";
1611 case l_bluestore_state_aio_wait_lat: return "aio_wait";
1612 case l_bluestore_state_io_done_lat: return "io_done";
1613 case l_bluestore_state_kv_queued_lat: return "kv_queued";
1614 case l_bluestore_state_kv_committing_lat: return "kv_committing";
1615 case l_bluestore_state_kv_done_lat: return "kv_done";
1616 case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
1617 case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
1618 case l_bluestore_state_finishing_lat: return "finishing";
1619 case l_bluestore_state_done_lat: return "done";
1620 }
1621 return "???";
1622 }
1623#endif
1624
11fdf7f2 1625 utime_t log_state_latency(PerfCounters *logger, int state) {
7c673cae
FG
1626 utime_t lat, now = ceph_clock_now();
1627 lat = now - last_stamp;
1628 logger->tinc(state, lat);
1629#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1630 if (state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) {
1631 double usecs = (now.to_nsec()-last_stamp.to_nsec())/1000;
1632 OID_ELAPSED("", usecs, get_state_latency_name(state));
1633 }
1634#endif
1635 last_stamp = now;
11fdf7f2 1636 return lat;
7c673cae
FG
1637 }
1638
11fdf7f2
TL
1639 CollectionRef ch;
1640 OpSequencerRef osr; // this should be ch->osr
7c673cae
FG
1641 boost::intrusive::list_member_hook<> sequencer_item;
1642
1643 uint64_t bytes = 0, cost = 0;
1644
1645 set<OnodeRef> onodes; ///< these need to be updated/written
1646 set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
1647 set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
1648 set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
1649
1650 KeyValueDB::Transaction t; ///< then we will commit this
7c673cae
FG
1651 list<Context*> oncommits; ///< more commit completions
1652 list<CollectionRef> removed_collections; ///< colls we removed
1653
1654 boost::intrusive::list_member_hook<> deferred_queue_item;
1655 bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
1656
1657 interval_set<uint64_t> allocated, released;
11fdf7f2
TL
1658 volatile_statfs statfs_delta; ///< overall store statistics delta
1659 uint64_t osd_pool_id = META_POOL_ID; ///< osd pool id we're operating on
1660
7c673cae
FG
1661 IOContext ioc;
1662 bool had_ios = false; ///< true if we submitted IOs before our kv txn
1663
7c673cae
FG
1664 uint64_t seq = 0;
1665 utime_t start;
1666 utime_t last_stamp;
1667
1668 uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated
1669 uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated
1670
11fdf7f2
TL
1671 explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
1672 list<Context*> *on_commits)
1673 : ch(c),
1674 osr(o),
7c673cae
FG
1675 ioc(cct, this),
1676 start(ceph_clock_now()) {
1677 last_stamp = start;
11fdf7f2
TL
1678 if (on_commits) {
1679 oncommits.swap(*on_commits);
1680 }
7c673cae
FG
1681 }
1682 ~TransContext() {
1683 delete deferred_txn;
1684 }
1685
1686 void write_onode(OnodeRef &o) {
1687 onodes.insert(o);
1688 }
1689 void write_shared_blob(SharedBlobRef &sb) {
1690 shared_blobs.insert(sb);
1691 }
31f18b77
FG
1692 void unshare_blob(SharedBlob *sb) {
1693 shared_blobs.erase(sb);
1694 }
1695
7c673cae
FG
1696 /// note we logically modified object (when onode itself is unmodified)
1697 void note_modified_object(OnodeRef &o) {
1698 // onode itself isn't written, though
1699 modified_objects.insert(o);
1700 }
a8e16298 1701 void note_removed_object(OnodeRef& o) {
7c673cae 1702 onodes.erase(o);
a8e16298 1703 modified_objects.insert(o);
7c673cae
FG
1704 }
1705
1706 void aio_finish(BlueStore *store) override {
1707 store->txc_aio_finish(this);
1708 }
1709 };
1710
1711 typedef boost::intrusive::list<
1712 TransContext,
1713 boost::intrusive::member_hook<
1714 TransContext,
1715 boost::intrusive::list_member_hook<>,
1716 &TransContext::deferred_queue_item> > deferred_queue_t;
1717
11fdf7f2 1718 struct DeferredBatch final : public AioContext {
7c673cae
FG
1719 OpSequencer *osr;
1720 struct deferred_io {
1721 bufferlist bl; ///< data
1722 uint64_t seq; ///< deferred transaction seq
1723 };
1724 map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
1725 deferred_queue_t txcs; ///< txcs in this batch
1726 IOContext ioc; ///< our aios
1727 /// bytes of pending io for each deferred seq (may be 0)
1728 map<uint64_t,int> seq_bytes;
1729
1730 void _discard(CephContext *cct, uint64_t offset, uint64_t length);
1731 void _audit(CephContext *cct);
1732
1733 DeferredBatch(CephContext *cct, OpSequencer *osr)
1734 : osr(osr), ioc(cct, this) {}
1735
1736 /// prepare a write
1737 void prepare_write(CephContext *cct,
1738 uint64_t seq, uint64_t offset, uint64_t length,
1739 bufferlist::const_iterator& p);
1740
1741 void aio_finish(BlueStore *store) override {
1742 store->_deferred_aio_finish(osr);
1743 }
1744 };
1745
11fdf7f2 1746 class OpSequencer : public RefCountedObject {
7c673cae 1747 public:
11fdf7f2
TL
1748 ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
1749 ceph::condition_variable qcond;
7c673cae
FG
1750 typedef boost::intrusive::list<
1751 TransContext,
1752 boost::intrusive::member_hook<
1753 TransContext,
1754 boost::intrusive::list_member_hook<>,
1755 &TransContext::sequencer_item> > q_list_t;
1756 q_list_t q; ///< transactions
1757
1758 boost::intrusive::list_member_hook<> deferred_osr_queue_item;
1759
1760 DeferredBatch *deferred_running = nullptr;
1761 DeferredBatch *deferred_pending = nullptr;
1762
7c673cae 1763 BlueStore *store;
11fdf7f2 1764 coll_t cid;
7c673cae
FG
1765
1766 uint64_t last_seq = 0;
1767
1768 std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
1769
1770 std::atomic_int kv_committing_serially = {0};
1771
1772 std::atomic_int kv_submitted_waiters = {0};
1773
11fdf7f2 1774 std::atomic_bool zombie = {false}; ///< in zombie_osr set (collection going away)
7c673cae 1775
11fdf7f2
TL
1776 OpSequencer(BlueStore *store, const coll_t& c)
1777 : RefCountedObject(store->cct, 0),
1778 store(store), cid(c) {
7c673cae 1779 }
11fdf7f2
TL
1780 ~OpSequencer() {
1781 ceph_assert(q.empty());
7c673cae
FG
1782 }
1783
1784 void queue_new(TransContext *txc) {
11fdf7f2 1785 std::lock_guard l(qlock);
7c673cae
FG
1786 txc->seq = ++last_seq;
1787 q.push_back(*txc);
1788 }
1789
1790 void drain() {
11fdf7f2 1791 std::unique_lock l(qlock);
7c673cae
FG
1792 while (!q.empty())
1793 qcond.wait(l);
1794 }
1795
1796 void drain_preceding(TransContext *txc) {
11fdf7f2 1797 std::unique_lock l(qlock);
7c673cae
FG
1798 while (!q.empty() && &q.front() != txc)
1799 qcond.wait(l);
1800 }
1801
1802 bool _is_all_kv_submitted() {
11fdf7f2
TL
1803 // caller must hold qlock & q.empty() must not empty
1804 ceph_assert(!q.empty());
7c673cae
FG
1805 TransContext *txc = &q.back();
1806 if (txc->state >= TransContext::STATE_KV_SUBMITTED) {
1807 return true;
1808 }
1809 return false;
1810 }
1811
11fdf7f2
TL
1812 void flush() {
1813 std::unique_lock l(qlock);
1814 while (true) {
1815 // set flag before the check because the condition
1816 // may become true outside qlock, and we need to make
1817 // sure those threads see waiters and signal qcond.
1818 ++kv_submitted_waiters;
1819 if (q.empty() || _is_all_kv_submitted()) {
1820 --kv_submitted_waiters;
1821 return;
1822 }
1823 qcond.wait(l);
1824 --kv_submitted_waiters;
1825 }
1826 }
1827
1828 void flush_all_but_last() {
1829 std::unique_lock l(qlock);
1830 assert (q.size() >= 1);
7c673cae
FG
1831 while (true) {
1832 // set flag before the check because the condition
1833 // may become true outside qlock, and we need to make
1834 // sure those threads see waiters and signal qcond.
1835 ++kv_submitted_waiters;
11fdf7f2
TL
1836 if (q.size() <= 1) {
1837 --kv_submitted_waiters;
7c673cae 1838 return;
11fdf7f2
TL
1839 } else {
1840 auto it = q.rbegin();
1841 it++;
1842 if (it->state >= TransContext::STATE_KV_SUBMITTED) {
eafe8130 1843 --kv_submitted_waiters;
11fdf7f2
TL
1844 return;
1845 }
7c673cae
FG
1846 }
1847 qcond.wait(l);
1848 --kv_submitted_waiters;
1849 }
1850 }
1851
11fdf7f2
TL
1852 bool flush_commit(Context *c) {
1853 std::lock_guard l(qlock);
7c673cae
FG
1854 if (q.empty()) {
1855 return true;
1856 }
1857 TransContext *txc = &q.back();
1858 if (txc->state >= TransContext::STATE_KV_DONE) {
1859 return true;
1860 }
1861 txc->oncommits.push_back(c);
1862 return false;
1863 }
1864 };
1865
1866 typedef boost::intrusive::list<
1867 OpSequencer,
1868 boost::intrusive::member_hook<
1869 OpSequencer,
1870 boost::intrusive::list_member_hook<>,
1871 &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
1872
1873 struct KVSyncThread : public Thread {
1874 BlueStore *store;
1875 explicit KVSyncThread(BlueStore *s) : store(s) {}
1876 void *entry() override {
1877 store->_kv_sync_thread();
1878 return NULL;
1879 }
1880 };
31f18b77
FG
1881 struct KVFinalizeThread : public Thread {
1882 BlueStore *store;
1883 explicit KVFinalizeThread(BlueStore *s) : store(s) {}
1884 void *entry() {
1885 store->_kv_finalize_thread();
1886 return NULL;
1887 }
1888 };
7c673cae
FG
1889
  /// histogram of kv key/value sizes, bucketed into slabs per key prefix
  /// (used for db statistics dumps)
  struct DBHistogram {
    struct value_dist {
      uint64_t count;    ///< number of values in this slab
      uint32_t max_len;  ///< largest value length seen in this slab
    };

    struct key_dist {
      uint64_t count;    ///< number of keys in this slab
      uint32_t max_len;  ///< largest key length seen in this slab
      map<int, struct value_dist> val_map; ///< slab id to count, max length of value and key
    };

    map<string, map<int, struct key_dist> > key_hist;  ///< prefix -> key-slab stats
    map<int, uint64_t> value_hist;                     ///< value-slab -> count
    int get_key_slab(size_t sz);
    string get_key_slab_to_range(int slab);
    int get_value_slab(size_t sz);
    string get_value_slab_to_range(int slab);
    /// account one key/value pair under its prefix and size slabs
    void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
		const string &prefix, size_t key_size, size_t value_size);
    void dump(Formatter *f);
  };
1912
1913 // --------------------------------------------------------
1914 // members
1915private:
1916 BlueFS *bluefs = nullptr;
1917 unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
1918 bool bluefs_single_shared_device = true;
11fdf7f2
TL
1919 mono_time bluefs_last_balance;
1920 utime_t next_dump_on_bluefs_alloc_failure;
7c673cae
FG
1921
1922 KeyValueDB *db = nullptr;
1923 BlockDevice *bdev = nullptr;
1924 std::string freelist_type;
1925 FreelistManager *fm = nullptr;
1926 Allocator *alloc = nullptr;
1927 uuid_d fsid;
1928 int path_fd = -1; ///< open handle to $path
1929 int fsid_fd = -1; ///< open handle (locked) to $path/fsid
1930 bool mounted = false;
1931
1932 RWLock coll_lock = {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map
31f18b77 1933 mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
eafe8130 1934 bool collections_had_errors = false;
11fdf7f2 1935 map<coll_t,CollectionRef> new_coll_map;
7c673cae
FG
1936
1937 vector<Cache*> cache_shards;
1938
11fdf7f2
TL
1939 /// protect zombie_osr_set
1940 ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
1941 std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections
7c673cae
FG
1942
1943 std::atomic<uint64_t> nid_last = {0};
1944 std::atomic<uint64_t> nid_max = {0};
1945 std::atomic<uint64_t> blobid_last = {0};
1946 std::atomic<uint64_t> blobid_max = {0};
1947
1948 Throttle throttle_bytes; ///< submit to commit
1949 Throttle throttle_deferred_bytes; ///< submit to deferred complete
1950
1951 interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
1952 interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
1953
11fdf7f2 1954 ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
7c673cae
FG
1955 std::atomic<uint64_t> deferred_seq = {0};
1956 deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
1957 int deferred_queue_size = 0; ///< num txc's queued across all osrs
1958 atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
11fdf7f2 1959 Finisher deferred_finisher, finisher;
7c673cae
FG
1960
1961 KVSyncThread kv_sync_thread;
11fdf7f2
TL
1962 ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
1963 ceph::condition_variable kv_cond;
3efd9988 1964 bool _kv_only = false;
31f18b77 1965 bool kv_sync_started = false;
7c673cae 1966 bool kv_stop = false;
31f18b77
FG
1967 bool kv_finalize_started = false;
1968 bool kv_finalize_stop = false;
7c673cae
FG
1969 deque<TransContext*> kv_queue; ///< ready, already submitted
1970 deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
1971 deque<TransContext*> kv_committing; ///< currently syncing
1972 deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
7c673cae 1973
31f18b77 1974 KVFinalizeThread kv_finalize_thread;
11fdf7f2
TL
1975 ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
1976 ceph::condition_variable kv_finalize_cond;
31f18b77
FG
1977 deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
1978 deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
1979
7c673cae
FG
1980 PerfCounters *logger = nullptr;
1981
7c673cae
FG
1982 list<CollectionRef> removed_collections;
1983
1984 RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"};
1985 set<ghobject_t> debug_data_error_objects;
1986 set<ghobject_t> debug_mdata_error_objects;
1987
1988 std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
1989
1990 uint64_t block_size = 0; ///< block size of block device (power of 2)
1991 uint64_t block_mask = 0; ///< mask to get just the block offset
1992 size_t block_size_order = 0; ///< bits to shift to get block size
1993
1994 uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
7c673cae 1995 ///< bits for min_alloc_size
224ce89b 1996 uint8_t min_alloc_size_order = 0;
7c673cae
FG
1997 static_assert(std::numeric_limits<uint8_t>::max() >
1998 std::numeric_limits<decltype(min_alloc_size)>::digits,
1999 "not enough bits for min_alloc_size");
2000
7c673cae
FG
2001 ///< maximum allocation unit (power of 2)
2002 std::atomic<uint64_t> max_alloc_size = {0};
2003
224ce89b
WB
2004 ///< number threshold for forced deferred writes
2005 std::atomic<int> deferred_batch_ops = {0};
2006
2007 ///< size threshold for forced deferred writes
2008 std::atomic<uint64_t> prefer_deferred_size = {0};
2009
7c673cae
FG
2010 ///< approx cost per io, in bytes
2011 std::atomic<uint64_t> throttle_cost_per_io = {0};
2012
224ce89b
WB
2013 std::atomic<Compressor::CompressionMode> comp_mode =
2014 {Compressor::COMP_NONE}; ///< compression mode
7c673cae
FG
2015 CompressorRef compressor;
2016 std::atomic<uint64_t> comp_min_blob_size = {0};
2017 std::atomic<uint64_t> comp_max_blob_size = {0};
2018
2019 std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
2020
31f18b77
FG
2021 uint64_t kv_ios = 0;
2022 uint64_t kv_throttle_costs = 0;
2023
7c673cae 2024 // cache trim control
91327a77
AA
2025 uint64_t cache_size = 0; ///< total cache size
2026 double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
2027 double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
2028 double cache_data_ratio = 0; ///< cache ratio dedicated to object data
2029 bool cache_autotune = false; ///< cache autotune setting
91327a77
AA
2030 double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
2031 uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
2032 uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
2033 double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
11fdf7f2 2034 uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
91327a77 2035 double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
11fdf7f2
TL
2036
2037 typedef map<uint64_t, volatile_statfs> osd_pools_map;
2038
2039 ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
31f18b77 2040 volatile_statfs vstatfs;
11fdf7f2
TL
2041 osd_pools_map osd_pools; // protected by vstatfs_lock as well
2042
2043 bool per_pool_stat_collection = true;
7c673cae
FG
2044
2045 struct MempoolThread : public Thread {
91327a77 2046 public:
7c673cae 2047 BlueStore *store;
91327a77 2048
11fdf7f2
TL
2049 ceph::condition_variable cond;
2050 ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
7c673cae 2051 bool stop = false;
91327a77 2052 uint64_t autotune_cache_size = 0;
11fdf7f2 2053 std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
eafe8130 2054 std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
91327a77
AA
2055
2056 struct MempoolCache : public PriorityCache::PriCache {
2057 BlueStore *store;
11fdf7f2
TL
2058 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
2059 int64_t committed_bytes = 0;
91327a77
AA
2060 double cache_ratio = 0;
2061
2062 MempoolCache(BlueStore *s) : store(s) {};
2063
2064 virtual uint64_t _get_used_bytes() const = 0;
2065
2066 virtual int64_t request_cache_bytes(
11fdf7f2 2067 PriorityCache::Priority pri, uint64_t total_cache) const {
91327a77
AA
2068 int64_t assigned = get_cache_bytes(pri);
2069
2070 switch (pri) {
eafe8130
TL
2071 // All cache items are currently shoved into the PRI1 priority
2072 case PriorityCache::Priority::PRI1:
91327a77 2073 {
11fdf7f2 2074 int64_t request = _get_used_bytes();
91327a77
AA
2075 return(request > assigned) ? request - assigned : 0;
2076 }
2077 default:
2078 break;
2079 }
2080 return -EOPNOTSUPP;
2081 }
2082
2083 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
2084 return cache_bytes[pri];
2085 }
2086 virtual int64_t get_cache_bytes() const {
2087 int64_t total = 0;
2088
2089 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
2090 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
2091 total += get_cache_bytes(pri);
2092 }
2093 return total;
2094 }
2095 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2096 cache_bytes[pri] = bytes;
2097 }
2098 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2099 cache_bytes[pri] += bytes;
2100 }
11fdf7f2
TL
2101 virtual int64_t commit_cache_size(uint64_t total_cache) {
2102 committed_bytes = PriorityCache::get_chunk(
2103 get_cache_bytes(), total_cache);
2104 return committed_bytes;
2105 }
2106 virtual int64_t get_committed_size() const {
2107 return committed_bytes;
91327a77
AA
2108 }
2109 virtual double get_cache_ratio() const {
2110 return cache_ratio;
2111 }
2112 virtual void set_cache_ratio(double ratio) {
2113 cache_ratio = ratio;
2114 }
2115 virtual string get_cache_name() const = 0;
2116 };
2117
2118 struct MetaCache : public MempoolCache {
2119 MetaCache(BlueStore *s) : MempoolCache(s) {};
2120
2121 virtual uint64_t _get_used_bytes() const {
2122 return mempool::bluestore_cache_other::allocated_bytes() +
2123 mempool::bluestore_cache_onode::allocated_bytes();
2124 }
2125
2126 virtual string get_cache_name() const {
2127 return "BlueStore Meta Cache";
2128 }
2129
2130 uint64_t _get_num_onodes() const {
2131 uint64_t onode_num =
2132 mempool::bluestore_cache_onode::allocated_items();
2133 return (2 > onode_num) ? 2 : onode_num;
2134 }
2135
2136 double get_bytes_per_onode() const {
2137 return (double)_get_used_bytes() / (double)_get_num_onodes();
2138 }
11fdf7f2
TL
2139 };
2140 std::shared_ptr<MetaCache> meta_cache;
91327a77
AA
2141
2142 struct DataCache : public MempoolCache {
2143 DataCache(BlueStore *s) : MempoolCache(s) {};
2144
2145 virtual uint64_t _get_used_bytes() const {
2146 uint64_t bytes = 0;
2147 for (auto i : store->cache_shards) {
2148 bytes += i->_get_buffer_bytes();
2149 }
2150 return bytes;
2151 }
2152 virtual string get_cache_name() const {
2153 return "BlueStore Data Cache";
2154 }
11fdf7f2
TL
2155 };
2156 std::shared_ptr<DataCache> data_cache;
91327a77 2157
7c673cae
FG
2158 public:
2159 explicit MempoolThread(BlueStore *s)
2160 : store(s),
11fdf7f2
TL
2161 meta_cache(new MetaCache(s)),
2162 data_cache(new DataCache(s)) {}
91327a77 2163
7c673cae
FG
2164 void *entry() override;
2165 void init() {
11fdf7f2 2166 ceph_assert(stop == false);
7c673cae
FG
2167 create("bstore_mempool");
2168 }
2169 void shutdown() {
11fdf7f2 2170 lock.lock();
7c673cae 2171 stop = true;
11fdf7f2
TL
2172 cond.notify_all();
2173 lock.unlock();
7c673cae
FG
2174 join();
2175 }
91327a77
AA
2176
2177 private:
2178 void _adjust_cache_settings();
2179 void _trim_shards(bool interval_stats);
2180 void _tune_cache_size(bool interval_stats);
11fdf7f2
TL
2181 void _balance_cache(
2182 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches);
2183 void _balance_cache_pri(
2184 int64_t *mem_avail,
2185 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches,
2186 PriorityCache::Priority pri);
7c673cae
FG
2187 } mempool_thread;
2188
2189 // --------------------------------------------------------
2190 // private methods
2191
2192 void _init_logger();
2193 void _shutdown_logger();
2194 int _reload_logger();
2195
2196 int _open_path();
2197 void _close_path();
2198 int _open_fsid(bool create);
2199 int _lock_fsid();
2200 int _read_fsid(uuid_d *f);
2201 int _write_fsid();
2202 void _close_fsid();
2203 void _set_alloc_sizes();
2204 void _set_blob_size();
1adf2230 2205 void _set_finisher_num();
7c673cae
FG
2206
2207 int _open_bdev(bool create);
11fdf7f2
TL
2208 // Verifies if disk space is enough for reserved + min bluefs
2209 // and alters the latter if needed.
2210 // Depends on min_alloc_size hence should be called after
2211 // its initialization (and outside of _open_bdev)
2212 void _validate_bdev();
7c673cae 2213 void _close_bdev();
11fdf7f2
TL
2214
2215 int _minimal_open_bluefs(bool create);
2216 void _minimal_close_bluefs();
2217 int _open_bluefs(bool create);
2218 void _close_bluefs();
2219
2220 // Limited (u)mount intended for BlueFS operations only
2221 int _mount_for_bluefs();
2222 void _umount_for_bluefs();
2223
2224
2225 int _is_bluefs(bool create, bool* ret);
2226 /*
2227 * opens both DB and dependant super_meta, FreelistManager and allocator
2228 * in the proper order
2229 */
2230 int _open_db_and_around(bool read_only);
2231 void _close_db_and_around();
2232
2233 // updates legacy bluefs related recs in DB to a state valid for
2234 // downgrades from nautilus.
2235 void _sync_bluefs_and_fm();
2236
2237 /*
2238 * @warning to_repair_db means that we open this db to repair it, will not
2239 * hold the rocksdb's file lock.
2240 */
2241 int _open_db(bool create,
2242 bool to_repair_db=false,
2243 bool read_only = false);
7c673cae 2244 void _close_db();
11fdf7f2 2245 int _open_fm(KeyValueDB::Transaction t);
7c673cae
FG
2246 void _close_fm();
2247 int _open_alloc();
2248 void _close_alloc();
eafe8130
TL
2249 int _open_collections();
2250 void _fsck_collections(int64_t* errors);
7c673cae
FG
2251 void _close_collections();
2252
2253 int _setup_block_symlink_or_file(string name, string path, uint64_t size,
2254 bool create);
2255
7c673cae 2256public:
3efd9988
FG
2257 static int _write_bdev_label(CephContext* cct,
2258 string path, bluestore_bdev_label_t label);
7c673cae
FG
2259 static int _read_bdev_label(CephContext* cct, string path,
2260 bluestore_bdev_label_t *label);
2261private:
2262 int _check_or_set_bdev_label(string path, uint64_t size, string desc,
2263 bool create);
2264
2265 int _open_super_meta();
2266
224ce89b 2267 void _open_statfs();
11fdf7f2 2268 void _get_statfs_overall(struct store_statfs_t *buf);
31f18b77 2269
11fdf7f2
TL
2270 void _dump_alloc_on_failure();
2271
2272 int64_t _get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total);
2273 int _balance_bluefs_freespace();
7c673cae
FG
2274
2275 CollectionRef _get_collection(const coll_t& cid);
2276 void _queue_reap_collection(CollectionRef& c);
2277 void _reap_collections();
2278 void _update_cache_logger();
2279
2280 void _assign_nid(TransContext *txc, OnodeRef o);
2281 uint64_t _assign_blobid(TransContext *txc);
2282
81eedcae
TL
2283 template <int LogLevelV>
2284 friend void _dump_onode(CephContext *cct, const Onode& o);
2285 template <int LogLevelV>
2286 friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
2287 template <int LogLevelV>
2288 friend void _dump_transaction(CephContext *cct, Transaction *t);
7c673cae 2289
11fdf7f2
TL
2290 TransContext *_txc_create(Collection *c, OpSequencer *osr,
2291 list<Context*> *on_commits);
7c673cae
FG
2292 void _txc_update_store_statfs(TransContext *txc);
2293 void _txc_add_transaction(TransContext *txc, Transaction *t);
2294 void _txc_calc_cost(TransContext *txc);
2295 void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
2296 void _txc_state_proc(TransContext *txc);
2297 void _txc_aio_submit(TransContext *txc);
2298public:
2299 void txc_aio_finish(void *p) {
2300 _txc_state_proc(static_cast<TransContext*>(p));
2301 }
2302private:
2303 void _txc_finish_io(TransContext *txc);
2304 void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
2305 void _txc_applied_kv(TransContext *txc);
2306 void _txc_committed_kv(TransContext *txc);
2307 void _txc_finish(TransContext *txc);
2308 void _txc_release_alloc(TransContext *txc);
2309
11fdf7f2
TL
2310 void _osr_attach(Collection *c);
2311 void _osr_register_zombie(OpSequencer *osr);
2312 void _osr_drain(OpSequencer *osr);
7c673cae
FG
2313 void _osr_drain_preceding(TransContext *txc);
2314 void _osr_drain_all();
7c673cae 2315
31f18b77
FG
2316 void _kv_start();
2317 void _kv_stop();
7c673cae 2318 void _kv_sync_thread();
31f18b77 2319 void _kv_finalize_thread();
7c673cae
FG
2320
2321 bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
2322 void _deferred_queue(TransContext *txc);
3efd9988 2323public:
224ce89b 2324 void deferred_try_submit();
3efd9988 2325private:
224ce89b 2326 void _deferred_submit_unlock(OpSequencer *osr);
7c673cae
FG
2327 void _deferred_aio_finish(OpSequencer *osr);
2328 int _deferred_replay();
2329
2330public:
2331 using mempool_dynamic_bitset =
2332 boost::dynamic_bitset<uint64_t,
2333 mempool::bluestore_fsck::pool_allocator<uint64_t>>;
eafe8130
TL
2334 using per_pool_statfs =
2335 mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;
2336
2337 enum FSCKDepth {
2338 FSCK_REGULAR,
2339 FSCK_DEEP,
2340 FSCK_SHALLOW
2341 };
7c673cae
FG
2342
2343private:
2344 int _fsck_check_extents(
11fdf7f2 2345 const coll_t& cid,
7c673cae
FG
2346 const ghobject_t& oid,
2347 const PExtentVector& extents,
2348 bool compressed,
2349 mempool_dynamic_bitset &used_blocks,
b32b8144 2350 uint64_t granularity,
11fdf7f2 2351 BlueStoreRepairer* repairer,
eafe8130
TL
2352 store_statfs_t& expected_statfs,
2353 FSCKDepth depth);
7c673cae 2354
11fdf7f2
TL
2355 void _fsck_check_pool_statfs(
2356 per_pool_statfs& expected_pool_statfs,
eafe8130
TL
2357 int64_t& errors,
2358 int64_t &warnings,
11fdf7f2
TL
2359 BlueStoreRepairer* repairer);
2360
eafe8130
TL
2361 int _fsck(FSCKDepth depth, bool repair);
2362 int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
2363
7c673cae
FG
2364 void _buffer_cache_write(
2365 TransContext *txc,
2366 BlobRef b,
2367 uint64_t offset,
2368 bufferlist& bl,
2369 unsigned flags) {
2370 b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
2371 flags);
2372 txc->shared_blobs_written.insert(b->shared_blob);
2373 }
2374
2375 int _collection_list(
2376 Collection *c, const ghobject_t& start, const ghobject_t& end,
2377 int max, vector<ghobject_t> *ls, ghobject_t *next);
2378
2379 template <typename T, typename F>
2380 T select_option(const std::string& opt_name, T val1, F f) {
2381 //NB: opt_name reserved for future use
2382 boost::optional<T> val2 = f();
2383 if (val2) {
2384 return *val2;
2385 }
2386 return val1;
2387 }
2388
2389 void _apply_padding(uint64_t head_pad,
2390 uint64_t tail_pad,
7c673cae
FG
2391 bufferlist& padded);
2392
11fdf7f2
TL
2393 void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
2394
7c673cae
FG
2395 // -- ondisk version ---
2396public:
2397 const int32_t latest_ondisk_format = 2; ///< our version
2398 const int32_t min_readable_ondisk_format = 1; ///< what we can read
2399 const int32_t min_compat_ondisk_format = 2; ///< who can read us
2400
2401private:
2402 int32_t ondisk_format = 0; ///< value detected on mount
2403
2404 int _upgrade_super(); ///< upgrade (called during open_super)
11fdf7f2 2405 uint64_t _get_ondisk_reserved() const;
7c673cae
FG
2406 void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
2407
2408 // --- public interface ---
2409public:
2410 BlueStore(CephContext *cct, const string& path);
2411 BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
2412 ~BlueStore() override;
2413
2414 string get_type() override {
2415 return "bluestore";
2416 }
2417
2418 bool needs_journal() override { return false; };
2419 bool wants_journal() override { return false; };
2420 bool allows_journal() override { return false; };
2421
11fdf7f2
TL
2422 int get_devices(set<string> *ls) override;
2423
31f18b77 2424 bool is_rotational() override;
d2e6a577 2425 bool is_journal_rotational() override;
31f18b77 2426
224ce89b
WB
2427 string get_default_device_class() override {
2428 string device_class;
2429 map<string, string> metadata;
2430 collect_metadata(&metadata);
2431 auto it = metadata.find("bluestore_bdev_type");
2432 if (it != metadata.end()) {
2433 device_class = it->second;
2434 }
2435 return device_class;
2436 }
2437
11fdf7f2
TL
2438 int get_numa_node(
2439 int *numa_node,
2440 set<int> *nodes,
2441 set<string> *failed) override;
2442
7c673cae
FG
2443 static int get_block_device_fsid(CephContext* cct, const string& path,
2444 uuid_d *fsid);
2445
2446 bool test_mount_in_use() override;
2447
2448private:
11fdf7f2 2449 int _mount(bool kv_only, bool open_db=true);
7c673cae
FG
2450public:
2451 int mount() override {
2452 return _mount(false);
2453 }
2454 int umount() override;
2455
11fdf7f2
TL
2456 int start_kv_only(KeyValueDB **pdb, bool open_db=true) {
2457 int r = _mount(true, open_db);
7c673cae
FG
2458 if (r < 0)
2459 return r;
2460 *pdb = db;
2461 return 0;
2462 }
2463
3efd9988
FG
2464 int write_meta(const std::string& key, const std::string& value) override;
2465 int read_meta(const std::string& key, std::string *value) override;
2466
eafe8130
TL
2467 int cold_open();
2468 int cold_close();
3efd9988
FG
2469
2470 int fsck(bool deep) override {
eafe8130 2471 return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
3efd9988
FG
2472 }
2473 int repair(bool deep) override {
eafe8130
TL
2474 return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
2475 }
2476 int quick_fix() override {
2477 return _fsck(FSCK_SHALLOW, true);
3efd9988 2478 }
7c673cae
FG
2479
2480 void set_cache_shards(unsigned num) override;
11fdf7f2
TL
2481 void dump_cache_stats(Formatter *f) override {
2482 int onode_count = 0, buffers_bytes = 0;
2483 for (auto i: cache_shards) {
2484 onode_count += i->_get_num_onodes();
2485 buffers_bytes += i->_get_buffer_bytes();
2486 }
2487 f->dump_int("bluestore_onode", onode_count);
2488 f->dump_int("bluestore_buffers", buffers_bytes);
2489 }
2490 void dump_cache_stats(ostream& ss) override {
2491 int onode_count = 0, buffers_bytes = 0;
2492 for (auto i: cache_shards) {
2493 onode_count += i->_get_num_onodes();
2494 buffers_bytes += i->_get_buffer_bytes();
2495 }
2496 ss << "bluestore_onode: " << onode_count;
2497 ss << "bluestore_buffers: " << buffers_bytes;
2498 }
7c673cae
FG
2499
2500 int validate_hobject_key(const hobject_t &obj) const override {
2501 return 0;
2502 }
2503 unsigned get_max_attr_name_length() override {
2504 return 256; // arbitrary; there is no real limit internally
2505 }
2506
2507 int mkfs() override;
2508 int mkjournal() override {
2509 return 0;
2510 }
2511
2512 void get_db_statistics(Formatter *f) override;
2513 void generate_db_histogram(Formatter *f) override;
31f18b77 2514 void _flush_cache();
11fdf7f2 2515 int flush_cache(ostream *os = NULL) override;
7c673cae
FG
2516 void dump_perf_counters(Formatter *f) override {
2517 f->open_object_section("perf_counters");
2518 logger->dump_formatted(f, false);
2519 f->close_section();
2520 }
2521
11fdf7f2
TL
2522 int add_new_bluefs_device(int id, const string& path);
2523 int migrate_to_existing_bluefs_device(const set<int>& devs_source,
2524 int id);
2525 int migrate_to_new_bluefs_device(const set<int>& devs_source,
2526 int id,
2527 const string& path);
2528 int expand_devices(ostream& out);
2529 string get_device_path(unsigned id);
7c673cae
FG
2530
2531public:
11fdf7f2
TL
2532 int statfs(struct store_statfs_t *buf,
2533 osd_alert_list_t* alerts = nullptr) override;
2534 int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
7c673cae
FG
2535
2536 void collect_metadata(map<string,string> *pm) override;
2537
7c673cae
FG
2538 bool exists(CollectionHandle &c, const ghobject_t& oid) override;
2539 int set_collection_opts(
11fdf7f2 2540 CollectionHandle& c,
7c673cae 2541 const pool_opts_t& opts) override;
7c673cae
FG
2542 int stat(
2543 CollectionHandle &c,
2544 const ghobject_t& oid,
2545 struct stat *st,
2546 bool allow_eio = false) override;
7c673cae
FG
2547 int read(
2548 CollectionHandle &c,
2549 const ghobject_t& oid,
2550 uint64_t offset,
2551 size_t len,
2552 bufferlist& bl,
224ce89b 2553 uint32_t op_flags = 0) override;
7c673cae
FG
2554 int _do_read(
2555 Collection *c,
2556 OnodeRef o,
2557 uint64_t offset,
2558 size_t len,
2559 bufferlist& bl,
f64942e4
AA
2560 uint32_t op_flags = 0,
2561 uint64_t retry_count = 0);
7c673cae
FG
2562
2563private:
2564 int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
2565 uint64_t offset, size_t len, interval_set<uint64_t>& destset);
2566public:
7c673cae
FG
2567 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2568 uint64_t offset, size_t len, bufferlist& bl) override;
7c673cae
FG
2569 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2570 uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
2571
2572
7c673cae
FG
2573 int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
2574 bufferptr& value) override;
2575
7c673cae
FG
2576 int getattrs(CollectionHandle &c, const ghobject_t& oid,
2577 map<string,bufferptr>& aset) override;
2578
2579 int list_collections(vector<coll_t>& ls) override;
2580
2581 CollectionHandle open_collection(const coll_t &c) override;
11fdf7f2
TL
2582 CollectionHandle create_new_collection(const coll_t& cid) override;
2583 void set_collection_commit_queue(const coll_t& cid,
2584 ContextQueue *commit_queue) override;
7c673cae
FG
2585
2586 bool collection_exists(const coll_t& c) override;
11fdf7f2
TL
2587 int collection_empty(CollectionHandle& c, bool *empty) override;
2588 int collection_bits(CollectionHandle& c) override;
7c673cae 2589
7c673cae
FG
2590 int collection_list(CollectionHandle &c,
2591 const ghobject_t& start,
2592 const ghobject_t& end,
2593 int max,
2594 vector<ghobject_t> *ls, ghobject_t *next) override;
2595
7c673cae
FG
2596 int omap_get(
2597 CollectionHandle &c, ///< [in] Collection containing oid
2598 const ghobject_t &oid, ///< [in] Object containing omap
2599 bufferlist *header, ///< [out] omap header
2600 map<string, bufferlist> *out /// < [out] Key to value map
2601 ) override;
2602
2603 /// Get omap header
7c673cae
FG
2604 int omap_get_header(
2605 CollectionHandle &c, ///< [in] Collection containing oid
2606 const ghobject_t &oid, ///< [in] Object containing omap
2607 bufferlist *header, ///< [out] omap header
2608 bool allow_eio = false ///< [in] don't assert on eio
2609 ) override;
2610
2611 /// Get keys defined on oid
7c673cae
FG
2612 int omap_get_keys(
2613 CollectionHandle &c, ///< [in] Collection containing oid
2614 const ghobject_t &oid, ///< [in] Object containing omap
2615 set<string> *keys ///< [out] Keys defined on oid
2616 ) override;
2617
2618 /// Get key values
7c673cae
FG
2619 int omap_get_values(
2620 CollectionHandle &c, ///< [in] Collection containing oid
2621 const ghobject_t &oid, ///< [in] Object containing omap
2622 const set<string> &keys, ///< [in] Keys to get
2623 map<string, bufferlist> *out ///< [out] Returned keys and values
2624 ) override;
2625
2626 /// Filters keys into out which are defined on oid
7c673cae
FG
2627 int omap_check_keys(
2628 CollectionHandle &c, ///< [in] Collection containing oid
2629 const ghobject_t &oid, ///< [in] Object containing omap
2630 const set<string> &keys, ///< [in] Keys to check
2631 set<string> *out ///< [out] Subset of keys defined on oid
2632 ) override;
2633
7c673cae
FG
2634 ObjectMap::ObjectMapIterator get_omap_iterator(
2635 CollectionHandle &c, ///< [in] collection
2636 const ghobject_t &oid ///< [in] object
2637 ) override;
2638
2639 void set_fsid(uuid_d u) override {
2640 fsid = u;
2641 }
2642 uuid_d get_fsid() override {
2643 return fsid;
2644 }
2645
2646 uint64_t estimate_objects_overhead(uint64_t num_objects) override {
2647 return num_objects * 300; //assuming per-object overhead is 300 bytes
2648 }
2649
2650 struct BSPerfTracker {
11fdf7f2
TL
2651 PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
2652 PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
7c673cae
FG
2653
2654 objectstore_perf_stat_t get_cur_stats() const {
2655 objectstore_perf_stat_t ret;
11fdf7f2
TL
2656 ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
2657 ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
7c673cae
FG
2658 return ret;
2659 }
2660
2661 void update_from_perfcounters(PerfCounters &logger);
2662 } perf_tracker;
2663
2664 objectstore_perf_stat_t get_cur_stats() override {
2665 perf_tracker.update_from_perfcounters(*logger);
2666 return perf_tracker.get_cur_stats();
2667 }
2668 const PerfCounters* get_perf_counters() const override {
2669 return logger;
2670 }
2671
2672 int queue_transactions(
11fdf7f2 2673 CollectionHandle& ch,
7c673cae
FG
2674 vector<Transaction>& tls,
2675 TrackedOpRef op = TrackedOpRef(),
2676 ThreadPool::TPHandle *handle = NULL) override;
2677
2678 // error injection
2679 void inject_data_error(const ghobject_t& o) override {
2680 RWLock::WLocker l(debug_read_error_lock);
2681 debug_data_error_objects.insert(o);
2682 }
2683 void inject_mdata_error(const ghobject_t& o) override {
2684 RWLock::WLocker l(debug_read_error_lock);
2685 debug_mdata_error_objects.insert(o);
2686 }
11fdf7f2
TL
2687
2688 /// methods to inject various errors fsck can repair
2689 void inject_broken_shared_blob_key(const string& key,
2690 const bufferlist& bl);
2691 void inject_leaked(uint64_t len);
2692 void inject_false_free(coll_t cid, ghobject_t oid);
2693 void inject_statfs(const string& key, const store_statfs_t& new_statfs);
eafe8130 2694 void inject_global_statfs(const store_statfs_t& new_statfs);
11fdf7f2
TL
2695 void inject_misreference(coll_t cid1, ghobject_t oid1,
2696 coll_t cid2, ghobject_t oid2,
2697 uint64_t offset);
2698
224ce89b 2699 void compact() override {
11fdf7f2 2700 ceph_assert(db);
224ce89b
WB
2701 db->compact();
2702 }
28e407b8
AA
2703 bool has_builtin_csum() const override {
2704 return true;
2705 }
2706
11fdf7f2
TL
2707 /*
2708 Allocate space for BlueFS from slow device.
2709 Either automatically applies allocated extents to underlying
2710 BlueFS (extents == nullptr) or just return them (non-null extents) provided
2711 */
2712 int allocate_bluefs_freespace(
2713 uint64_t min_size,
2714 uint64_t size,
2715 PExtentVector* extents);
2716
494da23a
TL
2717 inline void log_latency(const char* name,
2718 int idx,
2719 const ceph::timespan& lat,
2720 double lat_threshold,
2721 const char* info = "") const;
2722
2723 inline void log_latency_fn(const char* name,
2724 int idx,
2725 const ceph::timespan& lat,
2726 double lat_threshold,
2727 std::function<string (const ceph::timespan& lat)> fn) const;
11fdf7f2 2728
7c673cae
FG
2729private:
2730 bool _debug_data_eio(const ghobject_t& o) {
2731 if (!cct->_conf->bluestore_debug_inject_read_err) {
2732 return false;
2733 }
2734 RWLock::RLocker l(debug_read_error_lock);
2735 return debug_data_error_objects.count(o);
2736 }
2737 bool _debug_mdata_eio(const ghobject_t& o) {
2738 if (!cct->_conf->bluestore_debug_inject_read_err) {
2739 return false;
2740 }
2741 RWLock::RLocker l(debug_read_error_lock);
2742 return debug_mdata_error_objects.count(o);
2743 }
2744 void _debug_obj_on_delete(const ghobject_t& o) {
2745 if (cct->_conf->bluestore_debug_inject_read_err) {
2746 RWLock::WLocker l(debug_read_error_lock);
2747 debug_data_error_objects.erase(o);
2748 debug_mdata_error_objects.erase(o);
2749 }
2750 }
11fdf7f2
TL
2751private:
2752 ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
2753 string failed_cmode;
2754 set<string> failed_compressors;
2755 string spillover_alert;
81eedcae
TL
2756 string legacy_statfs_alert;
2757 string disk_size_mismatch_alert;
11fdf7f2
TL
2758
2759 void _log_alerts(osd_alert_list_t& alerts);
2760 bool _set_compression_alert(bool cmode, const char* s) {
2761 std::lock_guard l(qlock);
2762 if (cmode) {
2763 bool ret = failed_cmode.empty();
2764 failed_cmode = s;
2765 return ret;
2766 }
2767 return failed_compressors.emplace(s).second;
2768 }
2769 void _clear_compression_alert() {
2770 std::lock_guard l(qlock);
2771 failed_compressors.clear();
2772 failed_cmode.clear();
2773 }
2774
2775 void _set_spillover_alert(const string& s) {
2776 std::lock_guard l(qlock);
2777 spillover_alert = s;
2778 }
2779 void _clear_spillover_alert() {
2780 std::lock_guard l(qlock);
2781 spillover_alert.clear();
2782 }
7c673cae 2783
81eedcae
TL
2784 void _check_legacy_statfs_alert();
2785 void _set_disk_size_mismatch_alert(const string& s) {
2786 std::lock_guard l(qlock);
2787 disk_size_mismatch_alert = s;
2788 }
2789
7c673cae
FG
2790private:
2791
2792 // --------------------------------------------------------
2793 // read processing internal methods
2794 int _verify_csum(
2795 OnodeRef& o,
2796 const bluestore_blob_t* blob,
2797 uint64_t blob_xoffset,
2798 const bufferlist& bl,
2799 uint64_t logical_offset) const;
2800 int _decompress(bufferlist& source, bufferlist* result);
2801
2802
2803 // --------------------------------------------------------
2804 // write ops
2805
2806 struct WriteContext {
2807 bool buffered = false; ///< buffered write
2808 bool compress = false; ///< compressed write
2809 uint64_t target_blob_size = 0; ///< target (max) blob size
2810 unsigned csum_order = 0; ///< target checksum chunk order
2811
2812 old_extent_map_t old_extents; ///< must deref these blobs
eafe8130 2813 interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
7c673cae
FG
2814
2815 struct write_item {
2816 uint64_t logical_offset; ///< write logical offset
2817 BlobRef b;
2818 uint64_t blob_length;
2819 uint64_t b_off;
2820 bufferlist bl;
2821 uint64_t b_off0; ///< original offset in a blob prior to padding
2822 uint64_t length0; ///< original data length prior to padding
2823
2824 bool mark_unused;
2825 bool new_blob; ///< whether new blob was created
2826
3efd9988
FG
2827 bool compressed = false;
2828 bufferlist compressed_bl;
2829 size_t compressed_len = 0;
2830
7c673cae
FG
2831 write_item(
2832 uint64_t logical_offs,
2833 BlobRef b,
2834 uint64_t blob_len,
2835 uint64_t o,
2836 bufferlist& bl,
2837 uint64_t o0,
2838 uint64_t l0,
2839 bool _mark_unused,
2840 bool _new_blob)
2841 :
2842 logical_offset(logical_offs),
2843 b(b),
2844 blob_length(blob_len),
2845 b_off(o),
2846 bl(bl),
2847 b_off0(o0),
2848 length0(l0),
2849 mark_unused(_mark_unused),
2850 new_blob(_new_blob) {}
2851 };
2852 vector<write_item> writes; ///< blobs we're writing
2853
2854 /// partial clone of the context
2855 void fork(const WriteContext& other) {
2856 buffered = other.buffered;
2857 compress = other.compress;
2858 target_blob_size = other.target_blob_size;
2859 csum_order = other.csum_order;
2860 }
2861 void write(
2862 uint64_t loffs,
2863 BlobRef b,
2864 uint64_t blob_len,
2865 uint64_t o,
2866 bufferlist& bl,
2867 uint64_t o0,
2868 uint64_t len0,
2869 bool _mark_unused,
2870 bool _new_blob) {
2871 writes.emplace_back(loffs,
2872 b,
2873 blob_len,
2874 o,
2875 bl,
2876 o0,
2877 len0,
2878 _mark_unused,
2879 _new_blob);
2880 }
2881 /// Checks for writes to the same pextent within a blob
2882 bool has_conflict(
2883 BlobRef b,
2884 uint64_t loffs,
2885 uint64_t loffs_end,
2886 uint64_t min_alloc_size);
2887 };
2888
2889 void _do_write_small(
2890 TransContext *txc,
2891 CollectionRef &c,
2892 OnodeRef o,
2893 uint64_t offset, uint64_t length,
2894 bufferlist::iterator& blp,
2895 WriteContext *wctx);
2896 void _do_write_big(
2897 TransContext *txc,
2898 CollectionRef &c,
2899 OnodeRef o,
2900 uint64_t offset, uint64_t length,
2901 bufferlist::iterator& blp,
2902 WriteContext *wctx);
2903 int _do_alloc_write(
2904 TransContext *txc,
2905 CollectionRef c,
2906 OnodeRef o,
2907 WriteContext *wctx);
2908 void _wctx_finish(
2909 TransContext *txc,
2910 CollectionRef& c,
2911 OnodeRef o,
31f18b77
FG
2912 WriteContext *wctx,
2913 set<SharedBlob*> *maybe_unshared_blobs=0);
7c673cae 2914
7c673cae
FG
2915 int _write(TransContext *txc,
2916 CollectionRef& c,
2917 OnodeRef& o,
2918 uint64_t offset, size_t len,
2919 bufferlist& bl,
2920 uint32_t fadvise_flags);
2921 void _pad_zeros(bufferlist *bl, uint64_t *offset,
2922 uint64_t chunk_size);
2923
31f18b77
FG
2924 void _choose_write_options(CollectionRef& c,
2925 OnodeRef o,
2926 uint32_t fadvise_flags,
2927 WriteContext *wctx);
2928
2929 int _do_gc(TransContext *txc,
2930 CollectionRef& c,
2931 OnodeRef o,
31f18b77
FG
2932 const WriteContext& wctx,
2933 uint64_t *dirty_start,
2934 uint64_t *dirty_end);
2935
7c673cae
FG
2936 int _do_write(TransContext *txc,
2937 CollectionRef &c,
2938 OnodeRef o,
2939 uint64_t offset, uint64_t length,
2940 bufferlist& bl,
2941 uint32_t fadvise_flags);
2942 void _do_write_data(TransContext *txc,
2943 CollectionRef& c,
2944 OnodeRef o,
2945 uint64_t offset,
2946 uint64_t length,
2947 bufferlist& bl,
2948 WriteContext *wctx);
2949
2950 int _touch(TransContext *txc,
2951 CollectionRef& c,
2952 OnodeRef& o);
2953 int _do_zero(TransContext *txc,
2954 CollectionRef& c,
2955 OnodeRef& o,
2956 uint64_t offset, size_t len);
2957 int _zero(TransContext *txc,
2958 CollectionRef& c,
2959 OnodeRef& o,
2960 uint64_t offset, size_t len);
2961 void _do_truncate(TransContext *txc,
2962 CollectionRef& c,
2963 OnodeRef o,
31f18b77
FG
2964 uint64_t offset,
2965 set<SharedBlob*> *maybe_unshared_blobs=0);
35e4c445 2966 int _truncate(TransContext *txc,
7c673cae
FG
2967 CollectionRef& c,
2968 OnodeRef& o,
2969 uint64_t offset);
2970 int _remove(TransContext *txc,
2971 CollectionRef& c,
2972 OnodeRef& o);
2973 int _do_remove(TransContext *txc,
2974 CollectionRef& c,
2975 OnodeRef o);
2976 int _setattr(TransContext *txc,
2977 CollectionRef& c,
2978 OnodeRef& o,
2979 const string& name,
2980 bufferptr& val);
2981 int _setattrs(TransContext *txc,
2982 CollectionRef& c,
2983 OnodeRef& o,
2984 const map<string,bufferptr>& aset);
2985 int _rmattr(TransContext *txc,
2986 CollectionRef& c,
2987 OnodeRef& o,
2988 const string& name);
2989 int _rmattrs(TransContext *txc,
2990 CollectionRef& c,
2991 OnodeRef& o);
11fdf7f2 2992 void _do_omap_clear(TransContext *txc, const string& prefix, uint64_t id);
7c673cae
FG
2993 int _omap_clear(TransContext *txc,
2994 CollectionRef& c,
2995 OnodeRef& o);
2996 int _omap_setkeys(TransContext *txc,
2997 CollectionRef& c,
2998 OnodeRef& o,
2999 bufferlist& bl);
3000 int _omap_setheader(TransContext *txc,
3001 CollectionRef& c,
3002 OnodeRef& o,
3003 bufferlist& header);
3004 int _omap_rmkeys(TransContext *txc,
3005 CollectionRef& c,
3006 OnodeRef& o,
3007 bufferlist& bl);
3008 int _omap_rmkey_range(TransContext *txc,
3009 CollectionRef& c,
3010 OnodeRef& o,
3011 const string& first, const string& last);
3012 int _set_alloc_hint(
3013 TransContext *txc,
3014 CollectionRef& c,
3015 OnodeRef& o,
3016 uint64_t expected_object_size,
3017 uint64_t expected_write_size,
3018 uint32_t flags);
3019 int _do_clone_range(TransContext *txc,
3020 CollectionRef& c,
3021 OnodeRef& oldo,
3022 OnodeRef& newo,
3023 uint64_t srcoff, uint64_t length, uint64_t dstoff);
3024 int _clone(TransContext *txc,
3025 CollectionRef& c,
3026 OnodeRef& oldo,
3027 OnodeRef& newo);
3028 int _clone_range(TransContext *txc,
3029 CollectionRef& c,
3030 OnodeRef& oldo,
3031 OnodeRef& newo,
3032 uint64_t srcoff, uint64_t length, uint64_t dstoff);
3033 int _rename(TransContext *txc,
3034 CollectionRef& c,
3035 OnodeRef& oldo,
3036 OnodeRef& newo,
3037 const ghobject_t& new_oid);
3038 int _create_collection(TransContext *txc, const coll_t &cid,
3039 unsigned bits, CollectionRef *c);
3040 int _remove_collection(TransContext *txc, const coll_t &cid,
3041 CollectionRef *c);
11fdf7f2 3042 void _do_remove_collection(TransContext *txc, CollectionRef *c);
7c673cae
FG
3043 int _split_collection(TransContext *txc,
3044 CollectionRef& c,
3045 CollectionRef& d,
3046 unsigned bits, int rem);
11fdf7f2
TL
3047 int _merge_collection(TransContext *txc,
3048 CollectionRef *c,
3049 CollectionRef& d,
3050 unsigned bits);
3051
3052private:
3053 std::atomic<uint64_t> out_of_sync_fm = {0};
3054 // --------------------------------------------------------
3055 // BlueFSDeviceExpander implementation
3056 uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
3057 uint64_t bluefs_total) override {
3058 auto delta = _get_bluefs_size_delta(bluefs_free, bluefs_total);
3059 return delta > 0 ? delta : 0;
3060 }
3061 int allocate_freespace(
3062 uint64_t min_size,
3063 uint64_t size,
3064 PExtentVector& extents) override {
3065 return allocate_bluefs_freespace(min_size, size, &extents);
3066 };
eafe8130
TL
3067 size_t available_freespace(uint64_t alloc_size) override;
3068
3069public:
3070 struct sb_info_t {
3071 coll_t cid;
3072 int64_t pool_id = INT64_MIN;
3073 list<ghobject_t> oids;
3074 BlueStore::SharedBlobRef sb;
3075 bluestore_extent_ref_map_t ref_map;
3076 bool compressed = false;
3077 bool passed = false;
3078 bool updated = false;
3079 };
3080 typedef btree::btree_set<
3081 uint64_t, std::less<uint64_t>,
3082 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
3083
3084 typedef mempool::bluestore_fsck::map<uint64_t, sb_info_t> sb_info_map_t;
3085 struct FSCK_ObjectCtx {
3086 int64_t& errors;
3087 int64_t& warnings;
3088 uint64_t& num_objects;
3089 uint64_t& num_extents;
3090 uint64_t& num_blobs;
3091 uint64_t& num_sharded_objects;
3092 uint64_t& num_spanning_blobs;
3093
3094 mempool_dynamic_bitset* used_blocks;
3095 uint64_t_btree_t* used_omap_head;
3096 uint64_t_btree_t* used_per_pool_omap_head;
3097 uint64_t_btree_t* used_pgmeta_omap_head;
3098
3099 ceph::mutex* sb_info_lock;
3100 sb_info_map_t& sb_info;
3101
3102 store_statfs_t& expected_store_statfs;
3103 per_pool_statfs& expected_pool_statfs;
3104 BlueStoreRepairer* repairer;
3105
3106 FSCK_ObjectCtx(int64_t& e,
3107 int64_t& w,
3108 uint64_t& _num_objects,
3109 uint64_t& _num_extents,
3110 uint64_t& _num_blobs,
3111 uint64_t& _num_sharded_objects,
3112 uint64_t& _num_spanning_blobs,
3113 mempool_dynamic_bitset* _ub,
3114 uint64_t_btree_t* _used_omap_head,
3115 uint64_t_btree_t* _used_per_pool_omap_head,
3116 uint64_t_btree_t* _used_pgmeta_omap_head,
3117 ceph::mutex* _sb_info_lock,
3118 sb_info_map_t& _sb_info,
3119 store_statfs_t& _store_statfs,
3120 per_pool_statfs& _pool_statfs,
3121 BlueStoreRepairer* _repairer) :
3122 errors(e),
3123 warnings(w),
3124 num_objects(_num_objects),
3125 num_extents(_num_extents),
3126 num_blobs(_num_blobs),
3127 num_sharded_objects(_num_sharded_objects),
3128 num_spanning_blobs(_num_spanning_blobs),
3129 used_blocks(_ub),
3130 used_omap_head(_used_omap_head),
3131 used_per_pool_omap_head(_used_per_pool_omap_head),
3132 used_pgmeta_omap_head(_used_pgmeta_omap_head),
3133 sb_info_lock(_sb_info_lock),
3134 sb_info(_sb_info),
3135 expected_store_statfs(_store_statfs),
3136 expected_pool_statfs(_pool_statfs),
3137 repairer(_repairer) {
3138 }
3139 };
3140
3141 OnodeRef fsck_check_objects_shallow(
3142 FSCKDepth depth,
3143 int64_t pool_id,
3144 CollectionRef c,
3145 const ghobject_t& oid,
3146 const string& key,
3147 const bufferlist& value,
3148 mempool::bluestore_fsck::list<string>& expecting_shards,
3149 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
3150 const BlueStore::FSCK_ObjectCtx& ctx);
3151
3152private:
3153 void _fsck_check_objects(FSCKDepth depth,
3154 FSCK_ObjectCtx& ctx);
7c673cae
FG
3155};
3156
11fdf7f2
TL
3157inline ostream& operator<<(ostream& out, const BlueStore::volatile_statfs& s) {
3158 return out
3159 << " allocated:"
3160 << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
3161 << " stored:"
3162 << s.values[BlueStore::volatile_statfs::STATFS_STORED]
3163 << " compressed:"
3164 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
3165 << " compressed_orig:"
3166 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
3167 << " compressed_alloc:"
3168 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
7c673cae
FG
3169}
3170
3171static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
3172 o->get();
3173}
3174static inline void intrusive_ptr_release(BlueStore::Onode *o) {
3175 o->put();
3176}
3177
3178static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
3179 o->get();
3180}
3181static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
3182 o->put();
3183}
3184
11fdf7f2
TL
3185class BlueStoreRepairer
3186{
3187public:
3188 // to simplify future potential migration to mempools
3189 using fsck_interval = interval_set<uint64_t>;
3190
3191 // Structure to track what pextents are used for specific cid/oid.
3192 // Similar to Bloom filter positive and false-positive matches are
3193 // possible only.
3194 // Maintains two lists of bloom filters for both cids and oids
3195 // where each list entry is a BF for specific disk pextent
3196 // The length of the extent per filter is measured on init.
3197 // Allows to filter out 'uninteresting' pextents to speadup subsequent
3198 // 'is_used' access.
3199 struct StoreSpaceTracker {
3200 const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
3201 const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
3202 const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrary selected
3203 static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
3204
3205 typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
3206 bloom_vector collections_bfs;
3207 bloom_vector objects_bfs;
3208
3209 bool was_filtered_out = false;
3210 uint64_t granularity = 0; // extent length for a single filter
3211
3212 StoreSpaceTracker() {
3213 }
3214 StoreSpaceTracker(const StoreSpaceTracker& from) :
3215 collections_bfs(from.collections_bfs),
3216 objects_bfs(from.objects_bfs),
3217 granularity(from.granularity) {
3218 }
3219
3220 void init(uint64_t total,
3221 uint64_t min_alloc_size,
3222 uint64_t mem_cap = DEF_MEM_CAP) {
3223 ceph_assert(!granularity); // not initialized yet
3224 ceph_assert(min_alloc_size && isp2(min_alloc_size));
3225 ceph_assert(mem_cap);
3226
3227 total = round_up_to(total, min_alloc_size);
3228 granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
3229
3230 if (!granularity) {
3231 granularity = min_alloc_size;
3232 } else {
3233 granularity = round_up_to(granularity, min_alloc_size);
3234 }
3235
3236 uint64_t entries = round_up_to(total, granularity) / granularity;
3237 collections_bfs.resize(entries,
3238 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3239 BLOOM_FILTER_TABLE_SIZE,
3240 0,
3241 BLOOM_FILTER_EXPECTED_COUNT));
3242 objects_bfs.resize(entries,
3243 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3244 BLOOM_FILTER_TABLE_SIZE,
3245 0,
3246 BLOOM_FILTER_EXPECTED_COUNT));
3247 }
3248 inline uint32_t get_hash(const coll_t& cid) const {
3249 return cid.hash_to_shard(1);
3250 }
3251 inline void set_used(uint64_t offset, uint64_t len,
3252 const coll_t& cid, const ghobject_t& oid) {
3253 ceph_assert(granularity); // initialized
3254
3255 // can't call this func after filter_out has been applied
3256 ceph_assert(!was_filtered_out);
3257 if (!len) {
3258 return;
3259 }
3260 auto pos = offset / granularity;
3261 auto end_pos = (offset + len - 1) / granularity;
3262 while (pos <= end_pos) {
3263 collections_bfs[pos].insert(get_hash(cid));
3264 objects_bfs[pos].insert(oid.hobj.get_hash());
3265 ++pos;
3266 }
3267 }
3268 // filter-out entries unrelated to the specified(broken) extents.
3269 // 'is_used' calls are permitted after that only
3270 size_t filter_out(const fsck_interval& extents);
3271
3272 // determines if collection's present after filtering-out
3273 inline bool is_used(const coll_t& cid) const {
3274 ceph_assert(was_filtered_out);
3275 for(auto& bf : collections_bfs) {
3276 if (bf.contains(get_hash(cid))) {
3277 return true;
3278 }
3279 }
3280 return false;
3281 }
3282 // determines if object's present after filtering-out
3283 inline bool is_used(const ghobject_t& oid) const {
3284 ceph_assert(was_filtered_out);
3285 for(auto& bf : objects_bfs) {
3286 if (bf.contains(oid.hobj.get_hash())) {
3287 return true;
3288 }
3289 }
3290 return false;
3291 }
3292 // determines if collection's present before filtering-out
3293 inline bool is_used(const coll_t& cid, uint64_t offs) const {
3294 ceph_assert(granularity); // initialized
3295 ceph_assert(!was_filtered_out);
3296 auto &bf = collections_bfs[offs / granularity];
3297 if (bf.contains(get_hash(cid))) {
3298 return true;
3299 }
3300 return false;
3301 }
3302 // determines if object's present before filtering-out
3303 inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
3304 ceph_assert(granularity); // initialized
3305 ceph_assert(!was_filtered_out);
3306 auto &bf = objects_bfs[offs / granularity];
3307 if (bf.contains(oid.hobj.get_hash())) {
3308 return true;
3309 }
3310 return false;
3311 }
3312 };
3313public:
3314
3315 bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
3316 bool fix_shared_blob(KeyValueDB *db,
3317 uint64_t sbid,
3318 const bufferlist* bl);
3319 bool fix_statfs(KeyValueDB *db, const string& key,
3320 const store_statfs_t& new_statfs);
3321
3322 bool fix_leaked(KeyValueDB *db,
3323 FreelistManager* fm,
3324 uint64_t offset, uint64_t len);
3325 bool fix_false_free(KeyValueDB *db,
3326 FreelistManager* fm,
3327 uint64_t offset, uint64_t len);
3328 bool fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag);
3329
3330 void init(uint64_t total_space, uint64_t lres_tracking_unit_size);
3331
3332 bool preprocess_misreference(KeyValueDB *db);
3333
3334 unsigned apply(KeyValueDB* db);
3335
3336 void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
3337 misreferenced_extents.union_insert(offs, len);
3338 if (inc_error) {
3339 ++to_repair_cnt;
3340 }
3341 }
eafe8130
TL
3342 void inc_repaired() {
3343 ++to_repair_cnt;
3344 }
11fdf7f2
TL
3345
3346 StoreSpaceTracker& get_space_usage_tracker() {
3347 return space_usage_tracker;
3348 }
3349 const fsck_interval& get_misreferences() const {
3350 return misreferenced_extents;
3351 }
3352 KeyValueDB::Transaction get_fix_misreferences_txn() {
3353 return fix_misreferences_txn;
3354 }
3355
3356private:
3357 unsigned to_repair_cnt = 0;
3358 KeyValueDB::Transaction fix_fm_leaked_txn;
3359 KeyValueDB::Transaction fix_fm_false_free_txn;
3360 KeyValueDB::Transaction remove_key_txn;
3361 KeyValueDB::Transaction fix_statfs_txn;
3362 KeyValueDB::Transaction fix_shared_blob_txn;
3363
3364 KeyValueDB::Transaction fix_misreferences_txn;
3365
3366 StoreSpaceTracker space_usage_tracker;
3367
3368 // non-shared extents with multiple references
3369 fsck_interval misreferenced_extents;
3370
3371};
7c673cae 3372#endif