// ceph/src/os/bluestore/BlueStore.h (ceph nautilus 14.2.2)
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef CEPH_OSD_BLUESTORE_H
16#define CEPH_OSD_BLUESTORE_H
17
18#include "acconfig.h"
19
20#include <unistd.h>
21
22#include <atomic>
23#include <mutex>
24#include <condition_variable>
25
26#include <boost/intrusive/list.hpp>
27#include <boost/intrusive/unordered_set.hpp>
28#include <boost/intrusive/set.hpp>
29#include <boost/functional/hash.hpp>
30#include <boost/dynamic_bitset.hpp>
31
32#include "include/ceph_assert.h"
33#include "include/unordered_map.h"
34#include "include/mempool.h"
35#include "common/bloom_filter.hpp"
36#include "common/Finisher.h"
37#include "common/Throttle.h"
38#include "common/perf_counters.h"
39#include "common/PriorityCache.h"
40#include "compressor/Compressor.h"
41#include "os/ObjectStore.h"
42
43#include "bluestore_types.h"
44#include "BlockDevice.h"
45#include "BlueFS.h"
46#include "common/EventTrace.h"
47
48class Allocator;
49class FreelistManager;
50class BlueStoreRepairer;
51
52//#define DEBUG_CACHE
53//#define DEBUG_DEFERRED
54
55
56
57// constants for Buffer::optimize()
58#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N
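// For example (illustrative): a cached 4096-byte Buffer whose leading
// bufferptr reports wasted() == 1024 exceeds 4096 / MAX_BUFFER_SLOP_RATIO_DEN
// (= 512), so Buffer::maybe_rebuild() below rebuild()s the bufferlist into a
// single tightly sized allocation; the same happens when the data is spread
// over more than one bufferptr.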
59
60
61enum {
62 l_bluestore_first = 732430,
63 l_bluestore_kv_flush_lat,
64 l_bluestore_kv_commit_lat,
65 l_bluestore_kv_sync_lat,
66 l_bluestore_kv_final_lat,
67 l_bluestore_state_prepare_lat,
68 l_bluestore_state_aio_wait_lat,
69 l_bluestore_state_io_done_lat,
70 l_bluestore_state_kv_queued_lat,
71 l_bluestore_state_kv_committing_lat,
72 l_bluestore_state_kv_done_lat,
73 l_bluestore_state_deferred_queued_lat,
74 l_bluestore_state_deferred_aio_wait_lat,
75 l_bluestore_state_deferred_cleanup_lat,
76 l_bluestore_state_finishing_lat,
77 l_bluestore_state_done_lat,
78 l_bluestore_throttle_lat,
79 l_bluestore_submit_lat,
80 l_bluestore_commit_lat,
81 l_bluestore_read_lat,
82 l_bluestore_read_onode_meta_lat,
83 l_bluestore_read_wait_aio_lat,
84 l_bluestore_compress_lat,
85 l_bluestore_decompress_lat,
86 l_bluestore_csum_lat,
87 l_bluestore_compress_success_count,
88 l_bluestore_compress_rejected_count,
89 l_bluestore_write_pad_bytes,
90 l_bluestore_deferred_write_ops,
91 l_bluestore_deferred_write_bytes,
92 l_bluestore_write_penalty_read_ops,
93 l_bluestore_allocated,
94 l_bluestore_stored,
95 l_bluestore_compressed,
96 l_bluestore_compressed_allocated,
97 l_bluestore_compressed_original,
98 l_bluestore_onodes,
99 l_bluestore_onode_hits,
100 l_bluestore_onode_misses,
101 l_bluestore_onode_shard_hits,
102 l_bluestore_onode_shard_misses,
103 l_bluestore_extents,
104 l_bluestore_blobs,
105 l_bluestore_buffers,
106 l_bluestore_buffer_bytes,
107 l_bluestore_buffer_hit_bytes,
108 l_bluestore_buffer_miss_bytes,
109 l_bluestore_write_big,
110 l_bluestore_write_big_bytes,
111 l_bluestore_write_big_blobs,
112 l_bluestore_write_small,
113 l_bluestore_write_small_bytes,
114 l_bluestore_write_small_unused,
115 l_bluestore_write_small_deferred,
116 l_bluestore_write_small_pre_read,
117 l_bluestore_write_small_new,
118 l_bluestore_txc,
119 l_bluestore_onode_reshard,
120 l_bluestore_blob_split,
121 l_bluestore_extent_compress,
122 l_bluestore_gc_merged,
123 l_bluestore_read_eio,
124 l_bluestore_reads_with_retries,
125 l_bluestore_fragmentation,
126 l_bluestore_omap_seek_to_first_lat,
127 l_bluestore_omap_upper_bound_lat,
128 l_bluestore_omap_lower_bound_lat,
129 l_bluestore_omap_next_lat,
130 l_bluestore_last
131};
132
133#define META_POOL_ID ((uint64_t)-1ull)
134
135class BlueStore : public ObjectStore,
136 public BlueFSDeviceExpander,
137 public md_config_obs_t {
138 // -----------------------------------------------------
139 // types
140public:
141 // config observer
142 const char** get_tracked_conf_keys() const override;
143 void handle_conf_change(const ConfigProxy& conf,
144 const std::set<std::string> &changed) override;
145
146 //handler for discard event
147 void handle_discard(interval_set<uint64_t>& to_release);
148
149 void _set_csum();
150 void _set_compression();
151 void _set_throttle_params();
152 int _set_cache_sizes();
153
154 class TransContext;
155
156 typedef map<uint64_t, bufferlist> ready_regions_t;
157
158 struct BufferSpace;
159 struct Collection;
160 typedef boost::intrusive_ptr<Collection> CollectionRef;
161
162 struct AioContext {
163 virtual void aio_finish(BlueStore *store) = 0;
164 virtual ~AioContext() {}
165 };
166
167 /// cached buffer
168 struct Buffer {
169 MEMPOOL_CLASS_HELPERS();
170
171 enum {
172 STATE_EMPTY, ///< empty buffer -- used for cache history
173 STATE_CLEAN, ///< clean data that is up to date
174 STATE_WRITING, ///< data that is being written (io not yet complete)
175 };
176 static const char *get_state_name(int s) {
177 switch (s) {
178 case STATE_EMPTY: return "empty";
179 case STATE_CLEAN: return "clean";
180 case STATE_WRITING: return "writing";
181 default: return "???";
182 }
183 }
184 enum {
185 FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN)
186 // NOTE: fix operator<< when you define a second flag
187 };
188 static const char *get_flag_name(int s) {
189 switch (s) {
190 case FLAG_NOCACHE: return "nocache";
191 default: return "???";
192 }
193 }
194
195 BufferSpace *space;
196 uint16_t state; ///< STATE_*
197 uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
198 uint32_t flags; ///< FLAG_*
199 uint64_t seq;
200 uint32_t offset, length;
201 bufferlist data;
202
203 boost::intrusive::list_member_hook<> lru_item;
204 boost::intrusive::list_member_hook<> state_item;
205
206 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
207 unsigned f = 0)
208 : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
209 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
210 unsigned f = 0)
211 : space(space), state(s), flags(f), seq(q), offset(o),
212 length(b.length()), data(b) {}
213
214 bool is_empty() const {
215 return state == STATE_EMPTY;
216 }
217 bool is_clean() const {
218 return state == STATE_CLEAN;
219 }
220 bool is_writing() const {
221 return state == STATE_WRITING;
222 }
223
224 uint32_t end() const {
225 return offset + length;
226 }
227
228 void truncate(uint32_t newlen) {
229 ceph_assert(newlen < length);
230 if (data.length()) {
231 bufferlist t;
232 t.substr_of(data, 0, newlen);
233 data.claim(t);
234 }
235 length = newlen;
236 }
237 void maybe_rebuild() {
238 if (data.length() &&
239 (data.get_num_buffers() > 1 ||
240 data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
241 data.rebuild();
242 }
243 }
244
245 void dump(Formatter *f) const {
246 f->dump_string("state", get_state_name(state));
247 f->dump_unsigned("seq", seq);
248 f->dump_unsigned("offset", offset);
249 f->dump_unsigned("length", length);
250 f->dump_unsigned("data_length", data.length());
251 }
252 };
253
254 struct Cache;
255
256 /// map logical extent range (object) onto buffers
257 struct BufferSpace {
258 enum {
259 BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache
260 };
261
262 typedef boost::intrusive::list<
263 Buffer,
264 boost::intrusive::member_hook<
265 Buffer,
266 boost::intrusive::list_member_hook<>,
267 &Buffer::state_item> > state_list_t;
268
269 mempool::bluestore_cache_other::map<uint32_t, std::unique_ptr<Buffer>>
270 buffer_map;
271
272 // we use a bare intrusive list here instead of std::map because
273 // it uses less memory and we expect this to be very small (very
274 // few IOs in flight to the same Blob at the same time).
275 state_list_t writing; ///< writing buffers, sorted by seq, ascending
276
277 ~BufferSpace() {
278 ceph_assert(buffer_map.empty());
279 ceph_assert(writing.empty());
280 }
281
282 void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
283 cache->_audit("_add_buffer start");
284 buffer_map[b->offset].reset(b);
285 if (b->is_writing()) {
286 b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
287 if (writing.empty() || writing.rbegin()->seq <= b->seq) {
288 writing.push_back(*b);
289 } else {
290 auto it = writing.begin();
291 while (it->seq < b->seq) {
292 ++it;
293 }
294
295 ceph_assert(it->seq >= b->seq);
296 // note that this will insert b before it
297 // hence the order is maintained
298 writing.insert(it, *b);
299 }
300 } else {
301 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
302 cache->_add_buffer(b, level, near);
303 }
304 cache->_audit("_add_buffer end");
305 }
306 void _rm_buffer(Cache* cache, Buffer *b) {
307 _rm_buffer(cache, buffer_map.find(b->offset));
308 }
309 void _rm_buffer(Cache* cache,
310 map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
311 ceph_assert(p != buffer_map.end());
312 cache->_audit("_rm_buffer start");
313 if (p->second->is_writing()) {
314 writing.erase(writing.iterator_to(*p->second));
315 } else {
316 cache->_rm_buffer(p->second.get());
317 }
318 buffer_map.erase(p);
319 cache->_audit("_rm_buffer end");
320 }
321
322 map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
323 uint32_t offset) {
324 auto i = buffer_map.lower_bound(offset);
325 if (i != buffer_map.begin()) {
326 --i;
327 if (i->first + i->second->length <= offset)
328 ++i;
329 }
330 return i;
331 }
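    // Illustrative behaviour (not an interface guarantee): with cached
    // buffers at offsets {0x0000 (len 0x1000), 0x2000 (len 0x1000)},
    // _data_lower_bound(0x0800) yields the 0x0000 buffer (it still overlaps
    // 0x0800), while _data_lower_bound(0x1800) steps past it
    // (0x0000 + 0x1000 <= 0x1800) and yields the 0x2000 buffer.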
332
333 // must be called under protection of the Cache lock
334 void _clear(Cache* cache);
335
336 // return value is the highest cache_private of a trimmed buffer, or 0.
337 int discard(Cache* cache, uint32_t offset, uint32_t length) {
338 std::lock_guard l(cache->lock);
339 return _discard(cache, offset, length);
340 }
341 int _discard(Cache* cache, uint32_t offset, uint32_t length);
342
343 void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
344 unsigned flags) {
345 std::lock_guard l(cache->lock);
346 Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
347 flags);
348 b->cache_private = _discard(cache, offset, bl.length());
349 _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
350 }
351 void _finish_write(Cache* cache, uint64_t seq);
352 void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
353 std::lock_guard l(cache->lock);
354 Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
355 b->cache_private = _discard(cache, offset, bl.length());
356 _add_buffer(cache, b, 1, nullptr);
357 }
358
359 void read(Cache* cache, uint32_t offset, uint32_t length,
360 BlueStore::ready_regions_t& res,
361 interval_set<uint32_t>& res_intervals,
362 int flags = 0);
363
364 void truncate(Cache* cache, uint32_t offset) {
365 discard(cache, offset, (uint32_t)-1 - offset);
366 }
367
368 void split(Cache* cache, size_t pos, BufferSpace &r);
369
370 void dump(Cache* cache, Formatter *f) const {
371 std::lock_guard l(cache->lock);
372 f->open_array_section("buffers");
373 for (auto& i : buffer_map) {
374 f->open_object_section("buffer");
375 ceph_assert(i.first == i.second->offset);
376 i.second->dump(f);
377 f->close_section();
378 }
379 f->close_section();
380 }
381 };
382
383 struct SharedBlobSet;
384
385 /// in-memory shared blob state (incl cached buffers)
386 struct SharedBlob {
387 MEMPOOL_CLASS_HELPERS();
388
389 std::atomic_int nref = {0}; ///< reference count
390 bool loaded = false;
391
392 CollectionRef coll;
393 union {
394 uint64_t sbid_unloaded; ///< sbid if persistent isn't loaded
395 bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any
396 };
397 BufferSpace bc; ///< buffer cache
398
399 SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
400 if (get_cache()) {
401 get_cache()->add_blob();
402 }
403 }
404 SharedBlob(uint64_t i, Collection *_coll);
405 ~SharedBlob();
406
407 uint64_t get_sbid() const {
408 return loaded ? persistent->sbid : sbid_unloaded;
409 }
410
411 friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
412 friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }
413
414 friend ostream& operator<<(ostream& out, const SharedBlob& sb);
415
416 void get() {
417 ++nref;
418 }
419 void put();
420
421 /// get logical references
422 void get_ref(uint64_t offset, uint32_t length);
423
424 /// put logical references, and get back any released extents
425 void put_ref(uint64_t offset, uint32_t length,
426 PExtentVector *r, bool *unshare);
427
428 void finish_write(uint64_t seq);
429
430 friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
431 return l.get_sbid() == r.get_sbid();
432 }
433 inline Cache* get_cache() {
434 return coll ? coll->cache : nullptr;
435 }
436 inline SharedBlobSet* get_parent() {
437 return coll ? &(coll->shared_blob_set) : nullptr;
438 }
439 inline bool is_loaded() const {
440 return loaded;
441 }
442
443 };
444 typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
445
446 /// a lookup table of SharedBlobs
447 struct SharedBlobSet {
448 /// protect lookup, insertion, removal
449 ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");
450
451 // we use a bare pointer because we don't want to affect the ref
452 // count
453 mempool::bluestore_cache_other::unordered_map<uint64_t,SharedBlob*> sb_map;
454
455 SharedBlobRef lookup(uint64_t sbid) {
456 std::lock_guard l(lock);
457 auto p = sb_map.find(sbid);
458 if (p == sb_map.end() ||
459 p->second->nref == 0) {
460 return nullptr;
461 }
462 return p->second;
463 }
464
465 void add(Collection* coll, SharedBlob *sb) {
466 std::lock_guard l(lock);
467 sb_map[sb->get_sbid()] = sb;
468 sb->coll = coll;
469 }
470
471 bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
472 std::lock_guard l(lock);
473 ceph_assert(sb->get_parent() == this);
474 if (verify_nref_is_zero && sb->nref != 0) {
475 return false;
476 }
477 // only remove if it still points to us
478 auto p = sb_map.find(sb->get_sbid());
479 if (p != sb_map.end() &&
480 p->second == sb) {
481 sb_map.erase(p);
482 }
483 return true;
484 }
485
486 bool empty() {
487 std::lock_guard l(lock);
488 return sb_map.empty();
489 }
490
491 template <int LogLevelV>
492 void dump(CephContext *cct);
493 };
494
495//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
496
497 /// in-memory blob metadata and associated cached buffers (if any)
498 struct Blob {
499 MEMPOOL_CLASS_HELPERS();
500
501 std::atomic_int nref = {0}; ///< reference count
502 int16_t id = -1; ///< id, for spanning blobs only, >= 0
503 int16_t last_encoded_id = -1; ///< (ephemeral) used during encoding only
504 SharedBlobRef shared_blob; ///< shared blob state (if any)
505
506 private:
507 mutable bluestore_blob_t blob; ///< decoded blob metadata
508#ifdef CACHE_BLOB_BL
509 mutable bufferlist blob_bl; ///< cached encoded blob, blob is dirty if empty
510#endif
511 /// refs from this shard. ephemeral if id<0, persisted if spanning.
512 bluestore_blob_use_tracker_t used_in_blob;
513
514 public:
515
516 friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
517 friend void intrusive_ptr_release(Blob *b) { b->put(); }
518
519 friend ostream& operator<<(ostream& out, const Blob &b);
520
521 const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
522 return used_in_blob;
523 }
524 bool is_referenced() const {
525 return used_in_blob.is_not_empty();
526 }
527 uint32_t get_referenced_bytes() const {
528 return used_in_blob.get_referenced_bytes();
529 }
530
531 bool is_spanning() const {
532 return id >= 0;
533 }
534
535 bool can_split() const {
536 std::lock_guard l(shared_blob->get_cache()->lock);
537 // splitting a BufferSpace writing list is too hard; don't try.
538 return shared_blob->bc.writing.empty() &&
539 used_in_blob.can_split() &&
540 get_blob().can_split();
541 }
542
543 bool can_split_at(uint32_t blob_offset) const {
544 return used_in_blob.can_split_at(blob_offset) &&
545 get_blob().can_split_at(blob_offset);
546 }
547
548 bool can_reuse_blob(uint32_t min_alloc_size,
549 uint32_t target_blob_size,
550 uint32_t b_offset,
551 uint32_t *length0);
552
553 void dup(Blob& o) {
554 o.shared_blob = shared_blob;
555 o.blob = blob;
556#ifdef CACHE_BLOB_BL
557 o.blob_bl = blob_bl;
558#endif
559 }
560
561 inline const bluestore_blob_t& get_blob() const {
562 return blob;
563 }
564 inline bluestore_blob_t& dirty_blob() {
565#ifdef CACHE_BLOB_BL
566 blob_bl.clear();
567#endif
568 return blob;
569 }
570
571 /// discard buffers for unallocated regions
572 void discard_unallocated(Collection *coll);
573
574 /// get logical references
575 void get_ref(Collection *coll, uint32_t offset, uint32_t length);
576 /// put logical references, and get back any released extents
577 bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
578 PExtentVector *r);
579
580 /// split the blob
581 void split(Collection *coll, uint32_t blob_offset, Blob *o);
582
583 void get() {
584 ++nref;
585 }
586 void put() {
587 if (--nref == 0)
588 delete this;
589 }
590
591
592#ifdef CACHE_BLOB_BL
593 void _encode() const {
594 if (blob_bl.length() == 0 ) {
595 encode(blob, blob_bl);
596 } else {
597 ceph_assert(blob_bl.length());
598 }
599 }
600 void bound_encode(
601 size_t& p,
602 bool include_ref_map) const {
603 _encode();
604 p += blob_bl.length();
605 if (include_ref_map) {
606 used_in_blob.bound_encode(p);
607 }
608 }
609 void encode(
610 bufferlist::contiguous_appender& p,
611 bool include_ref_map) const {
612 _encode();
613 p.append(blob_bl);
614 if (include_ref_map) {
615 used_in_blob.encode(p);
616 }
617 }
618 void decode(
619 Collection */*coll*/,
620 bufferptr::const_iterator& p,
621 bool include_ref_map) {
622 const char *start = p.get_pos();
623 denc(blob, p);
624 const char *end = p.get_pos();
625 blob_bl.clear();
626 blob_bl.append(start, end - start);
627 if (include_ref_map) {
628 used_in_blob.decode(p);
629 }
630 }
631#else
632 void bound_encode(
633 size_t& p,
634 uint64_t struct_v,
635 uint64_t sbid,
636 bool include_ref_map) const {
637 denc(blob, p, struct_v);
638 if (blob.is_shared()) {
639 denc(sbid, p);
640 }
641 if (include_ref_map) {
642 used_in_blob.bound_encode(p);
643 }
644 }
645 void encode(
646 bufferlist::contiguous_appender& p,
647 uint64_t struct_v,
648 uint64_t sbid,
649 bool include_ref_map) const {
650 denc(blob, p, struct_v);
651 if (blob.is_shared()) {
652 denc(sbid, p);
653 }
654 if (include_ref_map) {
655 used_in_blob.encode(p);
656 }
657 }
658 void decode(
659 Collection *coll,
660 bufferptr::const_iterator& p,
661 uint64_t struct_v,
662 uint64_t* sbid,
663 bool include_ref_map);
664#endif
665 };
666 typedef boost::intrusive_ptr<Blob> BlobRef;
667 typedef mempool::bluestore_cache_other::map<int,BlobRef> blob_map_t;
668
669 /// a logical extent, pointing to (some portion of) a blob
670 typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
671 struct Extent : public ExtentBase {
672 MEMPOOL_CLASS_HELPERS();
673
674 uint32_t logical_offset = 0; ///< logical offset
675 uint32_t blob_offset = 0; ///< blob offset
676 uint32_t length = 0; ///< length
677 BlobRef blob; ///< the blob with our data
678
679 /// ctor for lookup only
680 explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
681 /// ctor for delayed initialization (see decode_some())
682 explicit Extent() : ExtentBase() {
683 }
684 /// ctor for general usage
685 Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
686 : ExtentBase(),
687 logical_offset(lo), blob_offset(o), length(l) {
688 assign_blob(b);
689 }
690 ~Extent() {
691 if (blob) {
692 blob->shared_blob->get_cache()->rm_extent();
693 }
694 }
695
696 void assign_blob(const BlobRef& b) {
697 ceph_assert(!blob);
698 blob = b;
699 blob->shared_blob->get_cache()->add_extent();
700 }
701
702 // comparators for intrusive_set
703 friend bool operator<(const Extent &a, const Extent &b) {
704 return a.logical_offset < b.logical_offset;
705 }
706 friend bool operator>(const Extent &a, const Extent &b) {
707 return a.logical_offset > b.logical_offset;
708 }
709 friend bool operator==(const Extent &a, const Extent &b) {
710 return a.logical_offset == b.logical_offset;
711 }
712
713 uint32_t blob_start() const {
714 return logical_offset - blob_offset;
715 }
716
717 uint32_t blob_end() const {
718 return blob_start() + blob->get_blob().get_logical_length();
719 }
720
721 uint32_t logical_end() const {
722 return logical_offset + length;
723 }
724
725 // return true if any piece of the blob is out of
726 // the given range [o, o + l].
727 bool blob_escapes_range(uint32_t o, uint32_t l) const {
728 return blob_start() < o || blob_end() > o + l;
729 }
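    // e.g. (illustrative) for an extent whose blob spans [0x2000, 0x4000):
    //   blob_escapes_range(0x2000, 0x2000) == false (blob fully inside),
    //   blob_escapes_range(0x2800, 0x1000) == true (the blob sticks out past
    //   both ends of [0x2800, 0x3800)).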
730 };
731 typedef boost::intrusive::set<Extent> extent_map_t;
732
733
734 friend ostream& operator<<(ostream& out, const Extent& e);
735
736 struct OldExtent {
737 boost::intrusive::list_member_hook<> old_extent_item;
738 Extent e;
739 PExtentVector r;
740 bool blob_empty; // flag to track the last removed extent that makes blob
741 // empty - required to update compression stat properly
742 OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
743 : e(lo, o, l, b), blob_empty(false) {
744 }
745 static OldExtent* create(CollectionRef c,
746 uint32_t lo,
747 uint32_t o,
748 uint32_t l,
749 BlobRef& b);
750 };
751 typedef boost::intrusive::list<
752 OldExtent,
753 boost::intrusive::member_hook<
754 OldExtent,
755 boost::intrusive::list_member_hook<>,
756 &OldExtent::old_extent_item> > old_extent_map_t;
757
758 struct Onode;
759
760 /// a sharded extent map, mapping offsets to lextents to blobs
761 struct ExtentMap {
762 Onode *onode;
763 extent_map_t extent_map; ///< map of Extents to Blobs
764 blob_map_t spanning_blob_map; ///< blobs that span shards
765 typedef boost::intrusive_ptr<Onode> OnodeRef;
766
767 struct Shard {
768 bluestore_onode_t::shard_info *shard_info = nullptr;
769 unsigned extents = 0; ///< count extents in this shard
770 bool loaded = false; ///< true if shard is loaded
771 bool dirty = false; ///< true if shard is dirty and needs reencoding
772 };
773 mempool::bluestore_cache_other::vector<Shard> shards; ///< shards
774
775 bufferlist inline_bl; ///< cached encoded map, if unsharded; empty=>dirty
776
777 uint32_t needs_reshard_begin = 0;
778 uint32_t needs_reshard_end = 0;
779
780 void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
781 uint64_t&, uint64_t&, uint64_t&);
782
783 bool needs_reshard() const {
784 return needs_reshard_end > needs_reshard_begin;
785 }
786 void clear_needs_reshard() {
787 needs_reshard_begin = needs_reshard_end = 0;
788 }
789 void request_reshard(uint32_t begin, uint32_t end) {
790 if (begin < needs_reshard_begin) {
791 needs_reshard_begin = begin;
792 }
793 if (end > needs_reshard_end) {
794 needs_reshard_end = end;
795 }
796 }
797
798 struct DeleteDisposer {
799 void operator()(Extent *e) { delete e; }
800 };
801
802 ExtentMap(Onode *o);
803 ~ExtentMap() {
804 extent_map.clear_and_dispose(DeleteDisposer());
805 }
806
807 void clear() {
808 extent_map.clear_and_dispose(DeleteDisposer());
809 shards.clear();
810 inline_bl.clear();
811 clear_needs_reshard();
812 }
813
814 bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
815 unsigned *pn);
816 unsigned decode_some(bufferlist& bl);
817
818 void bound_encode_spanning_blobs(size_t& p);
819 void encode_spanning_blobs(bufferlist::contiguous_appender& p);
820 void decode_spanning_blobs(bufferptr::const_iterator& p);
821
822 BlobRef get_spanning_blob(int id) {
823 auto p = spanning_blob_map.find(id);
824 ceph_assert(p != spanning_blob_map.end());
825 return p->second;
826 }
827
828 void update(KeyValueDB::Transaction t, bool force);
829 decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
830 void reshard(
831 KeyValueDB *db,
832 KeyValueDB::Transaction t);
833
834 /// initialize Shards from the onode
835 void init_shards(bool loaded, bool dirty);
836
837 /// return index of shard containing offset
838 /// or -1 if not found
839 int seek_shard(uint32_t offset) {
840 size_t end = shards.size();
841 size_t mid, left = 0;
842 size_t right = end; // one past the right end
843
844 while (left < right) {
845 mid = left + (right - left) / 2;
846 if (offset >= shards[mid].shard_info->offset) {
847 size_t next = mid + 1;
848 if (next >= end || offset < shards[next].shard_info->offset)
849 return mid;
850 //continue to search forwards
851 left = next;
852 } else {
853 //continue to search backwards
854 right = mid;
855 }
856 }
857
858 return -1; // not found
859 }
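    // Worked example with a hypothetical shard layout of shard_info offsets
    // {0x0000, 0x8000, 0x10000}:
    //   seek_shard(0x0000) == 0, seek_shard(0x7fff) == 0,
    //   seek_shard(0x8000) == 1, seek_shard(0x15000) == 2;
    // only an offset below shards[0].shard_info->offset yields -1.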
860
861 /// check if a range spans a shard
862 bool spans_shard(uint32_t offset, uint32_t length) {
863 if (shards.empty()) {
864 return false;
865 }
866 int s = seek_shard(offset);
867 ceph_assert(s >= 0);
868 if (s == (int)shards.size() - 1) {
869 return false; // last shard
870 }
871 if (offset + length <= shards[s+1].shard_info->offset) {
872 return false;
873 }
874 return true;
875 }
876
877 /// ensure that a range of the map is loaded
878 void fault_range(KeyValueDB *db,
879 uint32_t offset, uint32_t length);
880
881 /// ensure a range of the map is marked dirty
882 void dirty_range(uint32_t offset, uint32_t length);
883
884 /// for seek_lextent test
885 extent_map_t::iterator find(uint64_t offset);
886
887 /// seek to the first lextent including or after offset
888 extent_map_t::iterator seek_lextent(uint64_t offset);
889 extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
890
891 /// add a new Extent
892 void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
893 extent_map.insert(*new Extent(lo, o, l, b));
894 }
895
896 /// remove (and delete) an Extent
897 void rm(extent_map_t::iterator p) {
898 extent_map.erase_and_dispose(p, DeleteDisposer());
899 }
900
901 bool has_any_lextents(uint64_t offset, uint64_t length);
902
903 /// consolidate adjacent lextents in extent_map
904 int compress_extent_map(uint64_t offset, uint64_t length);
905
906 /// punch a logical hole. add lextents to deref to target list.
907 void punch_hole(CollectionRef &c,
908 uint64_t offset, uint64_t length,
909 old_extent_map_t *old_extents);
910
911 /// put new lextent into lextent_map overwriting existing ones if
912 /// any and update references accordingly
913 Extent *set_lextent(CollectionRef &c,
914 uint64_t logical_offset,
915 uint64_t offset, uint64_t length,
916 BlobRef b,
917 old_extent_map_t *old_extents);
918
919 /// split a blob (and referring extents)
920 BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
921 };
922
923 /// Compressed Blob Garbage collector
924 /*
925 The primary idea of the collector is to estimate the difference between the
926 allocation units (AUs) currently occupied by compressed blobs and the new AUs
927 required to store the same data uncompressed.
928 Estimation is performed for protrusive extents within a logical range
929 determined by concatenating the old_extents collection with the specific
930 (current) write request.
931 old_extents is needed to handle blob reference counts properly: old extents
932 still hold blob refs, so we must traverse the collection to determine
933 whether a blob is to be released.
934 Protrusive extents are extents that fit into the blob set in action
935 (ones that are below the logical range from above) but are not removed
936 entirely by the current write.
937 E.g. for
938 extent1 <loffs = 100, boffs = 100, len = 100> ->
939 blob1<compressed, len_on_disk=4096, logical_len=8192>
940 extent2 <loffs = 200, boffs = 200, len = 100> ->
941 blob2<raw, len_on_disk=4096, llen=4096>
942 extent3 <loffs = 300, boffs = 300, len = 100> ->
943 blob1<compressed, len_on_disk=4096, llen=8192>
944 extent4 <loffs = 4096, boffs = 0, len = 100> ->
945 blob3<raw, len_on_disk=4096, llen=4096>
946 write(300~100)
947 Protrusive extents are within the following ranges: <0~300, 400~8192-400>.
948 In this case the existing AUs that might be removed due to GC (i.e. blob1's)
949 occupy 2x4K bytes,
950 and the new AUs expected after GC = 0, since extent1 is to be merged into blob2.
951 Hence we should do a collect.
952 */
953 class GarbageCollector
954 {
955 public:
956 /// return amount of allocation units that might be saved due to GC
957 int64_t estimate(
958 uint64_t offset,
959 uint64_t length,
960 const ExtentMap& extent_map,
961 const old_extent_map_t& old_extents,
962 uint64_t min_alloc_size);
963
964 /// return a collection of extents to perform GC on
965 const vector<bluestore_pextent_t>& get_extents_to_collect() const {
966 return extents_to_collect;
967 }
968 GarbageCollector(CephContext* _cct) : cct(_cct) {}
969
970 private:
971 struct BlobInfo {
972 uint64_t referenced_bytes = 0; ///< amount of bytes referenced in blob
973 int64_t expected_allocations = 0; ///< new alloc units required
974 ///< in case of gc fulfilled
975 bool collect_candidate = false; ///< indicate if blob has any extents
976 ///< eligible for GC.
977 extent_map_t::const_iterator first_lextent; ///< points to the first
978 ///< lextent referring to
979 ///< the blob if any.
980 ///< collect_candidate flag
981 ///< determines the validity
982 extent_map_t::const_iterator last_lextent; ///< points to the last
983 ///< lextent referring to
984 ///< the blob if any.
985
986 BlobInfo(uint64_t ref_bytes) :
987 referenced_bytes(ref_bytes) {
988 }
989 };
990 CephContext* cct;
991 map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
992 ///< copies that are affected by the
993 ///< specific write
994
995 ///< protrusive extents that should be collected if GC takes place
996 vector<bluestore_pextent_t> extents_to_collect;
997
998 boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
999 ///< unit when traversing
1000 ///< protrusive extents.
1001 ///< Other extents mapped to
1002 ///< this AU to be ignored
1003 ///< (except the case where
1004 ///< uncompressed extent follows
1005 ///< compressed one - see below).
1006 BlobInfo* blob_info_counted = nullptr; ///< set if previous allocation unit
1007 ///< caused expected_allocations
1008 ///< counter increment at this blob.
1009 ///< if uncompressed extent follows
1010 ///< a decrement for the
1011 ///< expected_allocations counter
1012 ///< is needed
1013 int64_t expected_allocations = 0; ///< new alloc units required in case
1014 ///< of gc fulfilled
1015 int64_t expected_for_release = 0; ///< alloc units currently used by
1016 ///< compressed blobs that might
1017 ///< be gone after GC
1018 uint64_t gc_start_offset = 0; ///< starting offset for GC
1019 uint64_t gc_end_offset = 0; ///< ending offset for GC
1020
1021 protected:
1022 void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
1023 uint64_t start_offset,
1024 uint64_t end_offset,
1025 uint64_t start_touch_offset,
1026 uint64_t end_touch_offset,
1027 uint64_t min_alloc_size);
1028 };
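  /*
   Usage sketch (illustrative only; the real caller lives in the write path in
   BlueStore.cc and may differ in detail -- 'o', 'wctx', 'offset', 'length'
   and 'min_alloc_size' stand for the caller's onode, write context and
   request parameters):

     GarbageCollector gc(cct);
     int64_t saved = gc.estimate(offset, length, o->extent_map,
                                 wctx->old_extents, min_alloc_size);
     if (saved > 0) {
       for (auto& e : gc.get_extents_to_collect()) {
         // re-read and re-write [e.offset, e.offset + e.length) so the
         // compressed blobs it pins can be released
       }
     }
  */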
1029
1030 struct OnodeSpace;
1031
1032 /// an in-memory object
1033 struct Onode {
1034 MEMPOOL_CLASS_HELPERS();
1035
1036 std::atomic_int nref; ///< reference count
1037 Collection *c;
1038
1039 ghobject_t oid;
1040
1041 /// key under PREFIX_OBJ where we are stored
1042 mempool::bluestore_cache_other::string key;
1043
1044 boost::intrusive::list_member_hook<> lru_item;
1045
1046 bluestore_onode_t onode; ///< metadata stored as value in kv store
1047 bool exists; ///< true if object logically exists
1048
1049 ExtentMap extent_map;
1050
1051 // track txc's that have not been committed to kv store (and whose
1052 // effects cannot be read via the kvdb read methods)
1053 std::atomic<int> flushing_count = {0};
1054 /// protect flush_txns
1055 ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
1056 ceph::condition_variable flush_cond; ///< wait here for uncommitted txns
1057
1058 Onode(Collection *c, const ghobject_t& o,
1059 const mempool::bluestore_cache_other::string& k)
1060 : nref(0),
1061 c(c),
1062 oid(o),
1063 key(k),
1064 exists(false),
1065 extent_map(this) {
1066 }
1067
1068 void flush();
1069 void get() {
1070 ++nref;
1071 }
1072 void put() {
1073 if (--nref == 0)
1074 delete this;
1075 }
1076 };
1077 typedef boost::intrusive_ptr<Onode> OnodeRef;
1078
1079
1080 /// a cache (shard) of onodes and buffers
1081 struct Cache {
1082 CephContext* cct;
1083 PerfCounters *logger;
1084
1085 /// protect lru and other structures
1086 ceph::recursive_mutex lock = {
1087 ceph::make_recursive_mutex("BlueStore::Cache::lock") };
1088
1089 std::atomic<uint64_t> num_extents = {0};
1090 std::atomic<uint64_t> num_blobs = {0};
1091
1092 static Cache *create(CephContext* cct, string type, PerfCounters *logger);
1093
1094 Cache(CephContext* cct) : cct(cct), logger(nullptr) {}
1095 virtual ~Cache() {}
1096
1097 virtual void _add_onode(OnodeRef& o, int level) = 0;
1098 virtual void _rm_onode(OnodeRef& o) = 0;
1099 virtual void _touch_onode(OnodeRef& o) = 0;
1100
1101 virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0;
1102 virtual void _rm_buffer(Buffer *b) = 0;
1103 virtual void _move_buffer(Cache *src, Buffer *b) = 0;
1104 virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
1105 virtual void _touch_buffer(Buffer *b) = 0;
1106
1107 virtual uint64_t _get_num_onodes() = 0;
1108 virtual uint64_t _get_buffer_bytes() = 0;
1109
1110 void add_extent() {
1111 ++num_extents;
1112 }
1113 void rm_extent() {
1114 --num_extents;
1115 }
1116
1117 void add_blob() {
1118 ++num_blobs;
1119 }
1120 void rm_blob() {
1121 --num_blobs;
1122 }
1123
1124 void trim(uint64_t onode_max, uint64_t buffer_max);
1125
1126 void trim_all();
1127
1128 virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;
1129
1130 virtual void add_stats(uint64_t *onodes, uint64_t *extents,
1131 uint64_t *blobs,
1132 uint64_t *buffers,
1133 uint64_t *bytes) = 0;
1134
1135 bool empty() {
1136 std::lock_guard l(lock);
1137 return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
1138 }
1139
1140#ifdef DEBUG_CACHE
1141 virtual void _audit(const char *s) = 0;
1142#else
1143 void _audit(const char *s) { /* no-op */ }
1144#endif
1145 };
1146
1147 /// simple LRU cache for onodes and buffers
1148 struct LRUCache : public Cache {
1149 private:
1150 typedef boost::intrusive::list<
1151 Onode,
1152 boost::intrusive::member_hook<
1153 Onode,
1154 boost::intrusive::list_member_hook<>,
1155 &Onode::lru_item> > onode_lru_list_t;
1156 typedef boost::intrusive::list<
1157 Buffer,
1158 boost::intrusive::member_hook<
1159 Buffer,
1160 boost::intrusive::list_member_hook<>,
1161 &Buffer::lru_item> > buffer_lru_list_t;
1162
1163 onode_lru_list_t onode_lru;
1164
1165 buffer_lru_list_t buffer_lru;
1166 uint64_t buffer_size = 0;
1167
1168 public:
1169 LRUCache(CephContext* cct) : Cache(cct) {}
1170 uint64_t _get_num_onodes() override {
1171 return onode_lru.size();
1172 }
1173 void _add_onode(OnodeRef& o, int level) override {
1174 if (level > 0)
1175 onode_lru.push_front(*o);
1176 else
1177 onode_lru.push_back(*o);
1178 }
1179 void _rm_onode(OnodeRef& o) override {
1180 auto q = onode_lru.iterator_to(*o);
1181 onode_lru.erase(q);
1182 }
1183 void _touch_onode(OnodeRef& o) override;
1184
1185 uint64_t _get_buffer_bytes() override {
1186 return buffer_size;
1187 }
1188 void _add_buffer(Buffer *b, int level, Buffer *near) override {
1189 if (near) {
1190 auto q = buffer_lru.iterator_to(*near);
1191 buffer_lru.insert(q, *b);
1192 } else if (level > 0) {
1193 buffer_lru.push_front(*b);
1194 } else {
1195 buffer_lru.push_back(*b);
1196 }
1197 buffer_size += b->length;
1198 }
1199 void _rm_buffer(Buffer *b) override {
1200 ceph_assert(buffer_size >= b->length);
1201 buffer_size -= b->length;
1202 auto q = buffer_lru.iterator_to(*b);
1203 buffer_lru.erase(q);
1204 }
1205 void _move_buffer(Cache *src, Buffer *b) override {
1206 src->_rm_buffer(b);
1207 _add_buffer(b, 0, nullptr);
1208 }
1209 void _adjust_buffer_size(Buffer *b, int64_t delta) override {
1210 ceph_assert((int64_t)buffer_size + delta >= 0);
1211 buffer_size += delta;
1212 }
1213 void _touch_buffer(Buffer *b) override {
1214 auto p = buffer_lru.iterator_to(*b);
1215 buffer_lru.erase(p);
1216 buffer_lru.push_front(*b);
1217 _audit("_touch_buffer end");
1218 }
1219
1220 void _trim(uint64_t onode_max, uint64_t buffer_max) override;
1221
1222 void add_stats(uint64_t *onodes, uint64_t *extents,
1223 uint64_t *blobs,
1224 uint64_t *buffers,
1225 uint64_t *bytes) override {
1226 std::lock_guard l(lock);
1227 *onodes += onode_lru.size();
1228 *extents += num_extents;
1229 *blobs += num_blobs;
1230 *buffers += buffer_lru.size();
1231 *bytes += buffer_size;
1232 }
1233
1234#ifdef DEBUG_CACHE
1235 void _audit(const char *s) override;
1236#endif
1237 };
1238
1239 // 2Q cache for buffers, LRU for onodes
1240 struct TwoQCache : public Cache {
1241 private:
1242 // stick with LRU for onodes for now (fixme?)
1243 typedef boost::intrusive::list<
1244 Onode,
1245 boost::intrusive::member_hook<
1246 Onode,
1247 boost::intrusive::list_member_hook<>,
1248 &Onode::lru_item> > onode_lru_list_t;
1249 typedef boost::intrusive::list<
1250 Buffer,
1251 boost::intrusive::member_hook<
1252 Buffer,
1253 boost::intrusive::list_member_hook<>,
1254 &Buffer::lru_item> > buffer_list_t;
1255
1256 onode_lru_list_t onode_lru;
1257
1258 buffer_list_t buffer_hot; ///< "Am" hot buffers
1259 buffer_list_t buffer_warm_in; ///< "A1in" newly warm buffers
1260 buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted
1261 uint64_t buffer_bytes = 0; ///< bytes
1262
1263 enum {
1264 BUFFER_NEW = 0,
1265 BUFFER_WARM_IN, ///< in buffer_warm_in
1266 BUFFER_WARM_OUT, ///< in buffer_warm_out
1267 BUFFER_HOT, ///< in buffer_hot
1268 BUFFER_TYPE_MAX
1269 };
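    // Rough 2Q flow (see _add_buffer/_trim in BlueStore.cc for the
    // authoritative logic): a newly cached buffer is admitted to
    // buffer_warm_in (A1in); when it ages out of warm_in its data is dropped
    // but the empty Buffer is kept on buffer_warm_out (A1out) as history; a
    // later access to that range re-admits the data straight into buffer_hot
    // (Am), which behaves as a plain LRU.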
1270
1271 uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
1272
1273 public:
1274 TwoQCache(CephContext* cct) : Cache(cct) {}
1275 uint64_t _get_num_onodes() override {
1276 return onode_lru.size();
1277 }
1278 void _add_onode(OnodeRef& o, int level) override {
1279 if (level > 0)
1280 onode_lru.push_front(*o);
1281 else
1282 onode_lru.push_back(*o);
1283 }
1284 void _rm_onode(OnodeRef& o) override {
1285 auto q = onode_lru.iterator_to(*o);
1286 onode_lru.erase(q);
1287 }
1288 void _touch_onode(OnodeRef& o) override;
1289
1290 uint64_t _get_buffer_bytes() override {
1291 return buffer_bytes;
1292 }
1293 void _add_buffer(Buffer *b, int level, Buffer *near) override;
1294 void _rm_buffer(Buffer *b) override;
1295 void _move_buffer(Cache *src, Buffer *b) override;
1296 void _adjust_buffer_size(Buffer *b, int64_t delta) override;
1297 void _touch_buffer(Buffer *b) override {
1298 switch (b->cache_private) {
1299 case BUFFER_WARM_IN:
1300 // do nothing (somewhat counter-intuitively!)
1301 break;
1302 case BUFFER_WARM_OUT:
1303 // move from warm_out to hot LRU
1304 ceph_abort_msg("this happens via discard hint");
1305 break;
1306 case BUFFER_HOT:
1307 // move to front of hot LRU
1308 buffer_hot.erase(buffer_hot.iterator_to(*b));
1309 buffer_hot.push_front(*b);
1310 break;
1311 }
1312 _audit("_touch_buffer end");
1313 }
1314
1315 void _trim(uint64_t onode_max, uint64_t buffer_max) override;
1316
1317 void add_stats(uint64_t *onodes, uint64_t *extents,
1318 uint64_t *blobs,
1319 uint64_t *buffers,
1320 uint64_t *bytes) override {
1321 std::lock_guard l(lock);
1322 *onodes += onode_lru.size();
1323 *extents += num_extents;
1324 *blobs += num_blobs;
1325 *buffers += buffer_hot.size() + buffer_warm_in.size();
1326 *bytes += buffer_bytes;
1327 }
1328
1329#ifdef DEBUG_CACHE
1330 void _audit(const char *s) override;
1331#endif
1332 };
1333
1334 struct OnodeSpace {
1335 private:
1336 Cache *cache;
1337
1338 /// forward lookups
1339 mempool::bluestore_cache_other::unordered_map<ghobject_t,OnodeRef> onode_map;
1340
1341 friend class Collection; // for split_cache()
1342
1343 public:
1344 OnodeSpace(Cache *c) : cache(c) {}
1345 ~OnodeSpace() {
1346 clear();
1347 }
1348
1349 OnodeRef add(const ghobject_t& oid, OnodeRef o);
1350 OnodeRef lookup(const ghobject_t& o);
1351 void remove(const ghobject_t& oid) {
1352 onode_map.erase(oid);
1353 }
1354 void rename(OnodeRef& o, const ghobject_t& old_oid,
1355 const ghobject_t& new_oid,
1356 const mempool::bluestore_cache_other::string& new_okey);
1357 void clear();
1358 bool empty();
1359
1360 template <int LogLevelV>
1361 void dump(CephContext *cct);
1362
1363 /// return true if f true for any item
1364 bool map_any(std::function<bool(OnodeRef)> f);
1365 };
1366
1367 class OpSequencer;
1368 typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
1369
1370 struct Collection : public CollectionImpl {
1371 BlueStore *store;
1372 OpSequencerRef osr;
1373 Cache *cache; ///< our cache shard
1374 bluestore_cnode_t cnode;
1375 RWLock lock;
1376
1377 bool exists;
1378
1379 SharedBlobSet shared_blob_set; ///< open SharedBlobs
1380
1381 // cache onodes on a per-collection basis to avoid lock
1382 // contention.
1383 OnodeSpace onode_map;
1384
1385 //pool options
1386 pool_opts_t pool_opts;
1387 ContextQueue *commit_queue;
1388
1389 OnodeRef get_onode(const ghobject_t& oid, bool create);
1390
1391 // the terminology is confusing here, sorry!
1392 //
1393 // blob_t shared_blob_t
1394 // !shared unused -> open
1395 // shared !loaded -> open + shared
1396 // shared loaded -> open + shared + loaded
1397 //
1398 // i.e.,
1399 // open = SharedBlob is instantiated
1400 // shared = blob_t shared flag is set; SharedBlob is hashed.
1401 // loaded = SharedBlob::shared_blob_t is loaded from kv store
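    // Typical progression (sketch): new_blob() below creates an open,
    // unshared SharedBlob; a clone makes it shared via make_blob_shared()
    // (the blob_t gains its shared flag and the SharedBlob is hashed into
    // shared_blob_set); a later reference-count update loads the persistent
    // part with load_shared_blob(); make_blob_unshared() reverses this once
    // only a single reference remains.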
1402 void open_shared_blob(uint64_t sbid, BlobRef b);
1403 void load_shared_blob(SharedBlobRef sb);
1404 void make_blob_shared(uint64_t sbid, BlobRef b);
1405 uint64_t make_blob_unshared(SharedBlob *sb);
1406
1407 BlobRef new_blob() {
1408 BlobRef b = new Blob();
1409 b->shared_blob = new SharedBlob(this);
1410 return b;
1411 }
1412
1413 bool contains(const ghobject_t& oid) {
1414 if (cid.is_meta())
1415 return oid.hobj.pool == -1;
1416 spg_t spgid;
1417 if (cid.is_pg(&spgid))
1418 return
1419 spgid.pgid.contains(cnode.bits, oid) &&
1420 oid.shard_id == spgid.shard;
1421 return false;
1422 }
1423
1424 void split_cache(Collection *dest);
1425
1426 bool flush_commit(Context *c) override;
1427 void flush() override;
1428 void flush_all_but_last();
1429
1430 Collection(BlueStore *ns, Cache *ca, coll_t c);
1431 };
1432
1433 class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
1434 CollectionRef c;
1435 OnodeRef o;
1436 KeyValueDB::Iterator it;
1437 string head, tail;
1438
1439 string _stringify() const;
1440
1441 public:
1442 OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
1443 int seek_to_first() override;
1444 int upper_bound(const string &after) override;
1445 int lower_bound(const string &to) override;
1446 bool valid() override;
1447 int next() override;
1448 string key() override;
1449 bufferlist value() override;
1450 int status() override {
1451 return 0;
1452 }
1453 };
1454
1455 struct volatile_statfs{
1456 enum {
1457 STATFS_ALLOCATED = 0,
1458 STATFS_STORED,
1459 STATFS_COMPRESSED_ORIGINAL,
1460 STATFS_COMPRESSED,
1461 STATFS_COMPRESSED_ALLOCATED,
1462 STATFS_LAST
1463 };
1464 int64_t values[STATFS_LAST];
1465 volatile_statfs() {
1466 memset(this, 0, sizeof(volatile_statfs));
1467 }
1468 void reset() {
1469 *this = volatile_statfs();
1470 }
1471 void publish(store_statfs_t* buf) const {
1472 buf->allocated = allocated();
1473 buf->data_stored = stored();
1474 buf->data_compressed = compressed();
1475 buf->data_compressed_original = compressed_original();
1476 buf->data_compressed_allocated = compressed_allocated();
1477 }
1478
1479 volatile_statfs& operator+=(const volatile_statfs& other) {
1480 for (size_t i = 0; i < STATFS_LAST; ++i) {
1481 values[i] += other.values[i];
1482 }
1483 return *this;
1484 }
1485 int64_t& allocated() {
1486 return values[STATFS_ALLOCATED];
1487 }
1488 int64_t& stored() {
1489 return values[STATFS_STORED];
1490 }
1491 int64_t& compressed_original() {
1492 return values[STATFS_COMPRESSED_ORIGINAL];
1493 }
1494 int64_t& compressed() {
1495 return values[STATFS_COMPRESSED];
1496 }
1497 int64_t& compressed_allocated() {
1498 return values[STATFS_COMPRESSED_ALLOCATED];
1499 }
1500 int64_t allocated() const {
1501 return values[STATFS_ALLOCATED];
1502 }
1503 int64_t stored() const {
1504 return values[STATFS_STORED];
1505 }
1506 int64_t compressed_original() const {
1507 return values[STATFS_COMPRESSED_ORIGINAL];
1508 }
1509 int64_t compressed() const {
1510 return values[STATFS_COMPRESSED];
1511 }
1512 int64_t compressed_allocated() const {
1513 return values[STATFS_COMPRESSED_ALLOCATED];
1514 }
1515 volatile_statfs& operator=(const store_statfs_t& st) {
1516 values[STATFS_ALLOCATED] = st.allocated;
1517 values[STATFS_STORED] = st.data_stored;
1518 values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
1519 values[STATFS_COMPRESSED] = st.data_compressed;
1520 values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
1521 return *this;
1522 }
1523 bool is_empty() {
1524 return values[STATFS_ALLOCATED] == 0 &&
1525 values[STATFS_STORED] == 0 &&
1526 values[STATFS_COMPRESSED] == 0 &&
1527 values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
1528 values[STATFS_COMPRESSED_ALLOCATED] == 0;
1529 }
1530 void decode(bufferlist::const_iterator& it) {
1531 using ceph::decode;
1532 for (size_t i = 0; i < STATFS_LAST; i++) {
1533 decode(values[i], it);
1534 }
1535 }
1536
1537 void encode(bufferlist& bl) {
1538 using ceph::encode;
1539 for (size_t i = 0; i < STATFS_LAST; i++) {
1540 encode(values[i], bl);
1541 }
1542 }
1543 };
1544
1545 struct TransContext final : public AioContext {
1546 MEMPOOL_CLASS_HELPERS();
1547
1548 typedef enum {
1549 STATE_PREPARE,
1550 STATE_AIO_WAIT,
1551 STATE_IO_DONE,
1552 STATE_KV_QUEUED, // queued for kv_sync_thread submission
1553 STATE_KV_SUBMITTED, // submitted to kv; not yet synced
1554 STATE_KV_DONE,
1555 STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
1556 STATE_DEFERRED_CLEANUP, // remove deferred kv record
1557 STATE_DEFERRED_DONE,
1558 STATE_FINISHING,
1559 STATE_DONE,
1560 } state_t;
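    // Typical progression (sketch): PREPARE -> AIO_WAIT -> IO_DONE ->
    // KV_QUEUED -> KV_SUBMITTED -> KV_DONE -> FINISHING -> DONE for a plain
    // write; a txc carrying deferred (small overwrite) io additionally passes
    // through DEFERRED_QUEUED and DEFERRED_CLEANUP after KV_DONE before it
    // reaches FINISHING.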
1561
1562 state_t state = STATE_PREPARE;
1563
1564 const char *get_state_name() {
1565 switch (state) {
1566 case STATE_PREPARE: return "prepare";
1567 case STATE_AIO_WAIT: return "aio_wait";
1568 case STATE_IO_DONE: return "io_done";
1569 case STATE_KV_QUEUED: return "kv_queued";
1570 case STATE_KV_SUBMITTED: return "kv_submitted";
1571 case STATE_KV_DONE: return "kv_done";
1572 case STATE_DEFERRED_QUEUED: return "deferred_queued";
1573 case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
1574 case STATE_DEFERRED_DONE: return "deferred_done";
1575 case STATE_FINISHING: return "finishing";
1576 case STATE_DONE: return "done";
1577 }
1578 return "???";
1579 }
1580
1581#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1582 const char *get_state_latency_name(int state) {
1583 switch (state) {
1584 case l_bluestore_state_prepare_lat: return "prepare";
1585 case l_bluestore_state_aio_wait_lat: return "aio_wait";
1586 case l_bluestore_state_io_done_lat: return "io_done";
1587 case l_bluestore_state_kv_queued_lat: return "kv_queued";
1588 case l_bluestore_state_kv_committing_lat: return "kv_committing";
1589 case l_bluestore_state_kv_done_lat: return "kv_done";
1590 case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
1591 case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
1592 case l_bluestore_state_finishing_lat: return "finishing";
1593 case l_bluestore_state_done_lat: return "done";
1594 }
1595 return "???";
1596 }
1597#endif
1598
1599 utime_t log_state_latency(PerfCounters *logger, int state) {
1600 utime_t lat, now = ceph_clock_now();
1601 lat = now - last_stamp;
1602 logger->tinc(state, lat);
1603#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
1604 if (state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) {
1605 double usecs = (now.to_nsec()-last_stamp.to_nsec())/1000;
1606 OID_ELAPSED("", usecs, get_state_latency_name(state));
1607 }
1608#endif
1609 last_stamp = now;
1610 return lat;
1611 }
1612
1613 CollectionRef ch;
1614 OpSequencerRef osr; // this should be ch->osr
1615 boost::intrusive::list_member_hook<> sequencer_item;
1616
1617 uint64_t bytes = 0, cost = 0;
1618
1619 set<OnodeRef> onodes; ///< these need to be updated/written
1620 set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
1621 set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
1622 set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
1623
1624 KeyValueDB::Transaction t; ///< then we will commit this
1625 list<Context*> oncommits; ///< more commit completions
1626 list<CollectionRef> removed_collections; ///< colls we removed
1627
1628 boost::intrusive::list_member_hook<> deferred_queue_item;
1629 bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
1630
1631 interval_set<uint64_t> allocated, released;
1632 volatile_statfs statfs_delta; ///< overall store statistics delta
1633 uint64_t osd_pool_id = META_POOL_ID; ///< osd pool id we're operating on
1634
1635 IOContext ioc;
1636 bool had_ios = false; ///< true if we submitted IOs before our kv txn
1637
1638 uint64_t seq = 0;
1639 utime_t start;
1640 utime_t last_stamp;
1641
1642 uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated
1643 uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated
1644
1645 explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
1646 list<Context*> *on_commits)
1647 : ch(c),
1648 osr(o),
1649 ioc(cct, this),
1650 start(ceph_clock_now()) {
1651 last_stamp = start;
1652 if (on_commits) {
1653 oncommits.swap(*on_commits);
1654 }
1655 }
1656 ~TransContext() {
1657 delete deferred_txn;
1658 }
1659
1660 void write_onode(OnodeRef &o) {
1661 onodes.insert(o);
1662 }
1663 void write_shared_blob(SharedBlobRef &sb) {
1664 shared_blobs.insert(sb);
1665 }
1666 void unshare_blob(SharedBlob *sb) {
1667 shared_blobs.erase(sb);
1668 }
1669
1670 /// note we logically modified object (when onode itself is unmodified)
1671 void note_modified_object(OnodeRef &o) {
1672 // onode itself isn't written, though
1673 modified_objects.insert(o);
1674 }
1675 void note_removed_object(OnodeRef& o) {
1676 onodes.erase(o);
1677 modified_objects.insert(o);
1678 }
1679
1680 void aio_finish(BlueStore *store) override {
1681 store->txc_aio_finish(this);
1682 }
1683 };
1684
1685 typedef boost::intrusive::list<
1686 TransContext,
1687 boost::intrusive::member_hook<
1688 TransContext,
1689 boost::intrusive::list_member_hook<>,
1690 &TransContext::deferred_queue_item> > deferred_queue_t;
1691
1692 struct DeferredBatch final : public AioContext {
1693 OpSequencer *osr;
1694 struct deferred_io {
1695 bufferlist bl; ///< data
1696 uint64_t seq; ///< deferred transaction seq
1697 };
1698 map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
1699 deferred_queue_t txcs; ///< txcs in this batch
1700 IOContext ioc; ///< our aios
1701 /// bytes of pending io for each deferred seq (may be 0)
1702 map<uint64_t,int> seq_bytes;
1703
1704 void _discard(CephContext *cct, uint64_t offset, uint64_t length);
1705 void _audit(CephContext *cct);
1706
1707 DeferredBatch(CephContext *cct, OpSequencer *osr)
1708 : osr(osr), ioc(cct, this) {}
1709
1710 /// prepare a write
1711 void prepare_write(CephContext *cct,
1712 uint64_t seq, uint64_t offset, uint64_t length,
1713 bufferlist::const_iterator& p);
1714
1715 void aio_finish(BlueStore *store) override {
1716 store->_deferred_aio_finish(osr);
1717 }
1718 };
1719
1720 class OpSequencer : public RefCountedObject {
1721 public:
1722 ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
1723 ceph::condition_variable qcond;
1724 typedef boost::intrusive::list<
1725 TransContext,
1726 boost::intrusive::member_hook<
1727 TransContext,
1728 boost::intrusive::list_member_hook<>,
1729 &TransContext::sequencer_item> > q_list_t;
1730 q_list_t q; ///< transactions
1731
1732 boost::intrusive::list_member_hook<> deferred_osr_queue_item;
1733
1734 DeferredBatch *deferred_running = nullptr;
1735 DeferredBatch *deferred_pending = nullptr;
1736
1737 BlueStore *store;
1738 coll_t cid;
1739
1740 uint64_t last_seq = 0;
1741
1742 std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
1743
1744 std::atomic_int kv_committing_serially = {0};
1745
1746 std::atomic_int kv_submitted_waiters = {0};
1747
1748 std::atomic_bool zombie = {false}; ///< in zombie_osr set (collection going away)
1749
1750 OpSequencer(BlueStore *store, const coll_t& c)
1751 : RefCountedObject(store->cct, 0),
1752 store(store), cid(c) {
1753 }
1754 ~OpSequencer() {
1755 ceph_assert(q.empty());
1756 }
1757
1758 void queue_new(TransContext *txc) {
1759 std::lock_guard l(qlock);
1760 txc->seq = ++last_seq;
1761 q.push_back(*txc);
1762 }
1763
1764 void drain() {
1765 std::unique_lock l(qlock);
1766 while (!q.empty())
1767 qcond.wait(l);
1768 }
1769
1770 void drain_preceding(TransContext *txc) {
1771 std::unique_lock l(qlock);
1772 while (!q.empty() && &q.front() != txc)
1773 qcond.wait(l);
1774 }
1775
1776 bool _is_all_kv_submitted() {
1777 // caller must hold qlock; q must not be empty
1778 ceph_assert(!q.empty());
1779 TransContext *txc = &q.back();
1780 if (txc->state >= TransContext::STATE_KV_SUBMITTED) {
1781 return true;
1782 }
1783 return false;
1784 }
1785
1786 void flush() {
1787 std::unique_lock l(qlock);
1788 while (true) {
1789 // set flag before the check because the condition
1790 // may become true outside qlock, and we need to make
1791 // sure those threads see waiters and signal qcond.
1792 ++kv_submitted_waiters;
1793 if (q.empty() || _is_all_kv_submitted()) {
1794 --kv_submitted_waiters;
1795 return;
1796 }
1797 qcond.wait(l);
1798 --kv_submitted_waiters;
1799 }
1800 }
1801
1802 void flush_all_but_last() {
1803 std::unique_lock l(qlock);
1804 assert (q.size() >= 1);
1805 while (true) {
1806 // set flag before the check because the condition
1807 // may become true outside qlock, and we need to make
1808 // sure those threads see waiters and signal qcond.
1809 ++kv_submitted_waiters;
11fdf7f2
TL
1810 if (q.size() <= 1) {
1811 --kv_submitted_waiters;
7c673cae 1812 return;
11fdf7f2
TL
1813 } else {
1814 auto it = q.rbegin();
1815 it++;
1816 if (it->state >= TransContext::STATE_KV_SUBMITTED) {
1817 return;
1818 }
7c673cae
FG
1819 }
1820 qcond.wait(l);
1821 --kv_submitted_waiters;
1822 }
1823 }
1824
11fdf7f2
TL
1825 bool flush_commit(Context *c) {
1826 std::lock_guard l(qlock);
7c673cae
FG
1827 if (q.empty()) {
1828 return true;
1829 }
1830 TransContext *txc = &q.back();
1831 if (txc->state >= TransContext::STATE_KV_DONE) {
1832 return true;
1833 }
1834 txc->oncommits.push_back(c);
1835 return false;
1836 }
1837 };
1838
1839 typedef boost::intrusive::list<
1840 OpSequencer,
1841 boost::intrusive::member_hook<
1842 OpSequencer,
1843 boost::intrusive::list_member_hook<>,
1844 &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
1845
1846 struct KVSyncThread : public Thread {
1847 BlueStore *store;
1848 explicit KVSyncThread(BlueStore *s) : store(s) {}
1849 void *entry() override {
1850 store->_kv_sync_thread();
1851 return NULL;
1852 }
1853 };
31f18b77
FG
1854 struct KVFinalizeThread : public Thread {
1855 BlueStore *store;
1856 explicit KVFinalizeThread(BlueStore *s) : store(s) {}
1857 void *entry() override {
1858 store->_kv_finalize_thread();
1859 return NULL;
1860 }
1861 };
7c673cae
FG
1862
1863 struct DBHistogram {
1864 struct value_dist {
1865 uint64_t count;
1866 uint32_t max_len;
1867 };
1868
1869 struct key_dist {
1870 uint64_t count;
1871 uint32_t max_len;
1872 map<int, struct value_dist> val_map; ///< slab id -> count and max length of values
1873 };
1874
1875 map<string, map<int, struct key_dist> > key_hist;
1876 map<int, uint64_t> value_hist;
1877 int get_key_slab(size_t sz);
1878 string get_key_slab_to_range(int slab);
1879 int get_value_slab(size_t sz);
1880 string get_value_slab_to_range(int slab);
1881 void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
1882 const string &prefix, size_t key_size, size_t value_size);
1883 void dump(Formatter *f);
1884 };
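  // Sketch of the intended use (the slab boundaries themselves are defined in
  // BlueStore.cc; the calls below only illustrate how the struct is filled):
  //
  //   DBHistogram hist;
  //   // for every key/value pair seen while iterating the KV store:
  //   hist.update_hist_entry(hist.key_hist, prefix, key.size(), value.size());
  //   hist.value_hist[hist.get_value_slab(value.size())]++;
  //   hist.dump(f);  // emit per-prefix key/value size distributions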
1885
1886 // --------------------------------------------------------
1887 // members
1888private:
1889 BlueFS *bluefs = nullptr;
1890 unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
1891 bool bluefs_single_shared_device = true;
11fdf7f2
TL
1892 mono_time bluefs_last_balance;
1893 utime_t next_dump_on_bluefs_alloc_failure;
7c673cae
FG
1894
1895 KeyValueDB *db = nullptr;
1896 BlockDevice *bdev = nullptr;
1897 std::string freelist_type;
1898 FreelistManager *fm = nullptr;
1899 Allocator *alloc = nullptr;
1900 uuid_d fsid;
1901 int path_fd = -1; ///< open handle to $path
1902 int fsid_fd = -1; ///< open handle (locked) to $path/fsid
1903 bool mounted = false;
1904
1905 RWLock coll_lock = {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map
31f18b77 1906 mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
11fdf7f2 1907 map<coll_t,CollectionRef> new_coll_map;
7c673cae
FG
1908
1909 vector<Cache*> cache_shards;
1910
11fdf7f2
TL
1911 /// protect zombie_osr_set
1912 ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
1913 std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections
7c673cae
FG
1914
1915 std::atomic<uint64_t> nid_last = {0};
1916 std::atomic<uint64_t> nid_max = {0};
1917 std::atomic<uint64_t> blobid_last = {0};
1918 std::atomic<uint64_t> blobid_max = {0};
1919
1920 Throttle throttle_bytes; ///< submit to commit
1921 Throttle throttle_deferred_bytes; ///< submit to deferred complete
1922
1923 interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
1924 interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
1925
11fdf7f2 1926 ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
7c673cae
FG
1927 std::atomic<uint64_t> deferred_seq = {0};
1928 deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
1929 int deferred_queue_size = 0; ///< num txc's queued across all osrs
1930 atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
11fdf7f2 1931 Finisher deferred_finisher, finisher;
7c673cae
FG
1932
1933 KVSyncThread kv_sync_thread;
11fdf7f2
TL
1934 ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
1935 ceph::condition_variable kv_cond;
3efd9988 1936 bool _kv_only = false;
31f18b77 1937 bool kv_sync_started = false;
7c673cae 1938 bool kv_stop = false;
31f18b77
FG
1939 bool kv_finalize_started = false;
1940 bool kv_finalize_stop = false;
7c673cae
FG
1941 deque<TransContext*> kv_queue; ///< ready, already submitted
1942 deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
1943 deque<TransContext*> kv_committing; ///< currently syncing
1944 deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
7c673cae 1945
31f18b77 1946 KVFinalizeThread kv_finalize_thread;
11fdf7f2
TL
1947 ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
1948 ceph::condition_variable kv_finalize_cond;
31f18b77
FG
1949 deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
1950 deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
1951
7c673cae
FG
1952 PerfCounters *logger = nullptr;
1953
7c673cae
FG
1954 list<CollectionRef> removed_collections;
1955
1956 RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"};
1957 set<ghobject_t> debug_data_error_objects;
1958 set<ghobject_t> debug_mdata_error_objects;
1959
1960 std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
1961
1962 uint64_t block_size = 0; ///< block size of block device (power of 2)
1963 uint64_t block_mask = 0; ///< mask to get just the block offset
1964 size_t block_size_order = 0; ///< bits to shift to get block size
1965
1966 uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
7c673cae 1967 ///< bits for min_alloc_size
224ce89b 1968 uint8_t min_alloc_size_order = 0;
7c673cae
FG
1969 static_assert(std::numeric_limits<uint8_t>::max() >
1970 std::numeric_limits<decltype(min_alloc_size)>::digits,
1971 "not enough bits for min_alloc_size");
1972
7c673cae
FG
1973 ///< maximum allocation unit (power of 2)
1974 std::atomic<uint64_t> max_alloc_size = {0};
1975
224ce89b
WB
1976 ///< number threshold for forced deferred writes
1977 std::atomic<int> deferred_batch_ops = {0};
1978
1979 ///< size threshold for forced deferred writes
1980 std::atomic<uint64_t> prefer_deferred_size = {0};
1981
7c673cae
FG
1982 ///< approx cost per io, in bytes
1983 std::atomic<uint64_t> throttle_cost_per_io = {0};
1984
224ce89b
WB
1985 std::atomic<Compressor::CompressionMode> comp_mode =
1986 {Compressor::COMP_NONE}; ///< compression mode
7c673cae
FG
1987 CompressorRef compressor;
1988 std::atomic<uint64_t> comp_min_blob_size = {0};
1989 std::atomic<uint64_t> comp_max_blob_size = {0};
1990
1991 std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
1992
31f18b77
FG
1993 uint64_t kv_ios = 0;
1994 uint64_t kv_throttle_costs = 0;
1995
7c673cae 1996 // cache trim control
91327a77
AA
1997 uint64_t cache_size = 0; ///< total cache size
1998 double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
1999 double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
2000 double cache_data_ratio = 0; ///< cache ratio dedicated to object data
2001 bool cache_autotune = false; ///< cache autotune setting
91327a77
AA
2002 double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
2003 uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
2004 uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
2005 double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
11fdf7f2 2006 uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
91327a77 2007 double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
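  // When cache_autotune is set, the cache budget is derived from the knobs
  // above, roughly (a simplified sketch; the authoritative logic lives in
  // MempoolThread::_tune_cache_size()):
  //
  //   uint64_t limited = (1.0 - osd_memory_expected_fragmentation) * osd_memory_target;
  //   uint64_t cache_max = osd_memory_cache_min;
  //   if (limited > osd_memory_base + osd_memory_cache_min)
  //     cache_max = limited - osd_memory_base;
  //   // e.g. target 4 GiB, base 768 MiB, 15% fragmentation -> ~2.6 GiB of cache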
11fdf7f2
TL
2008
2009 typedef map<uint64_t, volatile_statfs> osd_pools_map;
2010
2011 ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
31f18b77 2012 volatile_statfs vstatfs;
11fdf7f2
TL
2013 osd_pools_map osd_pools; // protected by vstatfs_lock as well
2014
2015 bool per_pool_stat_collection = true;
7c673cae
FG
2016
2017 struct MempoolThread : public Thread {
91327a77 2018 public:
7c673cae 2019 BlueStore *store;
91327a77 2020
11fdf7f2
TL
2021 ceph::condition_variable cond;
2022 ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
7c673cae 2023 bool stop = false;
91327a77 2024 uint64_t autotune_cache_size = 0;
11fdf7f2 2025 std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
91327a77
AA
2026
2027 struct MempoolCache : public PriorityCache::PriCache {
2028 BlueStore *store;
11fdf7f2
TL
2029 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
2030 int64_t committed_bytes = 0;
91327a77
AA
2031 double cache_ratio = 0;
2032
2033 MempoolCache(BlueStore *s) : store(s) {};
2034
2035 virtual uint64_t _get_used_bytes() const = 0;
2036
2037 virtual int64_t request_cache_bytes(
11fdf7f2 2038 PriorityCache::Priority pri, uint64_t total_cache) const {
91327a77
AA
2039 int64_t assigned = get_cache_bytes(pri);
2040
2041 switch (pri) {
2042 // All cache items are currently shoved into the LAST priority
2043 case PriorityCache::Priority::LAST:
2044 {
11fdf7f2 2045 int64_t request = _get_used_bytes();
91327a77
AA
2046 return (request > assigned) ? request - assigned : 0;
2047 }
2048 default:
2049 break;
2050 }
2051 return -EOPNOTSUPP;
2052 }
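      // Example with the logic above: if the tracked mempools currently hold
      // 100 MiB (_get_used_bytes()) and 60 MiB are already assigned to LAST,
      // request_cache_bytes(LAST, ...) asks for the 40 MiB shortfall; any
      // other priority yields -EOPNOTSUPP.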
2053
2054 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
2055 return cache_bytes[pri];
2056 }
2057 virtual int64_t get_cache_bytes() const {
2058 int64_t total = 0;
2059
2060 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
2061 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
2062 total += get_cache_bytes(pri);
2063 }
2064 return total;
2065 }
2066 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2067 cache_bytes[pri] = bytes;
2068 }
2069 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2070 cache_bytes[pri] += bytes;
2071 }
11fdf7f2
TL
2072 virtual int64_t commit_cache_size(uint64_t total_cache) {
2073 committed_bytes = PriorityCache::get_chunk(
2074 get_cache_bytes(), total_cache);
2075 return committed_bytes;
2076 }
2077 virtual int64_t get_committed_size() const {
2078 return committed_bytes;
91327a77
AA
2079 }
2080 virtual double get_cache_ratio() const {
2081 return cache_ratio;
2082 }
2083 virtual void set_cache_ratio(double ratio) {
2084 cache_ratio = ratio;
2085 }
2086 virtual string get_cache_name() const = 0;
2087 };
2088
2089 struct MetaCache : public MempoolCache {
2090 MetaCache(BlueStore *s) : MempoolCache(s) {};
2091
2092 virtual uint64_t _get_used_bytes() const {
2093 return mempool::bluestore_cache_other::allocated_bytes() +
2094 mempool::bluestore_cache_onode::allocated_bytes();
2095 }
2096
2097 virtual string get_cache_name() const {
2098 return "BlueStore Meta Cache";
2099 }
2100
2101 uint64_t _get_num_onodes() const {
2102 uint64_t onode_num =
2103 mempool::bluestore_cache_onode::allocated_items();
2104 return (2 > onode_num) ? 2 : onode_num;
2105 }
2106
2107 double get_bytes_per_onode() const {
2108 return (double)_get_used_bytes() / (double)_get_num_onodes();
2109 }
11fdf7f2
TL
2110 };
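    // get_bytes_per_onode() is the average metadata footprint per onode
    // (mempool bytes divided by the onode count, with the count floored at 2
    // to avoid degenerate ratios); presumably the trim logic uses it to turn
    // a byte budget into an onode limit.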
2111 std::shared_ptr<MetaCache> meta_cache;
91327a77
AA
2112
2113 struct DataCache : public MempoolCache {
2114 DataCache(BlueStore *s) : MempoolCache(s) {};
2115
2116 virtual uint64_t _get_used_bytes() const {
2117 uint64_t bytes = 0;
2118 for (auto i : store->cache_shards) {
2119 bytes += i->_get_buffer_bytes();
2120 }
2121 return bytes;
2122 }
2123 virtual string get_cache_name() const {
2124 return "BlueStore Data Cache";
2125 }
11fdf7f2
TL
2126 };
2127 std::shared_ptr<DataCache> data_cache;
91327a77 2128
7c673cae
FG
2129 public:
2130 explicit MempoolThread(BlueStore *s)
2131 : store(s),
11fdf7f2
TL
2132 meta_cache(new MetaCache(s)),
2133 data_cache(new DataCache(s)) {}
91327a77 2134
7c673cae
FG
2135 void *entry() override;
2136 void init() {
11fdf7f2 2137 ceph_assert(stop == false);
7c673cae
FG
2138 create("bstore_mempool");
2139 }
2140 void shutdown() {
11fdf7f2 2141 lock.lock();
7c673cae 2142 stop = true;
11fdf7f2
TL
2143 cond.notify_all();
2144 lock.unlock();
7c673cae
FG
2145 join();
2146 }
91327a77
AA
2147
2148 private:
2149 void _adjust_cache_settings();
2150 void _trim_shards(bool interval_stats);
2151 void _tune_cache_size(bool interval_stats);
11fdf7f2
TL
2152 void _balance_cache(
2153 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches);
2154 void _balance_cache_pri(
2155 int64_t *mem_avail,
2156 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches,
2157 PriorityCache::Priority pri);
7c673cae
FG
2158 } mempool_thread;
2159
2160 // --------------------------------------------------------
2161 // private methods
2162
2163 void _init_logger();
2164 void _shutdown_logger();
2165 int _reload_logger();
2166
2167 int _open_path();
2168 void _close_path();
2169 int _open_fsid(bool create);
2170 int _lock_fsid();
2171 int _read_fsid(uuid_d *f);
2172 int _write_fsid();
2173 void _close_fsid();
2174 void _set_alloc_sizes();
2175 void _set_blob_size();
1adf2230 2176 void _set_finisher_num();
7c673cae
FG
2177
2178 int _open_bdev(bool create);
11fdf7f2
TL
2179 // Verifies that the disk has enough space for the reserved area plus the
2180 // minimal BlueFS allocation, and adjusts the latter if needed.
2181 // Depends on min_alloc_size, hence should be called after
2182 // its initialization (and outside of _open_bdev)
2183 void _validate_bdev();
7c673cae 2184 void _close_bdev();
11fdf7f2
TL
2185
2186 int _minimal_open_bluefs(bool create);
2187 void _minimal_close_bluefs();
2188 int _open_bluefs(bool create);
2189 void _close_bluefs();
2190
2191 // Limited (u)mount intended for BlueFS operations only
2192 int _mount_for_bluefs();
2193 void _umount_for_bluefs();
2194
2195
2196 int _is_bluefs(bool create, bool* ret);
2197 /*
2198 * opens the DB along with its dependent super_meta, FreelistManager and
2199 * allocator, in the proper order
2200 */
2201 int _open_db_and_around(bool read_only);
2202 void _close_db_and_around();
2203
2204 // updates legacy BlueFS-related records in the DB to a state valid for
2205 // downgrades from Nautilus.
2206 void _sync_bluefs_and_fm();
2207
2208 /*
2209 * @warning to_repair_db means that we open this db to repair it and will
2210 * not hold RocksDB's file lock.
2211 */
2212 int _open_db(bool create,
2213 bool to_repair_db=false,
2214 bool read_only = false);
7c673cae 2215 void _close_db();
11fdf7f2 2216 int _open_fm(KeyValueDB::Transaction t);
7c673cae
FG
2217 void _close_fm();
2218 int _open_alloc();
2219 void _close_alloc();
2220 int _open_collections(int *errors=0);
2221 void _close_collections();
2222
2223 int _setup_block_symlink_or_file(string name, string path, uint64_t size,
2224 bool create);
2225
7c673cae 2226public:
3efd9988
FG
2227 static int _write_bdev_label(CephContext* cct,
2228 string path, bluestore_bdev_label_t label);
7c673cae
FG
2229 static int _read_bdev_label(CephContext* cct, string path,
2230 bluestore_bdev_label_t *label);
2231private:
2232 int _check_or_set_bdev_label(string path, uint64_t size, string desc,
2233 bool create);
2234
2235 int _open_super_meta();
2236
224ce89b 2237 void _open_statfs();
11fdf7f2 2238 void _get_statfs_overall(struct store_statfs_t *buf);
31f18b77 2239
11fdf7f2
TL
2240 void _dump_alloc_on_failure();
2241
2242 int64_t _get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total);
2243 int _balance_bluefs_freespace();
7c673cae
FG
2244
2245 CollectionRef _get_collection(const coll_t& cid);
2246 void _queue_reap_collection(CollectionRef& c);
2247 void _reap_collections();
2248 void _update_cache_logger();
2249
2250 void _assign_nid(TransContext *txc, OnodeRef o);
2251 uint64_t _assign_blobid(TransContext *txc);
2252
81eedcae
TL
2253 template <int LogLevelV>
2254 friend void _dump_onode(CephContext *cct, const Onode& o);
2255 template <int LogLevelV>
2256 friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
2257 template <int LogLevelV>
2258 friend void _dump_transaction(CephContext *cct, Transaction *t);
7c673cae 2259
11fdf7f2
TL
2260 TransContext *_txc_create(Collection *c, OpSequencer *osr,
2261 list<Context*> *on_commits);
7c673cae
FG
2262 void _txc_update_store_statfs(TransContext *txc);
2263 void _txc_add_transaction(TransContext *txc, Transaction *t);
2264 void _txc_calc_cost(TransContext *txc);
2265 void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
2266 void _txc_state_proc(TransContext *txc);
2267 void _txc_aio_submit(TransContext *txc);
2268public:
2269 void txc_aio_finish(void *p) {
2270 _txc_state_proc(static_cast<TransContext*>(p));
2271 }
2272private:
2273 void _txc_finish_io(TransContext *txc);
2274 void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
2275 void _txc_applied_kv(TransContext *txc);
2276 void _txc_committed_kv(TransContext *txc);
2277 void _txc_finish(TransContext *txc);
2278 void _txc_release_alloc(TransContext *txc);
2279
11fdf7f2
TL
2280 void _osr_attach(Collection *c);
2281 void _osr_register_zombie(OpSequencer *osr);
2282 void _osr_drain(OpSequencer *osr);
7c673cae
FG
2283 void _osr_drain_preceding(TransContext *txc);
2284 void _osr_drain_all();
7c673cae 2285
31f18b77
FG
2286 void _kv_start();
2287 void _kv_stop();
7c673cae 2288 void _kv_sync_thread();
31f18b77 2289 void _kv_finalize_thread();
7c673cae
FG
2290
2291 bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
2292 void _deferred_queue(TransContext *txc);
3efd9988 2293public:
224ce89b 2294 void deferred_try_submit();
3efd9988 2295private:
224ce89b 2296 void _deferred_submit_unlock(OpSequencer *osr);
7c673cae
FG
2297 void _deferred_aio_finish(OpSequencer *osr);
2298 int _deferred_replay();
2299
2300public:
2301 using mempool_dynamic_bitset =
2302 boost::dynamic_bitset<uint64_t,
2303 mempool::bluestore_fsck::pool_allocator<uint64_t>>;
2304
2305private:
2306 int _fsck_check_extents(
11fdf7f2 2307 const coll_t& cid,
7c673cae
FG
2308 const ghobject_t& oid,
2309 const PExtentVector& extents,
2310 bool compressed,
2311 mempool_dynamic_bitset &used_blocks,
b32b8144 2312 uint64_t granularity,
11fdf7f2 2313 BlueStoreRepairer* repairer,
7c673cae
FG
2314 store_statfs_t& expected_statfs);
2315
11fdf7f2
TL
2316 using per_pool_statfs =
2317 mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;
2318 void _fsck_check_pool_statfs(
2319 per_pool_statfs& expected_pool_statfs,
2320 bool need_per_pool_stats,
2321 int& errors,
2322 BlueStoreRepairer* repairer);
2323
7c673cae
FG
2324 void _buffer_cache_write(
2325 TransContext *txc,
2326 BlobRef b,
2327 uint64_t offset,
2328 bufferlist& bl,
2329 unsigned flags) {
2330 b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
2331 flags);
2332 txc->shared_blobs_written.insert(b->shared_blob);
2333 }
2334
2335 int _collection_list(
2336 Collection *c, const ghobject_t& start, const ghobject_t& end,
2337 int max, vector<ghobject_t> *ls, ghobject_t *next);
2338
2339 template <typename T, typename F>
2340 T select_option(const std::string& opt_name, T val1, F f) {
2341 //NB: opt_name reserved for future use
2342 boost::optional<T> val2 = f();
2343 if (val2) {
2344 return *val2;
2345 }
2346 return val1;
2347 }
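  // Typical use, mirroring how the write path consults per-pool options
  // (a sketch; the exact lambda bodies live in BlueStore.cc and the option
  // key below is illustrative):
  //
  //   auto min_blob = select_option(
  //     "compression_min_blob_size", comp_min_blob_size.load(),
  //     [&]() {
  //       int64_t val;
  //       if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
  //         return boost::optional<uint64_t>((uint64_t)val);
  //       }
  //       return boost::optional<uint64_t>();
  //     });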
2348
2349 void _apply_padding(uint64_t head_pad,
2350 uint64_t tail_pad,
7c673cae
FG
2351 bufferlist& padded);
2352
11fdf7f2
TL
2353 void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
2354
7c673cae
FG
2355 // -- ondisk version ---
2356public:
2357 const int32_t latest_ondisk_format = 2; ///< our version
2358 const int32_t min_readable_ondisk_format = 1; ///< what we can read
2359 const int32_t min_compat_ondisk_format = 2; ///< who can read us
2360
2361private:
2362 int32_t ondisk_format = 0; ///< value detected on mount
2363
2364 int _upgrade_super(); ///< upgrade (called during open_super)
11fdf7f2 2365 uint64_t _get_ondisk_reserved() const;
7c673cae
FG
2366 void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
2367
2368 // --- public interface ---
2369public:
2370 BlueStore(CephContext *cct, const string& path);
2371 BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
2372 ~BlueStore() override;
2373
2374 string get_type() override {
2375 return "bluestore";
2376 }
2377
2378 bool needs_journal() override { return false; };
2379 bool wants_journal() override { return false; };
2380 bool allows_journal() override { return false; };
2381
11fdf7f2
TL
2382 int get_devices(set<string> *ls) override;
2383
31f18b77 2384 bool is_rotational() override;
d2e6a577 2385 bool is_journal_rotational() override;
31f18b77 2386
224ce89b
WB
2387 string get_default_device_class() override {
2388 string device_class;
2389 map<string, string> metadata;
2390 collect_metadata(&metadata);
2391 auto it = metadata.find("bluestore_bdev_type");
2392 if (it != metadata.end()) {
2393 device_class = it->second;
2394 }
2395 return device_class;
2396 }
2397
11fdf7f2
TL
2398 int get_numa_node(
2399 int *numa_node,
2400 set<int> *nodes,
2401 set<string> *failed) override;
2402
7c673cae
FG
2403 static int get_block_device_fsid(CephContext* cct, const string& path,
2404 uuid_d *fsid);
2405
2406 bool test_mount_in_use() override;
2407
2408private:
11fdf7f2 2409 int _mount(bool kv_only, bool open_db=true);
7c673cae
FG
2410public:
2411 int mount() override {
2412 return _mount(false);
2413 }
2414 int umount() override;
2415
11fdf7f2
TL
2416 int start_kv_only(KeyValueDB **pdb, bool open_db=true) {
2417 int r = _mount(true, open_db);
7c673cae
FG
2418 if (r < 0)
2419 return r;
2420 *pdb = db;
2421 return 0;
2422 }
2423
3efd9988
FG
2424 int write_meta(const std::string& key, const std::string& value) override;
2425 int read_meta(const std::string& key, std::string *value) override;
2426
2427
2428 int fsck(bool deep) override {
2429 return _fsck(deep, false);
2430 }
2431 int repair(bool deep) override {
2432 return _fsck(deep, true);
2433 }
2434 int _fsck(bool deep, bool repair);
7c673cae
FG
2435
2436 void set_cache_shards(unsigned num) override;
11fdf7f2
TL
2437 void dump_cache_stats(Formatter *f) override {
2438 int onode_count = 0, buffers_bytes = 0;
2439 for (auto i: cache_shards) {
2440 onode_count += i->_get_num_onodes();
2441 buffers_bytes += i->_get_buffer_bytes();
2442 }
2443 f->dump_int("bluestore_onode", onode_count);
2444 f->dump_int("bluestore_buffers", buffers_bytes);
2445 }
2446 void dump_cache_stats(ostream& ss) override {
2447 int onode_count = 0, buffers_bytes = 0;
2448 for (auto i: cache_shards) {
2449 onode_count += i->_get_num_onodes();
2450 buffers_bytes += i->_get_buffer_bytes();
2451 }
2452 ss << "bluestore_onode: " << onode_count;
2453 ss << " bluestore_buffers: " << buffers_bytes;
2454 }
7c673cae
FG
2455
2456 int validate_hobject_key(const hobject_t &obj) const override {
2457 return 0;
2458 }
2459 unsigned get_max_attr_name_length() override {
2460 return 256; // arbitrary; there is no real limit internally
2461 }
2462
2463 int mkfs() override;
2464 int mkjournal() override {
2465 return 0;
2466 }
2467
2468 void get_db_statistics(Formatter *f) override;
2469 void generate_db_histogram(Formatter *f) override;
31f18b77 2470 void _flush_cache();
11fdf7f2 2471 int flush_cache(ostream *os = NULL) override;
7c673cae
FG
2472 void dump_perf_counters(Formatter *f) override {
2473 f->open_object_section("perf_counters");
2474 logger->dump_formatted(f, false);
2475 f->close_section();
2476 }
2477
11fdf7f2
TL
2478 int add_new_bluefs_device(int id, const string& path);
2479 int migrate_to_existing_bluefs_device(const set<int>& devs_source,
2480 int id);
2481 int migrate_to_new_bluefs_device(const set<int>& devs_source,
2482 int id,
2483 const string& path);
2484 int expand_devices(ostream& out);
2485 string get_device_path(unsigned id);
7c673cae
FG
2486
2487public:
11fdf7f2
TL
2488 int statfs(struct store_statfs_t *buf,
2489 osd_alert_list_t* alerts = nullptr) override;
2490 int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
7c673cae
FG
2491
2492 void collect_metadata(map<string,string> *pm) override;
2493
7c673cae
FG
2494 bool exists(CollectionHandle &c, const ghobject_t& oid) override;
2495 int set_collection_opts(
11fdf7f2 2496 CollectionHandle& c,
7c673cae 2497 const pool_opts_t& opts) override;
7c673cae
FG
2498 int stat(
2499 CollectionHandle &c,
2500 const ghobject_t& oid,
2501 struct stat *st,
2502 bool allow_eio = false) override;
7c673cae
FG
2503 int read(
2504 CollectionHandle &c,
2505 const ghobject_t& oid,
2506 uint64_t offset,
2507 size_t len,
2508 bufferlist& bl,
224ce89b 2509 uint32_t op_flags = 0) override;
7c673cae
FG
2510 int _do_read(
2511 Collection *c,
2512 OnodeRef o,
2513 uint64_t offset,
2514 size_t len,
2515 bufferlist& bl,
f64942e4
AA
2516 uint32_t op_flags = 0,
2517 uint64_t retry_count = 0);
7c673cae
FG
2518
2519private:
2520 int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
2521 uint64_t offset, size_t len, interval_set<uint64_t>& destset);
2522public:
7c673cae
FG
2523 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2524 uint64_t offset, size_t len, bufferlist& bl) override;
7c673cae
FG
2525 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2526 uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
2527
2528
7c673cae
FG
2529 int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
2530 bufferptr& value) override;
2531
7c673cae
FG
2532 int getattrs(CollectionHandle &c, const ghobject_t& oid,
2533 map<string,bufferptr>& aset) override;
2534
2535 int list_collections(vector<coll_t>& ls) override;
2536
2537 CollectionHandle open_collection(const coll_t &c) override;
11fdf7f2
TL
2538 CollectionHandle create_new_collection(const coll_t& cid) override;
2539 void set_collection_commit_queue(const coll_t& cid,
2540 ContextQueue *commit_queue) override;
7c673cae
FG
2541
2542 bool collection_exists(const coll_t& c) override;
11fdf7f2
TL
2543 int collection_empty(CollectionHandle& c, bool *empty) override;
2544 int collection_bits(CollectionHandle& c) override;
7c673cae 2545
7c673cae
FG
2546 int collection_list(CollectionHandle &c,
2547 const ghobject_t& start,
2548 const ghobject_t& end,
2549 int max,
2550 vector<ghobject_t> *ls, ghobject_t *next) override;
2551
7c673cae
FG
2552 int omap_get(
2553 CollectionHandle &c, ///< [in] Collection containing oid
2554 const ghobject_t &oid, ///< [in] Object containing omap
2555 bufferlist *header, ///< [out] omap header
2556 map<string, bufferlist> *out ///< [out] Key to value map
2557 ) override;
2558
2559 /// Get omap header
7c673cae
FG
2560 int omap_get_header(
2561 CollectionHandle &c, ///< [in] Collection containing oid
2562 const ghobject_t &oid, ///< [in] Object containing omap
2563 bufferlist *header, ///< [out] omap header
2564 bool allow_eio = false ///< [in] don't assert on eio
2565 ) override;
2566
2567 /// Get keys defined on oid
7c673cae
FG
2568 int omap_get_keys(
2569 CollectionHandle &c, ///< [in] Collection containing oid
2570 const ghobject_t &oid, ///< [in] Object containing omap
2571 set<string> *keys ///< [out] Keys defined on oid
2572 ) override;
2573
2574 /// Get key values
7c673cae
FG
2575 int omap_get_values(
2576 CollectionHandle &c, ///< [in] Collection containing oid
2577 const ghobject_t &oid, ///< [in] Object containing omap
2578 const set<string> &keys, ///< [in] Keys to get
2579 map<string, bufferlist> *out ///< [out] Returned keys and values
2580 ) override;
2581
2582 /// Filters keys into out which are defined on oid
7c673cae
FG
2583 int omap_check_keys(
2584 CollectionHandle &c, ///< [in] Collection containing oid
2585 const ghobject_t &oid, ///< [in] Object containing omap
2586 const set<string> &keys, ///< [in] Keys to check
2587 set<string> *out ///< [out] Subset of keys defined on oid
2588 ) override;
2589
7c673cae
FG
2590 ObjectMap::ObjectMapIterator get_omap_iterator(
2591 CollectionHandle &c, ///< [in] collection
2592 const ghobject_t &oid ///< [in] object
2593 ) override;
2594
2595 void set_fsid(uuid_d u) override {
2596 fsid = u;
2597 }
2598 uuid_d get_fsid() override {
2599 return fsid;
2600 }
2601
2602 uint64_t estimate_objects_overhead(uint64_t num_objects) override {
2603 return num_objects * 300; // assuming per-object overhead is 300 bytes
2604 }
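  // e.g. 1M objects are estimated at ~300 MB of overhead; the 300-byte figure
  // is a coarse average, not a measured per-object cost.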
2605
2606 struct BSPerfTracker {
11fdf7f2
TL
2607 PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
2608 PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
7c673cae
FG
2609
2610 objectstore_perf_stat_t get_cur_stats() const {
2611 objectstore_perf_stat_t ret;
11fdf7f2
TL
2612 ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
2613 ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
7c673cae
FG
2614 return ret;
2615 }
2616
2617 void update_from_perfcounters(PerfCounters &logger);
2618 } perf_tracker;
2619
2620 objectstore_perf_stat_t get_cur_stats() override {
2621 perf_tracker.update_from_perfcounters(*logger);
2622 return perf_tracker.get_cur_stats();
2623 }
2624 const PerfCounters* get_perf_counters() const override {
2625 return logger;
2626 }
2627
2628 int queue_transactions(
11fdf7f2 2629 CollectionHandle& ch,
7c673cae
FG
2630 vector<Transaction>& tls,
2631 TrackedOpRef op = TrackedOpRef(),
2632 ThreadPool::TPHandle *handle = NULL) override;
2633
2634 // error injection
2635 void inject_data_error(const ghobject_t& o) override {
2636 RWLock::WLocker l(debug_read_error_lock);
2637 debug_data_error_objects.insert(o);
2638 }
2639 void inject_mdata_error(const ghobject_t& o) override {
2640 RWLock::WLocker l(debug_read_error_lock);
2641 debug_mdata_error_objects.insert(o);
2642 }
11fdf7f2
TL
2643
2644 /// methods to inject various errors fsck can repair
2645 void inject_broken_shared_blob_key(const string& key,
2646 const bufferlist& bl);
2647 void inject_leaked(uint64_t len);
2648 void inject_false_free(coll_t cid, ghobject_t oid);
2649 void inject_statfs(const string& key, const store_statfs_t& new_statfs);
2650 void inject_misreference(coll_t cid1, ghobject_t oid1,
2651 coll_t cid2, ghobject_t oid2,
2652 uint64_t offset);
2653
224ce89b 2654 void compact() override {
11fdf7f2 2655 ceph_assert(db);
224ce89b
WB
2656 db->compact();
2657 }
28e407b8
AA
2658 bool has_builtin_csum() const override {
2659 return true;
2660 }
2661
11fdf7f2
TL
2662 /*
2663 Allocate space for BlueFS from the slow device.
2664 Either applies the allocated extents to the underlying BlueFS automatically
2665 (when extents == nullptr) or just returns them via the provided non-null extents.
2666 */
2667 int allocate_bluefs_freespace(
2668 uint64_t min_size,
2669 uint64_t size,
2670 PExtentVector* extents);
2671
2672 void log_latency_fn(int idx,
2673 const ceph::timespan& lat,
2674 std::function<string (const ceph::timespan& lat)> fn);
2675
7c673cae
FG
2676private:
2677 bool _debug_data_eio(const ghobject_t& o) {
2678 if (!cct->_conf->bluestore_debug_inject_read_err) {
2679 return false;
2680 }
2681 RWLock::RLocker l(debug_read_error_lock);
2682 return debug_data_error_objects.count(o);
2683 }
2684 bool _debug_mdata_eio(const ghobject_t& o) {
2685 if (!cct->_conf->bluestore_debug_inject_read_err) {
2686 return false;
2687 }
2688 RWLock::RLocker l(debug_read_error_lock);
2689 return debug_mdata_error_objects.count(o);
2690 }
2691 void _debug_obj_on_delete(const ghobject_t& o) {
2692 if (cct->_conf->bluestore_debug_inject_read_err) {
2693 RWLock::WLocker l(debug_read_error_lock);
2694 debug_data_error_objects.erase(o);
2695 debug_mdata_error_objects.erase(o);
2696 }
2697 }
11fdf7f2
TL
2698private:
2699 ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
2700 string failed_cmode;
2701 set<string> failed_compressors;
2702 string spillover_alert;
81eedcae
TL
2703 string legacy_statfs_alert;
2704 string disk_size_mismatch_alert;
11fdf7f2
TL
2705
2706 void _log_alerts(osd_alert_list_t& alerts);
2707 bool _set_compression_alert(bool cmode, const char* s) {
2708 std::lock_guard l(qlock);
2709 if (cmode) {
2710 bool ret = failed_cmode.empty();
2711 failed_cmode = s;
2712 return ret;
2713 }
2714 return failed_compressors.emplace(s).second;
2715 }
2716 void _clear_compression_alert() {
2717 std::lock_guard l(qlock);
2718 failed_compressors.clear();
2719 failed_cmode.clear();
2720 }
2721
2722 void _set_spillover_alert(const string& s) {
2723 std::lock_guard l(qlock);
2724 spillover_alert = s;
2725 }
2726 void _clear_spillover_alert() {
2727 std::lock_guard l(qlock);
2728 spillover_alert.clear();
2729 }
7c673cae 2730
81eedcae
TL
2731 void _check_legacy_statfs_alert();
2732 void _set_disk_size_mismatch_alert(const string& s) {
2733 std::lock_guard l(qlock);
2734 disk_size_mismatch_alert = s;
2735 }
2736
7c673cae
FG
2737private:
2738
2739 // --------------------------------------------------------
2740 // read processing internal methods
2741 int _verify_csum(
2742 OnodeRef& o,
2743 const bluestore_blob_t* blob,
2744 uint64_t blob_xoffset,
2745 const bufferlist& bl,
2746 uint64_t logical_offset) const;
2747 int _decompress(bufferlist& source, bufferlist* result);
2748
2749
2750 // --------------------------------------------------------
2751 // write ops
2752
2753 struct WriteContext {
2754 bool buffered = false; ///< buffered write
2755 bool compress = false; ///< compressed write
2756 uint64_t target_blob_size = 0; ///< target (max) blob size
2757 unsigned csum_order = 0; ///< target checksum chunk order
2758
2759 old_extent_map_t old_extents; ///< must deref these blobs
2760
2761 struct write_item {
2762 uint64_t logical_offset; ///< write logical offset
2763 BlobRef b;
2764 uint64_t blob_length;
2765 uint64_t b_off;
2766 bufferlist bl;
2767 uint64_t b_off0; ///< original offset in a blob prior to padding
2768 uint64_t length0; ///< original data length prior to padding
2769
2770 bool mark_unused;
2771 bool new_blob; ///< whether new blob was created
2772
3efd9988
FG
2773 bool compressed = false;
2774 bufferlist compressed_bl;
2775 size_t compressed_len = 0;
2776
7c673cae
FG
2777 write_item(
2778 uint64_t logical_offs,
2779 BlobRef b,
2780 uint64_t blob_len,
2781 uint64_t o,
2782 bufferlist& bl,
2783 uint64_t o0,
2784 uint64_t l0,
2785 bool _mark_unused,
2786 bool _new_blob)
2787 :
2788 logical_offset(logical_offs),
2789 b(b),
2790 blob_length(blob_len),
2791 b_off(o),
2792 bl(bl),
2793 b_off0(o0),
2794 length0(l0),
2795 mark_unused(_mark_unused),
2796 new_blob(_new_blob) {}
2797 };
2798 vector<write_item> writes; ///< blobs we're writing
2799
2800 /// partial clone of the context
2801 void fork(const WriteContext& other) {
2802 buffered = other.buffered;
2803 compress = other.compress;
2804 target_blob_size = other.target_blob_size;
2805 csum_order = other.csum_order;
2806 }
2807 void write(
2808 uint64_t loffs,
2809 BlobRef b,
2810 uint64_t blob_len,
2811 uint64_t o,
2812 bufferlist& bl,
2813 uint64_t o0,
2814 uint64_t len0,
2815 bool _mark_unused,
2816 bool _new_blob) {
2817 writes.emplace_back(loffs,
2818 b,
2819 blob_len,
2820 o,
2821 bl,
2822 o0,
2823 len0,
2824 _mark_unused,
2825 _new_blob);
2826 }
2827 /// Checks for writes to the same pextent within a blob
2828 bool has_conflict(
2829 BlobRef b,
2830 uint64_t loffs,
2831 uint64_t loffs_end,
2832 uint64_t min_alloc_size);
2833 };
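  // Rough shape of the write pipeline that consumes a WriteContext (a summary
  // of the method declarations below, not a strict contract):
  //   _choose_write_options() fills buffered/compress/target_blob_size,
  //   _do_write_small()/_do_write_big() append write_items via write(),
  //   _do_alloc_write() allocates space, compresses and queues the io,
  //   _wctx_finish() dereferences the blobs recorded in old_extents.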
2834
2835 void _do_write_small(
2836 TransContext *txc,
2837 CollectionRef &c,
2838 OnodeRef o,
2839 uint64_t offset, uint64_t length,
2840 bufferlist::iterator& blp,
2841 WriteContext *wctx);
2842 void _do_write_big(
2843 TransContext *txc,
2844 CollectionRef &c,
2845 OnodeRef o,
2846 uint64_t offset, uint64_t length,
2847 bufferlist::iterator& blp,
2848 WriteContext *wctx);
2849 int _do_alloc_write(
2850 TransContext *txc,
2851 CollectionRef c,
2852 OnodeRef o,
2853 WriteContext *wctx);
2854 void _wctx_finish(
2855 TransContext *txc,
2856 CollectionRef& c,
2857 OnodeRef o,
31f18b77
FG
2858 WriteContext *wctx,
2859 set<SharedBlob*> *maybe_unshared_blobs=0);
7c673cae 2860
7c673cae
FG
2861 int _write(TransContext *txc,
2862 CollectionRef& c,
2863 OnodeRef& o,
2864 uint64_t offset, size_t len,
2865 bufferlist& bl,
2866 uint32_t fadvise_flags);
2867 void _pad_zeros(bufferlist *bl, uint64_t *offset,
2868 uint64_t chunk_size);
2869
31f18b77
FG
2870 void _choose_write_options(CollectionRef& c,
2871 OnodeRef o,
2872 uint32_t fadvise_flags,
2873 WriteContext *wctx);
2874
2875 int _do_gc(TransContext *txc,
2876 CollectionRef& c,
2877 OnodeRef o,
2878 const GarbageCollector& gc,
2879 const WriteContext& wctx,
2880 uint64_t *dirty_start,
2881 uint64_t *dirty_end);
2882
7c673cae
FG
2883 int _do_write(TransContext *txc,
2884 CollectionRef &c,
2885 OnodeRef o,
2886 uint64_t offset, uint64_t length,
2887 bufferlist& bl,
2888 uint32_t fadvise_flags);
2889 void _do_write_data(TransContext *txc,
2890 CollectionRef& c,
2891 OnodeRef o,
2892 uint64_t offset,
2893 uint64_t length,
2894 bufferlist& bl,
2895 WriteContext *wctx);
2896
2897 int _touch(TransContext *txc,
2898 CollectionRef& c,
2899 OnodeRef& o);
2900 int _do_zero(TransContext *txc,
2901 CollectionRef& c,
2902 OnodeRef& o,
2903 uint64_t offset, size_t len);
2904 int _zero(TransContext *txc,
2905 CollectionRef& c,
2906 OnodeRef& o,
2907 uint64_t offset, size_t len);
2908 void _do_truncate(TransContext *txc,
2909 CollectionRef& c,
2910 OnodeRef o,
31f18b77
FG
2911 uint64_t offset,
2912 set<SharedBlob*> *maybe_unshared_blobs=0);
35e4c445 2913 int _truncate(TransContext *txc,
7c673cae
FG
2914 CollectionRef& c,
2915 OnodeRef& o,
2916 uint64_t offset);
2917 int _remove(TransContext *txc,
2918 CollectionRef& c,
2919 OnodeRef& o);
2920 int _do_remove(TransContext *txc,
2921 CollectionRef& c,
2922 OnodeRef o);
2923 int _setattr(TransContext *txc,
2924 CollectionRef& c,
2925 OnodeRef& o,
2926 const string& name,
2927 bufferptr& val);
2928 int _setattrs(TransContext *txc,
2929 CollectionRef& c,
2930 OnodeRef& o,
2931 const map<string,bufferptr>& aset);
2932 int _rmattr(TransContext *txc,
2933 CollectionRef& c,
2934 OnodeRef& o,
2935 const string& name);
2936 int _rmattrs(TransContext *txc,
2937 CollectionRef& c,
2938 OnodeRef& o);
11fdf7f2 2939 void _do_omap_clear(TransContext *txc, const string& prefix, uint64_t id);
7c673cae
FG
2940 int _omap_clear(TransContext *txc,
2941 CollectionRef& c,
2942 OnodeRef& o);
2943 int _omap_setkeys(TransContext *txc,
2944 CollectionRef& c,
2945 OnodeRef& o,
2946 bufferlist& bl);
2947 int _omap_setheader(TransContext *txc,
2948 CollectionRef& c,
2949 OnodeRef& o,
2950 bufferlist& header);
2951 int _omap_rmkeys(TransContext *txc,
2952 CollectionRef& c,
2953 OnodeRef& o,
2954 bufferlist& bl);
2955 int _omap_rmkey_range(TransContext *txc,
2956 CollectionRef& c,
2957 OnodeRef& o,
2958 const string& first, const string& last);
2959 int _set_alloc_hint(
2960 TransContext *txc,
2961 CollectionRef& c,
2962 OnodeRef& o,
2963 uint64_t expected_object_size,
2964 uint64_t expected_write_size,
2965 uint32_t flags);
2966 int _do_clone_range(TransContext *txc,
2967 CollectionRef& c,
2968 OnodeRef& oldo,
2969 OnodeRef& newo,
2970 uint64_t srcoff, uint64_t length, uint64_t dstoff);
2971 int _clone(TransContext *txc,
2972 CollectionRef& c,
2973 OnodeRef& oldo,
2974 OnodeRef& newo);
2975 int _clone_range(TransContext *txc,
2976 CollectionRef& c,
2977 OnodeRef& oldo,
2978 OnodeRef& newo,
2979 uint64_t srcoff, uint64_t length, uint64_t dstoff);
2980 int _rename(TransContext *txc,
2981 CollectionRef& c,
2982 OnodeRef& oldo,
2983 OnodeRef& newo,
2984 const ghobject_t& new_oid);
2985 int _create_collection(TransContext *txc, const coll_t &cid,
2986 unsigned bits, CollectionRef *c);
2987 int _remove_collection(TransContext *txc, const coll_t &cid,
2988 CollectionRef *c);
11fdf7f2 2989 void _do_remove_collection(TransContext *txc, CollectionRef *c);
7c673cae
FG
2990 int _split_collection(TransContext *txc,
2991 CollectionRef& c,
2992 CollectionRef& d,
2993 unsigned bits, int rem);
11fdf7f2
TL
2994 int _merge_collection(TransContext *txc,
2995 CollectionRef *c,
2996 CollectionRef& d,
2997 unsigned bits);
2998
2999private:
3000 std::atomic<uint64_t> out_of_sync_fm = {0};
3001 // --------------------------------------------------------
3002 // BlueFSDeviceExpander implementation
3003 uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
3004 uint64_t bluefs_total) override {
3005 auto delta = _get_bluefs_size_delta(bluefs_free, bluefs_total);
3006 return delta > 0 ? delta : 0;
3007 }
3008 int allocate_freespace(
3009 uint64_t min_size,
3010 uint64_t size,
3011 PExtentVector& extents) override {
3012 return allocate_bluefs_freespace(min_size, size, &extents);
3013 };
7c673cae
FG
3014};
3015
11fdf7f2
TL
3016inline ostream& operator<<(ostream& out, const BlueStore::volatile_statfs& s) {
3017 return out
3018 << " allocated:"
3019 << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
3020 << " stored:"
3021 << s.values[BlueStore::volatile_statfs::STATFS_STORED]
3022 << " compressed:"
3023 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
3024 << " compressed_orig:"
3025 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
3026 << " compressed_alloc:"
3027 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
7c673cae
FG
3028}
3029
3030static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
3031 o->get();
3032}
3033static inline void intrusive_ptr_release(BlueStore::Onode *o) {
3034 o->put();
3035}
3036
3037static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
3038 o->get();
3039}
3040static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
3041 o->put();
3042}
3043
11fdf7f2
TL
3044class BlueStoreRepairer
3045{
3046public:
3047 // to simplify future potential migration to mempools
3048 using fsck_interval = interval_set<uint64_t>;
3049
3050 // Structure to track which pextents are used by a specific cid/oid.
3051 // As with a Bloom filter, only positive and false-positive matches
3052 // are possible.
3053 // Maintains two lists of bloom filters, one for cids and one for oids,
3054 // where each list entry is a BF covering a specific disk pextent range.
3055 // The extent length covered by each filter is determined at init().
3056 // Allows filtering out 'uninteresting' pextents to speed up subsequent
3057 // 'is_used' access.
3058 struct StoreSpaceTracker {
3059 const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
3060 const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
3061 const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrarily selected
3062 static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
3063
3064 typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
3065 bloom_vector collections_bfs;
3066 bloom_vector objects_bfs;
3067
3068 bool was_filtered_out = false;
3069 uint64_t granularity = 0; // extent length for a single filter
3070
3071 StoreSpaceTracker() {
3072 }
3073 StoreSpaceTracker(const StoreSpaceTracker& from) :
3074 collections_bfs(from.collections_bfs),
3075 objects_bfs(from.objects_bfs),
3076 granularity(from.granularity) {
3077 }
3078
3079 void init(uint64_t total,
3080 uint64_t min_alloc_size,
3081 uint64_t mem_cap = DEF_MEM_CAP) {
3082 ceph_assert(!granularity); // not initialized yet
3083 ceph_assert(min_alloc_size && isp2(min_alloc_size));
3084 ceph_assert(mem_cap);
3085
3086 total = round_up_to(total, min_alloc_size);
3087 granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
3088
3089 if (!granularity) {
3090 granularity = min_alloc_size;
3091 } else {
3092 granularity = round_up_to(granularity, min_alloc_size);
3093 }
3094
3095 uint64_t entries = round_up_to(total, granularity) / granularity;
3096 collections_bfs.resize(entries,
3097 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3098 BLOOM_FILTER_TABLE_SIZE,
3099 0,
3100 BLOOM_FILTER_EXPECTED_COUNT));
3101 objects_bfs.resize(entries,
3102 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3103 BLOOM_FILTER_TABLE_SIZE,
3104 0,
3105 BLOOM_FILTER_EXPECTED_COUNT));
3106 }
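    // Worked example for the sizing above: total = 1 TiB, mem_cap = 128 MiB,
    // BLOOM_FILTER_TABLE_SIZE = 32 ->
    //   granularity = 2^40 * 32 * 2 / 2^27 = 512 KiB (then rounded up to a
    //   multiple of min_alloc_size), entries = 2^40 / 512 KiB = 2M filters per
    //   vector, i.e. roughly 2 * 2M * 32 B = 128 MiB of filter tables.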
3107 inline uint32_t get_hash(const coll_t& cid) const {
3108 return cid.hash_to_shard(1);
3109 }
3110 inline void set_used(uint64_t offset, uint64_t len,
3111 const coll_t& cid, const ghobject_t& oid) {
3112 ceph_assert(granularity); // initialized
3113
3114 // can't call this func after filter_out has been applied
3115 ceph_assert(!was_filtered_out);
3116 if (!len) {
3117 return;
3118 }
3119 auto pos = offset / granularity;
3120 auto end_pos = (offset + len - 1) / granularity;
3121 while (pos <= end_pos) {
3122 collections_bfs[pos].insert(get_hash(cid));
3123 objects_bfs[pos].insert(oid.hobj.get_hash());
3124 ++pos;
3125 }
3126 }
3127 // filter out entries unrelated to the specified (broken) extents;
3128 // only 'is_used' calls are permitted after that
3129 size_t filter_out(const fsck_interval& extents);
3130
3131 // determines whether the collection is present after filtering out
3132 inline bool is_used(const coll_t& cid) const {
3133 ceph_assert(was_filtered_out);
3134 for(auto& bf : collections_bfs) {
3135 if (bf.contains(get_hash(cid))) {
3136 return true;
3137 }
3138 }
3139 return false;
3140 }
3141 // determines whether the object is present after filtering out
3142 inline bool is_used(const ghobject_t& oid) const {
3143 ceph_assert(was_filtered_out);
3144 for(auto& bf : objects_bfs) {
3145 if (bf.contains(oid.hobj.get_hash())) {
3146 return true;
3147 }
3148 }
3149 return false;
3150 }
3151 // determines whether the collection is present before filtering out
3152 inline bool is_used(const coll_t& cid, uint64_t offs) const {
3153 ceph_assert(granularity); // initialized
3154 ceph_assert(!was_filtered_out);
3155 auto &bf = collections_bfs[offs / granularity];
3156 if (bf.contains(get_hash(cid))) {
3157 return true;
3158 }
3159 return false;
3160 }
3161 // determines whether the object is present before filtering out
3162 inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
3163 ceph_assert(granularity); // initialized
3164 ceph_assert(!was_filtered_out);
3165 auto &bf = objects_bfs[offs / granularity];
3166 if (bf.contains(oid.hobj.get_hash())) {
3167 return true;
3168 }
3169 return false;
3170 }
3171 };
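  // Expected usage during fsck (a sketch; the real driver is BlueStore::_fsck()):
  //
  //   StoreSpaceTracker& t = repairer.get_space_usage_tracker();
  //   t.init(bdev_size, min_alloc_size);      // picks the per-filter granularity
  //   t.set_used(offset, length, cid, oid);   // for every extent encountered
  //   t.filter_out(broken_extents);           // broken_extents: fsck_interval of suspects
  //   if (t.is_used(cid)) { /* cid may own one of the broken ranges */ }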
3172public:
3173
3174 bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
3175 bool fix_shared_blob(KeyValueDB *db,
3176 uint64_t sbid,
3177 const bufferlist* bl);
3178 bool fix_statfs(KeyValueDB *db, const string& key,
3179 const store_statfs_t& new_statfs);
3180
3181 bool fix_leaked(KeyValueDB *db,
3182 FreelistManager* fm,
3183 uint64_t offset, uint64_t len);
3184 bool fix_false_free(KeyValueDB *db,
3185 FreelistManager* fm,
3186 uint64_t offset, uint64_t len);
3187 bool fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag);
3188
3189 void init(uint64_t total_space, uint64_t lres_tracking_unit_size);
3190
3191 bool preprocess_misreference(KeyValueDB *db);
3192
3193 unsigned apply(KeyValueDB* db);
3194
3195 void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
3196 misreferenced_extents.union_insert(offs, len);
3197 if (inc_error) {
3198 ++to_repair_cnt;
3199 }
3200 }
3201
3202 StoreSpaceTracker& get_space_usage_tracker() {
3203 return space_usage_tracker;
3204 }
3205 const fsck_interval& get_misreferences() const {
3206 return misreferenced_extents;
3207 }
3208 KeyValueDB::Transaction get_fix_misreferences_txn() {
3209 return fix_misreferences_txn;
3210 }
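  // The repairer is accumulate-then-apply: fsck calls the fix_*() helpers as
  // problems are found, each staging its change into one of the per-category
  // transactions below and bumping to_repair_cnt; apply(db) then submits the
  // staged transactions and (presumably) reports how many repairs were made.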
3211
3212private:
3213 unsigned to_repair_cnt = 0;
3214 KeyValueDB::Transaction fix_fm_leaked_txn;
3215 KeyValueDB::Transaction fix_fm_false_free_txn;
3216 KeyValueDB::Transaction remove_key_txn;
3217 KeyValueDB::Transaction fix_statfs_txn;
3218 KeyValueDB::Transaction fix_shared_blob_txn;
3219
3220 KeyValueDB::Transaction fix_misreferences_txn;
3221
3222 StoreSpaceTracker space_usage_tracker;
3223
3224 // non-shared extents with multiple references
3225 fsck_interval misreferenced_extents;
3226
3227};
7c673cae 3228#endif