// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_OSD_BLUESTORE_H
#define CEPH_OSD_BLUESTORE_H

#include "acconfig.h"

#include <unistd.h>

#include <atomic>
#include <mutex>
#include <condition_variable>

#include <boost/intrusive/list.hpp>
#include <boost/intrusive/unordered_set.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/functional/hash.hpp>
#include <boost/dynamic_bitset.hpp>

#include "include/cpp-btree/btree_set.h"

#include "include/ceph_assert.h"
#include "include/unordered_map.h"
#include "include/mempool.h"
#include "common/bloom_filter.hpp"
#include "common/Finisher.h"
#include "common/Throttle.h"
#include "common/perf_counters.h"
#include "common/PriorityCache.h"
#include "compressor/Compressor.h"
#include "os/ObjectStore.h"

#include "bluestore_types.h"
#include "BlockDevice.h"
#include "BlueFS.h"
#include "common/EventTrace.h"

class Allocator;
class FreelistManager;
class BlueStoreRepairer;

//#define DEBUG_CACHE
//#define DEBUG_DEFERRED


// constants for Buffer::optimize()
#define MAX_BUFFER_SLOP_RATIO_DEN 8  // so actually 1/N
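// E.g. (a sketch, not normative): with MAX_BUFFER_SLOP_RATIO_DEN == 8, a cached
// 4096-byte buffer is rebuilt by Buffer::maybe_rebuild() once its front
// bufferptr wastes more than 4096/8 = 512 bytes, or when it is fragmented
// across multiple bufferptrs.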


enum {
  l_bluestore_first = 732430,
  l_bluestore_kv_flush_lat,
  l_bluestore_kv_commit_lat,
  l_bluestore_kv_sync_lat,
  l_bluestore_kv_final_lat,
  l_bluestore_state_prepare_lat,
  l_bluestore_state_aio_wait_lat,
  l_bluestore_state_io_done_lat,
  l_bluestore_state_kv_queued_lat,
  l_bluestore_state_kv_committing_lat,
  l_bluestore_state_kv_done_lat,
  l_bluestore_state_deferred_queued_lat,
  l_bluestore_state_deferred_aio_wait_lat,
  l_bluestore_state_deferred_cleanup_lat,
  l_bluestore_state_finishing_lat,
  l_bluestore_state_done_lat,
  l_bluestore_throttle_lat,
  l_bluestore_submit_lat,
  l_bluestore_commit_lat,
  l_bluestore_read_lat,
  l_bluestore_read_onode_meta_lat,
  l_bluestore_read_wait_aio_lat,
  l_bluestore_compress_lat,
  l_bluestore_decompress_lat,
  l_bluestore_csum_lat,
  l_bluestore_compress_success_count,
  l_bluestore_compress_rejected_count,
  l_bluestore_write_pad_bytes,
  l_bluestore_deferred_write_ops,
  l_bluestore_deferred_write_bytes,
  l_bluestore_write_penalty_read_ops,
  l_bluestore_allocated,
  l_bluestore_stored,
  l_bluestore_compressed,
  l_bluestore_compressed_allocated,
  l_bluestore_compressed_original,
  l_bluestore_onodes,
  l_bluestore_onode_hits,
  l_bluestore_onode_misses,
  l_bluestore_onode_shard_hits,
  l_bluestore_onode_shard_misses,
  l_bluestore_extents,
  l_bluestore_blobs,
  l_bluestore_buffers,
  l_bluestore_buffer_bytes,
  l_bluestore_buffer_hit_bytes,
  l_bluestore_buffer_miss_bytes,
  l_bluestore_write_big,
  l_bluestore_write_big_bytes,
  l_bluestore_write_big_blobs,
  l_bluestore_write_small,
  l_bluestore_write_small_bytes,
  l_bluestore_write_small_unused,
  l_bluestore_write_small_deferred,
  l_bluestore_write_small_pre_read,
  l_bluestore_write_small_new,
  l_bluestore_txc,
  l_bluestore_onode_reshard,
  l_bluestore_blob_split,
  l_bluestore_extent_compress,
  l_bluestore_gc_merged,
  l_bluestore_read_eio,
  l_bluestore_reads_with_retries,
  l_bluestore_fragmentation,
  l_bluestore_omap_seek_to_first_lat,
  l_bluestore_omap_upper_bound_lat,
  l_bluestore_omap_lower_bound_lat,
  l_bluestore_omap_next_lat,
  l_bluestore_clist_lat,
  l_bluestore_last
};

#define META_POOL_ID ((uint64_t)-1ull)
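// Presumably the sentinel pool id used for bluestore-internal metadata, i.e.
// for transactions not tied to a specific OSD pool (see
// TransContext::osd_pool_id below).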

class BlueStore : public ObjectStore,
                  public BlueFSDeviceExpander,
                  public md_config_obs_t {
  // -----------------------------------------------------
  // types
public:
  // config observer
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) override;

  // handler for discard event
  void handle_discard(interval_set<uint64_t>& to_release);

  void _set_csum();
  void _set_compression();
  void _set_throttle_params();
  int _set_cache_sizes();

  class TransContext;

  typedef map<uint64_t, bufferlist> ready_regions_t;


  struct BufferSpace;
  struct Collection;
  typedef boost::intrusive_ptr<Collection> CollectionRef;

  struct AioContext {
    virtual void aio_finish(BlueStore *store) = 0;
    virtual ~AioContext() {}
  };

  /// cached buffer
  struct Buffer {
    MEMPOOL_CLASS_HELPERS();

    enum {
      STATE_EMPTY,    ///< empty buffer -- used for cache history
      STATE_CLEAN,    ///< clean data that is up to date
      STATE_WRITING,  ///< data that is being written (io not yet complete)
    };
    static const char *get_state_name(int s) {
      switch (s) {
      case STATE_EMPTY: return "empty";
      case STATE_CLEAN: return "clean";
      case STATE_WRITING: return "writing";
      default: return "???";
      }
    }
    enum {
      FLAG_NOCACHE = 1,  ///< trim when done WRITING (do not become CLEAN)
      // NOTE: fix operator<< when you define a second flag
    };
    static const char *get_flag_name(int s) {
      switch (s) {
      case FLAG_NOCACHE: return "nocache";
      default: return "???";
      }
    }
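
    // Lifecycle sketch (as implied by the states/flags above): buffers enter
    // the cache in STATE_WRITING via BufferSpace::write() or in STATE_CLEAN
    // via did_read(); once the owning transaction's seq completes
    // (_finish_write), WRITING buffers become CLEAN unless FLAG_NOCACHE is
    // set, in which case they are trimmed instead.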

    BufferSpace *space;
    uint16_t state;              ///< STATE_*
    uint16_t cache_private = 0;  ///< opaque (to us) value used by Cache impl
    uint32_t flags;              ///< FLAG_*
    uint64_t seq;
    uint32_t offset, length;
    bufferlist data;

    boost::intrusive::list_member_hook<> lru_item;
    boost::intrusive::list_member_hook<> state_item;

    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o),
        length(b.length()), data(b) {}

    bool is_empty() const {
      return state == STATE_EMPTY;
    }
    bool is_clean() const {
      return state == STATE_CLEAN;
    }
    bool is_writing() const {
      return state == STATE_WRITING;
    }

    uint32_t end() const {
      return offset + length;
    }

    void truncate(uint32_t newlen) {
      ceph_assert(newlen < length);
      if (data.length()) {
        bufferlist t;
        t.substr_of(data, 0, newlen);
        data.claim(t);
      }
      length = newlen;
    }
    void maybe_rebuild() {
      if (data.length() &&
          (data.get_num_buffers() > 1 ||
           data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
        data.rebuild();
      }
    }

    void dump(Formatter *f) const {
      f->dump_string("state", get_state_name(state));
      f->dump_unsigned("seq", seq);
      f->dump_unsigned("offset", offset);
      f->dump_unsigned("length", length);
      f->dump_unsigned("data_length", data.length());
    }
  };

  struct Cache;

  /// map logical extent range (object) onto buffers
  struct BufferSpace {
    enum {
      BYPASS_CLEAN_CACHE = 0x1,  // bypass clean cache
    };

    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
        Buffer,
        boost::intrusive::list_member_hook<>,
        &Buffer::state_item> > state_list_t;

    mempool::bluestore_cache_other::map<uint32_t, std::unique_ptr<Buffer>>
      buffer_map;

    // we use a bare intrusive list here instead of std::map because
    // it uses less memory and we expect this to be very small (very
    // few IOs in flight to the same Blob at the same time).
    state_list_t writing;  ///< writing buffers, sorted by seq, ascending

    ~BufferSpace() {
      ceph_assert(buffer_map.empty());
      ceph_assert(writing.empty());
    }

    void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
      cache->_audit("_add_buffer start");
      buffer_map[b->offset].reset(b);
      if (b->is_writing()) {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
        if (writing.empty() || writing.rbegin()->seq <= b->seq) {
          writing.push_back(*b);
        } else {
          auto it = writing.begin();
          while (it->seq < b->seq) {
            ++it;
          }

          ceph_assert(it->seq >= b->seq);
          // note that this will insert b before it
          // hence the order is maintained
          writing.insert(it, *b);
        }
      } else {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
        cache->_add_buffer(b, level, near);
      }
      cache->_audit("_add_buffer end");
    }
    void _rm_buffer(Cache* cache, Buffer *b) {
      _rm_buffer(cache, buffer_map.find(b->offset));
    }
    void _rm_buffer(Cache* cache,
                    map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
      ceph_assert(p != buffer_map.end());
      cache->_audit("_rm_buffer start");
      if (p->second->is_writing()) {
        writing.erase(writing.iterator_to(*p->second));
      } else {
        cache->_rm_buffer(p->second.get());
      }
      buffer_map.erase(p);
      cache->_audit("_rm_buffer end");
    }

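    // Return an iterator to the first buffer that could overlap 'offset':
    // lower_bound() finds the first buffer starting at or after offset, and we
    // step back one entry when the preceding buffer extends past offset.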
    map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
      uint32_t offset) {
      auto i = buffer_map.lower_bound(offset);
      if (i != buffer_map.begin()) {
        --i;
        if (i->first + i->second->length <= offset)
          ++i;
      }
      return i;
    }

    // must be called under protection of the Cache lock
    void _clear(Cache* cache);

    // return value is the highest cache_private of a trimmed buffer, or 0.
    int discard(Cache* cache, uint32_t offset, uint32_t length) {
      std::lock_guard l(cache->lock);
      return _discard(cache, offset, length);
    }
    int _discard(Cache* cache, uint32_t offset, uint32_t length);

    void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
               unsigned flags) {
      std::lock_guard l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
                             flags);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
    }
    void _finish_write(Cache* cache, uint64_t seq);
    void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
      std::lock_guard l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, 1, nullptr);
    }

    void read(Cache* cache, uint32_t offset, uint32_t length,
              BlueStore::ready_regions_t& res,
              interval_set<uint32_t>& res_intervals,
              int flags = 0);

    void truncate(Cache* cache, uint32_t offset) {
      discard(cache, offset, (uint32_t)-1 - offset);
    }

    void split(Cache* cache, size_t pos, BufferSpace &r);

    void dump(Cache* cache, Formatter *f) const {
      std::lock_guard l(cache->lock);
      f->open_array_section("buffers");
      for (auto& i : buffer_map) {
        f->open_object_section("buffer");
        ceph_assert(i.first == i.second->offset);
        i.second->dump(f);
        f->close_section();
      }
      f->close_section();
    }
  };

  struct SharedBlobSet;

  /// in-memory shared blob state (incl cached buffers)
  struct SharedBlob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};  ///< reference count
    bool loaded = false;

    CollectionRef coll;
    union {
      uint64_t sbid_unloaded;               ///< sbid if persistent isn't loaded
      bluestore_shared_blob_t *persistent;  ///< persistent part of the shared blob if any
    };
    BufferSpace bc;  ///< buffer cache

    SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
      if (get_cache()) {
        get_cache()->add_blob();
      }
    }
    SharedBlob(uint64_t i, Collection *_coll);
    ~SharedBlob();

    uint64_t get_sbid() const {
      return loaded ? persistent->sbid : sbid_unloaded;
    }

    friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
    friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }

    friend ostream& operator<<(ostream& out, const SharedBlob& sb);

    void get() {
      ++nref;
    }
    void put();

    /// get logical references
    void get_ref(uint64_t offset, uint32_t length);

    /// put logical references, and get back any released extents
    void put_ref(uint64_t offset, uint32_t length,
                 PExtentVector *r, bool *unshare);

    void finish_write(uint64_t seq);

    friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
      return l.get_sbid() == r.get_sbid();
    }
    inline Cache* get_cache() {
      return coll ? coll->cache : nullptr;
    }
    inline SharedBlobSet* get_parent() {
      return coll ? &(coll->shared_blob_set) : nullptr;
    }
    inline bool is_loaded() const {
      return loaded;
    }

  };
  typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;

  /// a lookup table of SharedBlobs
  struct SharedBlobSet {
    /// protect lookup, insertion, removal
    ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");

    // we use a bare pointer because we don't want to affect the ref
    // count
    mempool::bluestore_cache_other::unordered_map<uint64_t,SharedBlob*> sb_map;

    SharedBlobRef lookup(uint64_t sbid) {
      std::lock_guard l(lock);
      auto p = sb_map.find(sbid);
      if (p == sb_map.end() ||
          p->second->nref == 0) {
        return nullptr;
      }
      return p->second;
    }

    void add(Collection* coll, SharedBlob *sb) {
      std::lock_guard l(lock);
      sb_map[sb->get_sbid()] = sb;
      sb->coll = coll;
    }

    bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
      std::lock_guard l(lock);
      ceph_assert(sb->get_parent() == this);
      if (verify_nref_is_zero && sb->nref != 0) {
        return false;
      }
      // only remove if it still points to us
      auto p = sb_map.find(sb->get_sbid());
      if (p != sb_map.end() &&
          p->second == sb) {
        sb_map.erase(p);
      }
      return true;
    }

    bool empty() {
      std::lock_guard l(lock);
      return sb_map.empty();
    }

    template <int LogLevelV>
    void dump(CephContext *cct);
  };

//#define CACHE_BLOB_BL  // not sure if this is a win yet or not... :/

  /// in-memory blob metadata and associated cached buffers (if any)
  struct Blob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};    ///< reference count
    int16_t id = -1;               ///< id, for spanning blobs only, >= 0
    int16_t last_encoded_id = -1;  ///< (ephemeral) used during encoding only
    SharedBlobRef shared_blob;     ///< shared blob state (if any)

  private:
    mutable bluestore_blob_t blob;  ///< decoded blob metadata
#ifdef CACHE_BLOB_BL
    mutable bufferlist blob_bl;     ///< cached encoded blob, blob is dirty if empty
#endif
    /// refs from this shard. ephemeral if id<0, persisted if spanning.
    bluestore_blob_use_tracker_t used_in_blob;

  public:

    friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
    friend void intrusive_ptr_release(Blob *b) { b->put(); }

    friend ostream& operator<<(ostream& out, const Blob &b);

    const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
      return used_in_blob;
    }
    bool is_referenced() const {
      return used_in_blob.is_not_empty();
    }
    uint32_t get_referenced_bytes() const {
      return used_in_blob.get_referenced_bytes();
    }

    bool is_spanning() const {
      return id >= 0;
    }

    bool can_split() const {
      std::lock_guard l(shared_blob->get_cache()->lock);
      // splitting a BufferSpace writing list is too hard; don't try.
      return shared_blob->bc.writing.empty() &&
             used_in_blob.can_split() &&
             get_blob().can_split();
    }

    bool can_split_at(uint32_t blob_offset) const {
      return used_in_blob.can_split_at(blob_offset) &&
             get_blob().can_split_at(blob_offset);
    }

    bool can_reuse_blob(uint32_t min_alloc_size,
                        uint32_t target_blob_size,
                        uint32_t b_offset,
                        uint32_t *length0);

    void dup(Blob& o) {
      o.shared_blob = shared_blob;
      o.blob = blob;
#ifdef CACHE_BLOB_BL
      o.blob_bl = blob_bl;
#endif
    }

    inline const bluestore_blob_t& get_blob() const {
      return blob;
    }
    inline bluestore_blob_t& dirty_blob() {
#ifdef CACHE_BLOB_BL
      blob_bl.clear();
#endif
      return blob;
    }

    /// discard buffers for unallocated regions
    void discard_unallocated(Collection *coll);

    /// get logical references
    void get_ref(Collection *coll, uint32_t offset, uint32_t length);
    /// put logical references, and get back any released extents
    bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
                 PExtentVector *r);

    /// split the blob
    void split(Collection *coll, uint32_t blob_offset, Blob *o);

    void get() {
      ++nref;
    }
    void put() {
      if (--nref == 0)
        delete this;
    }


#ifdef CACHE_BLOB_BL
    void _encode() const {
      if (blob_bl.length() == 0) {
        encode(blob, blob_bl);
      } else {
        ceph_assert(blob_bl.length());
      }
    }
    void bound_encode(
      size_t& p,
      bool include_ref_map) const {
      _encode();
      p += blob_bl.length();
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      bufferlist::contiguous_appender& p,
      bool include_ref_map) const {
      _encode();
      p.append(blob_bl);
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection */*coll*/,
      bufferptr::const_iterator& p,
      bool include_ref_map) {
      const char *start = p.get_pos();
      denc(blob, p);
      const char *end = p.get_pos();
      blob_bl.clear();
      blob_bl.append(start, end - start);
      if (include_ref_map) {
        used_in_blob.decode(p);
      }
    }
#else
    void bound_encode(
      size_t& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      bufferlist::contiguous_appender& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection *coll,
      bufferptr::const_iterator& p,
      uint64_t struct_v,
      uint64_t* sbid,
      bool include_ref_map);
#endif
  };
  typedef boost::intrusive_ptr<Blob> BlobRef;
  typedef mempool::bluestore_cache_other::map<int,BlobRef> blob_map_t;

  /// a logical extent, pointing to (some portion of) a blob
  typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase;  // making an alias to avoid build warnings
  struct Extent : public ExtentBase {
    MEMPOOL_CLASS_HELPERS();

    uint32_t logical_offset = 0;  ///< logical offset
    uint32_t blob_offset = 0;     ///< blob offset
    uint32_t length = 0;          ///< length
    BlobRef blob;                 ///< the blob with our data

    /// ctor for lookup only
    explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
    /// ctor for delayed initialization (see decode_some())
    explicit Extent() : ExtentBase() {
    }
    /// ctor for general usage
    Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : ExtentBase(),
        logical_offset(lo), blob_offset(o), length(l) {
      assign_blob(b);
    }
    ~Extent() {
      if (blob) {
        blob->shared_blob->get_cache()->rm_extent();
      }
    }

    void assign_blob(const BlobRef& b) {
      ceph_assert(!blob);
      blob = b;
      blob->shared_blob->get_cache()->add_extent();
    }

    // comparators for intrusive_set
    friend bool operator<(const Extent &a, const Extent &b) {
      return a.logical_offset < b.logical_offset;
    }
    friend bool operator>(const Extent &a, const Extent &b) {
      return a.logical_offset > b.logical_offset;
    }
    friend bool operator==(const Extent &a, const Extent &b) {
      return a.logical_offset == b.logical_offset;
    }

    uint32_t blob_start() const {
      return logical_offset - blob_offset;
    }

    uint32_t blob_end() const {
      return blob_start() + blob->get_blob().get_logical_length();
    }

    uint32_t logical_end() const {
      return logical_offset + length;
    }

    // return true if any piece of the blob is out of
    // the given range [o, o + l].
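    // (e.g., an extent whose blob covers [0, 0x10000) escapes the range
    // o=0x4000, l=0x1000, since the blob starts before o and ends after o + l)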
    bool blob_escapes_range(uint32_t o, uint32_t l) const {
      return blob_start() < o || blob_end() > o + l;
    }
  };
  typedef boost::intrusive::set<Extent> extent_map_t;


  friend ostream& operator<<(ostream& out, const Extent& e);

  struct OldExtent {
    boost::intrusive::list_member_hook<> old_extent_item;
    Extent e;
    PExtentVector r;
    bool blob_empty;  // flag to track the last removed extent that makes blob
                      // empty - required to update compression stat properly
    OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : e(lo, o, l, b), blob_empty(false) {
    }
    static OldExtent* create(CollectionRef c,
                             uint32_t lo,
                             uint32_t o,
                             uint32_t l,
                             BlobRef& b);
  };
  typedef boost::intrusive::list<
    OldExtent,
    boost::intrusive::member_hook<
      OldExtent,
      boost::intrusive::list_member_hook<>,
      &OldExtent::old_extent_item> > old_extent_map_t;

  struct Onode;

  /// a sharded extent map, mapping offsets to lextents to blobs
  struct ExtentMap {
    Onode *onode;
    extent_map_t extent_map;       ///< map of Extents to Blobs
    blob_map_t spanning_blob_map;  ///< blobs that span shards
    typedef boost::intrusive_ptr<Onode> OnodeRef;

    struct Shard {
      bluestore_onode_t::shard_info *shard_info = nullptr;
      unsigned extents = 0;  ///< count extents in this shard
      bool loaded = false;   ///< true if shard is loaded
      bool dirty = false;    ///< true if shard is dirty and needs reencoding
    };
    mempool::bluestore_cache_other::vector<Shard> shards;  ///< shards

    bufferlist inline_bl;  ///< cached encoded map, if unsharded; empty=>dirty

    uint32_t needs_reshard_begin = 0;
    uint32_t needs_reshard_end = 0;

    void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
             uint64_t&, uint64_t&, uint64_t&);

    bool needs_reshard() const {
      return needs_reshard_end > needs_reshard_begin;
    }
    void clear_needs_reshard() {
      needs_reshard_begin = needs_reshard_end = 0;
    }
    void request_reshard(uint32_t begin, uint32_t end) {
      if (begin < needs_reshard_begin) {
        needs_reshard_begin = begin;
      }
      if (end > needs_reshard_end) {
        needs_reshard_end = end;
      }
    }

    struct DeleteDisposer {
      void operator()(Extent *e) { delete e; }
    };

    ExtentMap(Onode *o);
    ~ExtentMap() {
      extent_map.clear_and_dispose(DeleteDisposer());
    }

    void clear() {
      extent_map.clear_and_dispose(DeleteDisposer());
      shards.clear();
      inline_bl.clear();
      clear_needs_reshard();
    }

    bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
                     unsigned *pn);
    unsigned decode_some(bufferlist& bl);

    void bound_encode_spanning_blobs(size_t& p);
    void encode_spanning_blobs(bufferlist::contiguous_appender& p);
    void decode_spanning_blobs(bufferptr::const_iterator& p);

    BlobRef get_spanning_blob(int id) {
      auto p = spanning_blob_map.find(id);
      ceph_assert(p != spanning_blob_map.end());
      return p->second;
    }

    void update(KeyValueDB::Transaction t, bool force);
    decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
    void reshard(
      KeyValueDB *db,
      KeyValueDB::Transaction t);

    /// initialize Shards from the onode
    void init_shards(bool loaded, bool dirty);

    /// return index of shard containing offset
    /// or -1 if not found
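    /// e.g. (illustrative): with shard offsets {0x0, 0x8000, 0x10000},
    /// seek_shard(0x9000) == 1 and seek_shard(0x10000) == 2; an offset below
    /// shards[0].offset yields -1.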
    int seek_shard(uint32_t offset) {
      size_t end = shards.size();
      size_t mid, left = 0;
      size_t right = end;  // one past the right end

      while (left < right) {
        mid = left + (right - left) / 2;
        if (offset >= shards[mid].shard_info->offset) {
          size_t next = mid + 1;
          if (next >= end || offset < shards[next].shard_info->offset)
            return mid;
          // continue to search forwards
          left = next;
        } else {
          // continue to search backwards
          right = mid;
        }
      }

      return -1;  // not found
    }

    /// check if a range spans a shard
    bool spans_shard(uint32_t offset, uint32_t length) {
      if (shards.empty()) {
        return false;
      }
      int s = seek_shard(offset);
      ceph_assert(s >= 0);
      if (s == (int)shards.size() - 1) {
        return false;  // last shard
      }
      if (offset + length <= shards[s+1].shard_info->offset) {
        return false;
      }
      return true;
    }

    /// ensure that a range of the map is loaded
    void fault_range(KeyValueDB *db,
                     uint32_t offset, uint32_t length);

    /// ensure a range of the map is marked dirty
    void dirty_range(uint32_t offset, uint32_t length);

    /// for seek_lextent test
    extent_map_t::iterator find(uint64_t offset);

    /// seek to the first lextent including or after offset
    extent_map_t::iterator seek_lextent(uint64_t offset);
    extent_map_t::const_iterator seek_lextent(uint64_t offset) const;

    /// add a new Extent
    void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
      extent_map.insert(*new Extent(lo, o, l, b));
    }

    /// remove (and delete) an Extent
    void rm(extent_map_t::iterator p) {
      extent_map.erase_and_dispose(p, DeleteDisposer());
    }

    bool has_any_lextents(uint64_t offset, uint64_t length);

    /// consolidate adjacent lextents in extent_map
    int compress_extent_map(uint64_t offset, uint64_t length);

    /// punch a logical hole. add lextents to deref to target list.
    void punch_hole(CollectionRef &c,
                    uint64_t offset, uint64_t length,
                    old_extent_map_t *old_extents);

    /// put new lextent into lextent_map overwriting existing ones if
    /// any and update references accordingly
    Extent *set_lextent(CollectionRef &c,
                        uint64_t logical_offset,
                        uint64_t offset, uint64_t length,
                        BlobRef b,
                        old_extent_map_t *old_extents);

    /// split a blob (and referring extents)
    BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
  };

  /// Compressed Blob Garbage collector
  /*
    The primary idea of the collector is to estimate the difference between
    the allocation units (AUs) currently occupied by compressed blobs and the
    new AUs required to store the same data uncompressed.
    The estimation is performed for protrusive extents within a logical range
    determined by the concatenation of the old_extents collection and the
    specific (current) write request.
    The root cause for using old_extents is the need to handle blob ref counts
    properly. Old extents still hold blob refs, hence we need to traverse the
    collection to determine whether a blob is to be released.
    Protrusive extents are extents that fit into the blob set in action
    (ones that are below the logical range from above) but are not removed
    totally due to the current write.
    E.g. for
      extent1 <loffs = 100, boffs = 100, len = 100> ->
        blob1<compressed, len_on_disk=4096, logical_len=8192>
      extent2 <loffs = 200, boffs = 200, len = 100> ->
        blob2<raw, len_on_disk=4096, llen=4096>
      extent3 <loffs = 300, boffs = 300, len = 100> ->
        blob1<compressed, len_on_disk=4096, llen=8192>
      extent4 <loffs = 4096, boffs = 0, len = 100> ->
        blob3<raw, len_on_disk=4096, llen=4096>
    and write(300~100),
    the protrusive extents are within the following ranges: <0~300, 400~8192-400>.
    In this case the existing AUs that might be removed due to GC (i.e. blob1)
    use 2x4K bytes, and the new AUs expected after GC = 0, since extent1 is to
    be merged into blob2.
    Hence we should do a collect.
  */
  class GarbageCollector
  {
  public:
    /// return amount of allocation units that might be saved due to GC
    int64_t estimate(
      uint64_t offset,
      uint64_t length,
      const ExtentMap& extent_map,
      const old_extent_map_t& old_extents,
      uint64_t min_alloc_size);

    /// return a collection of extents to perform GC on
    const interval_set<uint64_t>& get_extents_to_collect() const {
      return extents_to_collect;
    }
    GarbageCollector(CephContext* _cct) : cct(_cct) {}

  private:
    struct BlobInfo {
      uint64_t referenced_bytes = 0;     ///< amount of bytes referenced in blob
      int64_t expected_allocations = 0;  ///< new alloc units required
                                         ///< in case of gc fulfilled
      bool collect_candidate = false;    ///< indicate if blob has any extents
                                         ///< eligible for GC.
      extent_map_t::const_iterator first_lextent;  ///< points to the first
                                                   ///< lextent referring to
                                                   ///< the blob if any.
                                                   ///< collect_candidate flag
                                                   ///< determines the validity
      extent_map_t::const_iterator last_lextent;   ///< points to the last
                                                   ///< lextent referring to
                                                   ///< the blob if any.

      BlobInfo(uint64_t ref_bytes) :
        referenced_bytes(ref_bytes) {
      }
    };
    CephContext* cct;
    map<Blob*, BlobInfo> affected_blobs;  ///< compressed blobs and their ref_map
                                          ///< copies that are affected by the
                                          ///< specific write

    ///< protrusive extents that should be collected if GC takes place
    interval_set<uint64_t> extents_to_collect;

    boost::optional<uint64_t > used_alloc_unit;  ///< last processed allocation
                                                 ///< unit when traversing
                                                 ///< protrusive extents.
                                                 ///< Other extents mapped to
                                                 ///< this AU to be ignored
                                                 ///< (except the case where an
                                                 ///< uncompressed extent follows
                                                 ///< a compressed one - see below).
    BlobInfo* blob_info_counted = nullptr;  ///< set if the previous allocation unit
                                            ///< caused an expected_allocations
                                            ///< counter increment at this blob.
                                            ///< if an uncompressed extent follows,
                                            ///< a decrement for the
                                            ///< expected_allocations counter
                                            ///< is needed
    int64_t expected_allocations = 0;       ///< new alloc units required in case
                                            ///< of gc fulfilled
    int64_t expected_for_release = 0;       ///< alloc units currently used by
                                            ///< compressed blobs that might be
                                            ///< gone after GC

  protected:
    void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
                                    uint64_t start_offset,
                                    uint64_t end_offset,
                                    uint64_t start_touch_offset,
                                    uint64_t end_touch_offset,
                                    uint64_t min_alloc_size);
  };

  struct OnodeSpace;

  /// an in-memory object
  struct Onode {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref;  ///< reference count
    Collection *c;

    ghobject_t oid;

    /// key under PREFIX_OBJ where we are stored
    mempool::bluestore_cache_other::string key;

    boost::intrusive::list_member_hook<> lru_item;

    bluestore_onode_t onode;  ///< metadata stored as value in kv store
    bool exists;              ///< true if object logically exists

    ExtentMap extent_map;

    // track txc's that have not been committed to kv store (and whose
    // effects cannot be read via the kvdb read methods)
    std::atomic<int> flushing_count = {0};
    /// protect flush_txns
    ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
    ceph::condition_variable flush_cond;  ///< wait here for uncommitted txns

    Onode(Collection *c, const ghobject_t& o,
          const mempool::bluestore_cache_other::string& k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        extent_map(this) {
    }
    Onode(Collection* c, const ghobject_t& o,
          const string& k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        extent_map(this) {
    }
    Onode(Collection* c, const ghobject_t& o,
          const char* k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        extent_map(this) {
    }

    static Onode* decode(
      CollectionRef c,
      const ghobject_t& oid,
      const string& key,
      const bufferlist& v);

    void flush();
    void get() {
      ++nref;
    }
    void put() {
      if (--nref == 0)
        delete this;
    }
  };
  typedef boost::intrusive_ptr<Onode> OnodeRef;


  /// a cache (shard) of onodes and buffers
  struct Cache {
    CephContext* cct;
    PerfCounters *logger;

    /// protect lru and other structures
    ceph::recursive_mutex lock = {
      ceph::make_recursive_mutex("BlueStore::Cache::lock") };

    std::atomic<uint64_t> num_extents = {0};
    std::atomic<uint64_t> num_blobs = {0};

    static Cache *create(CephContext* cct, string type, PerfCounters *logger);

    Cache(CephContext* cct) : cct(cct), logger(nullptr) {}
    virtual ~Cache() {}

    virtual void _add_onode(OnodeRef& o, int level) = 0;
    virtual void _rm_onode(OnodeRef& o) = 0;
    virtual void _touch_onode(OnodeRef& o) = 0;

    virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0;
    virtual void _rm_buffer(Buffer *b) = 0;
    virtual void _move_buffer(Cache *src, Buffer *b) = 0;
    virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
    virtual void _touch_buffer(Buffer *b) = 0;

    virtual uint64_t _get_num_onodes() = 0;
    virtual uint64_t _get_buffer_bytes() = 0;

    void add_extent() {
      ++num_extents;
    }
    void rm_extent() {
      --num_extents;
    }

    void add_blob() {
      ++num_blobs;
    }
    void rm_blob() {
      --num_blobs;
    }

    void trim(uint64_t onode_max, uint64_t buffer_max);

    void trim_all();

    virtual void _trim(uint64_t onode_max, uint64_t buffer_max) = 0;

    virtual void add_stats(uint64_t *onodes, uint64_t *extents,
                           uint64_t *blobs,
                           uint64_t *buffers,
                           uint64_t *bytes) = 0;

    bool empty() {
      std::lock_guard l(lock);
      return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
    }

#ifdef DEBUG_CACHE
    virtual void _audit(const char *s) = 0;
#else
    void _audit(const char *s) { /* no-op */ }
#endif
  };

  /// simple LRU cache for onodes and buffers
  struct LRUCache : public Cache {
  private:
    typedef boost::intrusive::list<
      Onode,
      boost::intrusive::member_hook<
        Onode,
        boost::intrusive::list_member_hook<>,
        &Onode::lru_item> > onode_lru_list_t;
    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
        Buffer,
        boost::intrusive::list_member_hook<>,
        &Buffer::lru_item> > buffer_lru_list_t;

    onode_lru_list_t onode_lru;

    buffer_lru_list_t buffer_lru;
    uint64_t buffer_size = 0;

  public:
    LRUCache(CephContext* cct) : Cache(cct) {}
    uint64_t _get_num_onodes() override {
      return onode_lru.size();
    }
    void _add_onode(OnodeRef& o, int level) override {
      if (level > 0)
        onode_lru.push_front(*o);
      else
        onode_lru.push_back(*o);
    }
    void _rm_onode(OnodeRef& o) override {
      auto q = onode_lru.iterator_to(*o);
      onode_lru.erase(q);
    }
    void _touch_onode(OnodeRef& o) override;

    uint64_t _get_buffer_bytes() override {
      return buffer_size;
    }
    void _add_buffer(Buffer *b, int level, Buffer *near) override {
      if (near) {
        auto q = buffer_lru.iterator_to(*near);
        buffer_lru.insert(q, *b);
      } else if (level > 0) {
        buffer_lru.push_front(*b);
      } else {
        buffer_lru.push_back(*b);
      }
      buffer_size += b->length;
    }
    void _rm_buffer(Buffer *b) override {
      ceph_assert(buffer_size >= b->length);
      buffer_size -= b->length;
      auto q = buffer_lru.iterator_to(*b);
      buffer_lru.erase(q);
    }
    void _move_buffer(Cache *src, Buffer *b) override {
      src->_rm_buffer(b);
      _add_buffer(b, 0, nullptr);
    }
    void _adjust_buffer_size(Buffer *b, int64_t delta) override {
      ceph_assert((int64_t)buffer_size + delta >= 0);
      buffer_size += delta;
    }
    void _touch_buffer(Buffer *b) override {
      auto p = buffer_lru.iterator_to(*b);
      buffer_lru.erase(p);
      buffer_lru.push_front(*b);
      _audit("_touch_buffer end");
    }

    void _trim(uint64_t onode_max, uint64_t buffer_max) override;

    void add_stats(uint64_t *onodes, uint64_t *extents,
                   uint64_t *blobs,
                   uint64_t *buffers,
                   uint64_t *bytes) override {
      std::lock_guard l(lock);
      *onodes += onode_lru.size();
      *extents += num_extents;
      *blobs += num_blobs;
      *buffers += buffer_lru.size();
      *bytes += buffer_size;
    }

#ifdef DEBUG_CACHE
    void _audit(const char *s) override;
#endif
  };

  // 2Q cache for buffers, LRU for onodes
  struct TwoQCache : public Cache {
  private:
    // stick with LRU for onodes for now (fixme?)
    typedef boost::intrusive::list<
      Onode,
      boost::intrusive::member_hook<
        Onode,
        boost::intrusive::list_member_hook<>,
        &Onode::lru_item> > onode_lru_list_t;
    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
        Buffer,
        boost::intrusive::list_member_hook<>,
        &Buffer::lru_item> > buffer_list_t;

    onode_lru_list_t onode_lru;

    buffer_list_t buffer_hot;       ///< "Am" hot buffers
    buffer_list_t buffer_warm_in;   ///< "A1in" newly warm buffers
    buffer_list_t buffer_warm_out;  ///< "A1out" empty buffers we've evicted
    uint64_t buffer_bytes = 0;      ///< bytes

    enum {
      BUFFER_NEW = 0,
      BUFFER_WARM_IN,   ///< in buffer_warm_in
      BUFFER_WARM_OUT,  ///< in buffer_warm_out
      BUFFER_HOT,       ///< in buffer_hot
      BUFFER_TYPE_MAX
    };

    uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0};  ///< bytes per type
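
    // Rough sketch of the 2Q scheme as reflected by the lists above: new
    // buffers land in buffer_warm_in (A1in); buffers evicted from A1in are
    // remembered as empty entries in buffer_warm_out (A1out); a buffer seen
    // again while in A1out is promoted to buffer_hot (Am). The promotion and
    // eviction details are implemented out of line (see _add_buffer/_trim).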

  public:
    TwoQCache(CephContext* cct) : Cache(cct) {}
    uint64_t _get_num_onodes() override {
      return onode_lru.size();
    }
    void _add_onode(OnodeRef& o, int level) override {
      if (level > 0)
        onode_lru.push_front(*o);
      else
        onode_lru.push_back(*o);
    }
    void _rm_onode(OnodeRef& o) override {
      auto q = onode_lru.iterator_to(*o);
      onode_lru.erase(q);
    }
    void _touch_onode(OnodeRef& o) override;

    uint64_t _get_buffer_bytes() override {
      return buffer_bytes;
    }
    void _add_buffer(Buffer *b, int level, Buffer *near) override;
    void _rm_buffer(Buffer *b) override;
    void _move_buffer(Cache *src, Buffer *b) override;
    void _adjust_buffer_size(Buffer *b, int64_t delta) override;
    void _touch_buffer(Buffer *b) override {
      switch (b->cache_private) {
      case BUFFER_WARM_IN:
        // do nothing (somewhat counter-intuitively!)
        break;
      case BUFFER_WARM_OUT:
        // move from warm_out to hot LRU
        ceph_abort_msg("this happens via discard hint");
        break;
      case BUFFER_HOT:
        // move to front of hot LRU
        buffer_hot.erase(buffer_hot.iterator_to(*b));
        buffer_hot.push_front(*b);
        break;
      }
      _audit("_touch_buffer end");
    }

    void _trim(uint64_t onode_max, uint64_t buffer_max) override;

    void add_stats(uint64_t *onodes, uint64_t *extents,
                   uint64_t *blobs,
                   uint64_t *buffers,
                   uint64_t *bytes) override {
      std::lock_guard l(lock);
      *onodes += onode_lru.size();
      *extents += num_extents;
      *blobs += num_blobs;
      *buffers += buffer_hot.size() + buffer_warm_in.size();
      *bytes += buffer_bytes;
    }

#ifdef DEBUG_CACHE
    void _audit(const char *s) override;
#endif
  };

  struct OnodeSpace {
  private:
    Cache *cache;

    /// forward lookups
    mempool::bluestore_cache_other::unordered_map<ghobject_t,OnodeRef> onode_map;

    friend class Collection;  // for split_cache()

  public:
    OnodeSpace(Cache *c) : cache(c) {}
    ~OnodeSpace() {
      clear();
    }

    OnodeRef add(const ghobject_t& oid, OnodeRef o);
    OnodeRef lookup(const ghobject_t& o);
    void remove(const ghobject_t& oid) {
      onode_map.erase(oid);
    }
    void rename(OnodeRef& o, const ghobject_t& old_oid,
                const ghobject_t& new_oid,
                const mempool::bluestore_cache_other::string& new_okey);
    void clear();
    bool empty();

    template <int LogLevelV>
    void dump(CephContext *cct);

    /// return true if f true for any item
    bool map_any(std::function<bool(OnodeRef)> f);
  };

  class OpSequencer;
  typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;

  struct Collection : public CollectionImpl {
    BlueStore *store;
    OpSequencerRef osr;
    Cache *cache;  ///< our cache shard
    bluestore_cnode_t cnode;
    RWLock lock;

    bool exists;

    SharedBlobSet shared_blob_set;  ///< open SharedBlobs

    // cache onodes on a per-collection basis to avoid lock
    // contention.
    OnodeSpace onode_map;

    // pool options
    pool_opts_t pool_opts;
    ContextQueue *commit_queue;

    OnodeRef get_onode(const ghobject_t& oid, bool create);

    // the terminology is confusing here, sorry!
    //
    //  blob_t     shared_blob_t
    //  !shared    unused                -> open
    //  shared     !loaded               -> open + shared
    //  shared     loaded                -> open + shared + loaded
    //
    // i.e.,
    //  open = SharedBlob is instantiated
    //  shared = blob_t shared flag is set; SharedBlob is hashed.
    //  loaded = SharedBlob::shared_blob_t is loaded from kv store
    void open_shared_blob(uint64_t sbid, BlobRef b);
    void load_shared_blob(SharedBlobRef sb);
    void make_blob_shared(uint64_t sbid, BlobRef b);
    uint64_t make_blob_unshared(SharedBlob *sb);

    BlobRef new_blob() {
      BlobRef b = new Blob();
      b->shared_blob = new SharedBlob(this);
      return b;
    }

    bool contains(const ghobject_t& oid) {
      if (cid.is_meta())
        return oid.hobj.pool == -1;
      spg_t spgid;
      if (cid.is_pg(&spgid))
        return
          spgid.pgid.contains(cnode.bits, oid) &&
          oid.shard_id == spgid.shard;
      return false;
    }

    void split_cache(Collection *dest);

    bool flush_commit(Context *c) override;
    void flush() override;
    void flush_all_but_last();

    Collection(BlueStore *ns, Cache *ca, coll_t c);
  };

  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
    CollectionRef c;
    OnodeRef o;
    KeyValueDB::Iterator it;
    string head, tail;

    string _stringify() const;

  public:
    OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
    int seek_to_first() override;
    int upper_bound(const string &after) override;
    int lower_bound(const string &to) override;
    bool valid() override;
    int next() override;
    string key() override;
    bufferlist value() override;
    int status() override {
      return 0;
    }
  };

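  // In-memory statfs counters. As a sketch of how these are used: each
  // TransContext accumulates a statfs_delta of these values while it executes,
  // deltas are summed with operator+=, and the totals are exported to a
  // store_statfs_t via publish().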
  struct volatile_statfs {
    enum {
      STATFS_ALLOCATED = 0,
      STATFS_STORED,
      STATFS_COMPRESSED_ORIGINAL,
      STATFS_COMPRESSED,
      STATFS_COMPRESSED_ALLOCATED,
      STATFS_LAST
    };
    int64_t values[STATFS_LAST];
    volatile_statfs() {
      memset(this, 0, sizeof(volatile_statfs));
    }
    void reset() {
      *this = volatile_statfs();
    }
    void publish(store_statfs_t* buf) const {
      buf->allocated = allocated();
      buf->data_stored = stored();
      buf->data_compressed = compressed();
      buf->data_compressed_original = compressed_original();
      buf->data_compressed_allocated = compressed_allocated();
    }

    volatile_statfs& operator+=(const volatile_statfs& other) {
      for (size_t i = 0; i < STATFS_LAST; ++i) {
        values[i] += other.values[i];
      }
      return *this;
    }
    int64_t& allocated() {
      return values[STATFS_ALLOCATED];
    }
    int64_t& stored() {
      return values[STATFS_STORED];
    }
    int64_t& compressed_original() {
      return values[STATFS_COMPRESSED_ORIGINAL];
    }
    int64_t& compressed() {
      return values[STATFS_COMPRESSED];
    }
    int64_t& compressed_allocated() {
      return values[STATFS_COMPRESSED_ALLOCATED];
    }
    int64_t allocated() const {
      return values[STATFS_ALLOCATED];
    }
    int64_t stored() const {
      return values[STATFS_STORED];
    }
    int64_t compressed_original() const {
      return values[STATFS_COMPRESSED_ORIGINAL];
    }
    int64_t compressed() const {
      return values[STATFS_COMPRESSED];
    }
    int64_t compressed_allocated() const {
      return values[STATFS_COMPRESSED_ALLOCATED];
    }
    volatile_statfs& operator=(const store_statfs_t& st) {
      values[STATFS_ALLOCATED] = st.allocated;
      values[STATFS_STORED] = st.data_stored;
      values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
      values[STATFS_COMPRESSED] = st.data_compressed;
      values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
      return *this;
    }
    bool is_empty() {
      return values[STATFS_ALLOCATED] == 0 &&
        values[STATFS_STORED] == 0 &&
        values[STATFS_COMPRESSED] == 0 &&
        values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
        values[STATFS_COMPRESSED_ALLOCATED] == 0;
    }
    void decode(bufferlist::const_iterator& it) {
      using ceph::decode;
      for (size_t i = 0; i < STATFS_LAST; i++) {
        decode(values[i], it);
      }
    }

    void encode(bufferlist& bl) {
      using ceph::encode;
      for (size_t i = 0; i < STATFS_LAST; i++) {
        encode(values[i], bl);
      }
    }
  };

  struct TransContext final : public AioContext {
    MEMPOOL_CLASS_HELPERS();

    typedef enum {
      STATE_PREPARE,
      STATE_AIO_WAIT,
      STATE_IO_DONE,
      STATE_KV_QUEUED,         // queued for kv_sync_thread submission
      STATE_KV_SUBMITTED,      // submitted to kv; not yet synced
      STATE_KV_DONE,
      STATE_DEFERRED_QUEUED,   // in deferred_queue (pending or running)
      STATE_DEFERRED_CLEANUP,  // remove deferred kv record
      STATE_DEFERRED_DONE,
      STATE_FINISHING,
      STATE_DONE,
    } state_t;
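    // A rough sketch of the usual progression (mirroring the enum order and
    // the l_bluestore_state_*_lat counters above): PREPARE -> AIO_WAIT ->
    // IO_DONE -> KV_QUEUED -> KV_SUBMITTED -> KV_DONE, then optionally the
    // DEFERRED_* states for deferred writes, and finally FINISHING -> DONE.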

    state_t state = STATE_PREPARE;

    const char *get_state_name() {
      switch (state) {
      case STATE_PREPARE: return "prepare";
      case STATE_AIO_WAIT: return "aio_wait";
      case STATE_IO_DONE: return "io_done";
      case STATE_KV_QUEUED: return "kv_queued";
      case STATE_KV_SUBMITTED: return "kv_submitted";
      case STATE_KV_DONE: return "kv_done";
      case STATE_DEFERRED_QUEUED: return "deferred_queued";
      case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
      case STATE_DEFERRED_DONE: return "deferred_done";
      case STATE_FINISHING: return "finishing";
      case STATE_DONE: return "done";
      }
      return "???";
    }

#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
    const char *get_state_latency_name(int state) {
      switch (state) {
      case l_bluestore_state_prepare_lat: return "prepare";
      case l_bluestore_state_aio_wait_lat: return "aio_wait";
      case l_bluestore_state_io_done_lat: return "io_done";
      case l_bluestore_state_kv_queued_lat: return "kv_queued";
      case l_bluestore_state_kv_committing_lat: return "kv_committing";
      case l_bluestore_state_kv_done_lat: return "kv_done";
      case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
      case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
      case l_bluestore_state_finishing_lat: return "finishing";
      case l_bluestore_state_done_lat: return "done";
      }
      return "???";
    }
#endif

    utime_t log_state_latency(PerfCounters *logger, int state) {
      utime_t lat, now = ceph_clock_now();
      lat = now - last_stamp;
      logger->tinc(state, lat);
#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
      if (state >= l_bluestore_state_prepare_lat && state <= l_bluestore_state_done_lat) {
        double usecs = (now.to_nsec()-last_stamp.to_nsec())/1000;
        OID_ELAPSED("", usecs, get_state_latency_name(state));
      }
#endif
      last_stamp = now;
      return lat;
    }

    CollectionRef ch;
    OpSequencerRef osr;  // this should be ch->osr
    boost::intrusive::list_member_hook<> sequencer_item;

    uint64_t bytes = 0, cost = 0;

    set<OnodeRef> onodes;             ///< these need to be updated/written
    set<OnodeRef> modified_objects;   ///< objects we modified (and need a ref)
    set<SharedBlobRef> shared_blobs;  ///< these need to be updated/written
    set<SharedBlobRef> shared_blobs_written;  ///< update these on io completion

    KeyValueDB::Transaction t;  ///< then we will commit this
    list<Context*> oncommits;   ///< more commit completions
    list<CollectionRef> removed_collections;  ///< colls we removed

    boost::intrusive::list_member_hook<> deferred_queue_item;
    bluestore_deferred_transaction_t *deferred_txn = nullptr;  ///< if any

    interval_set<uint64_t> allocated, released;
    volatile_statfs statfs_delta;         ///< overall store statistics delta
    uint64_t osd_pool_id = META_POOL_ID;  ///< osd pool id we're operating on

    IOContext ioc;
    bool had_ios = false;  ///< true if we submitted IOs before our kv txn

    uint64_t seq = 0;
    utime_t start;
    utime_t last_stamp;

    uint64_t last_nid = 0;     ///< if non-zero, highest new nid we allocated
    uint64_t last_blobid = 0;  ///< if non-zero, highest new blobid we allocated

    explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
                          list<Context*> *on_commits)
      : ch(c),
        osr(o),
        ioc(cct, this),
        start(ceph_clock_now()) {
      last_stamp = start;
      if (on_commits) {
        oncommits.swap(*on_commits);
      }
    }
    ~TransContext() {
      delete deferred_txn;
    }

    void write_onode(OnodeRef &o) {
      onodes.insert(o);
    }
    void write_shared_blob(SharedBlobRef &sb) {
      shared_blobs.insert(sb);
    }
    void unshare_blob(SharedBlob *sb) {
      shared_blobs.erase(sb);
    }

    /// note we logically modified object (when onode itself is unmodified)
    void note_modified_object(OnodeRef &o) {
      // onode itself isn't written, though
      modified_objects.insert(o);
    }
    void note_removed_object(OnodeRef& o) {
      onodes.erase(o);
      modified_objects.insert(o);
    }

    void aio_finish(BlueStore *store) override {
      store->txc_aio_finish(this);
    }
  };

  typedef boost::intrusive::list<
    TransContext,
    boost::intrusive::member_hook<
      TransContext,
      boost::intrusive::list_member_hook<>,
      &TransContext::deferred_queue_item> > deferred_queue_t;

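  /// One batch of deferred ios for an OpSequencer. As a sketch (based on the
  /// fields below): prepare_write() records each io in iomap keyed by disk
  /// offset, seq_bytes tracks how many bytes each deferred transaction still
  /// has outstanding, and the whole batch is submitted through a single
  /// IOContext.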
11fdf7f2 1718 struct DeferredBatch final : public AioContext {
7c673cae
FG
1719 OpSequencer *osr;
1720 struct deferred_io {
1721 bufferlist bl; ///< data
1722 uint64_t seq; ///< deferred transaction seq
1723 };
1724 map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
1725 deferred_queue_t txcs; ///< txcs in this batch
1726 IOContext ioc; ///< our aios
1727 /// bytes of pending io for each deferred seq (may be 0)
1728 map<uint64_t,int> seq_bytes;
1729
1730 void _discard(CephContext *cct, uint64_t offset, uint64_t length);
1731 void _audit(CephContext *cct);
1732
1733 DeferredBatch(CephContext *cct, OpSequencer *osr)
1734 : osr(osr), ioc(cct, this) {}
1735
1736 /// prepare a write
1737 void prepare_write(CephContext *cct,
1738 uint64_t seq, uint64_t offset, uint64_t length,
1739 bufferlist::const_iterator& p);
1740
1741 void aio_finish(BlueStore *store) override {
1742 store->_deferred_aio_finish(osr);
1743 }
1744 };
1745
11fdf7f2 1746 class OpSequencer : public RefCountedObject {
7c673cae 1747 public:
11fdf7f2
TL
1748 ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
1749 ceph::condition_variable qcond;
7c673cae
FG
1750 typedef boost::intrusive::list<
1751 TransContext,
1752 boost::intrusive::member_hook<
1753 TransContext,
1754 boost::intrusive::list_member_hook<>,
1755 &TransContext::sequencer_item> > q_list_t;
1756 q_list_t q; ///< transactions
1757
1758 boost::intrusive::list_member_hook<> deferred_osr_queue_item;
1759
1760 DeferredBatch *deferred_running = nullptr;
1761 DeferredBatch *deferred_pending = nullptr;
1762
7c673cae 1763 BlueStore *store;
11fdf7f2 1764 coll_t cid;
7c673cae
FG
1765
1766 uint64_t last_seq = 0;
1767
1768 std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
1769
1770 std::atomic_int kv_committing_serially = {0};
1771
1772 std::atomic_int kv_submitted_waiters = {0};
1773
11fdf7f2 1774 std::atomic_bool zombie = {false}; ///< in zombie_osr set (collection going away)
7c673cae 1775
11fdf7f2
TL
1776 OpSequencer(BlueStore *store, const coll_t& c)
1777 : RefCountedObject(store->cct, 0),
1778 store(store), cid(c) {
7c673cae 1779 }
11fdf7f2
TL
1780 ~OpSequencer() {
1781 ceph_assert(q.empty());
7c673cae
FG
1782 }
1783
1784 void queue_new(TransContext *txc) {
11fdf7f2 1785 std::lock_guard l(qlock);
7c673cae
FG
1786 txc->seq = ++last_seq;
1787 q.push_back(*txc);
1788 }
1789
1790 void drain() {
11fdf7f2 1791 std::unique_lock l(qlock);
7c673cae
FG
1792 while (!q.empty())
1793 qcond.wait(l);
1794 }
1795
1796 void drain_preceding(TransContext *txc) {
11fdf7f2 1797 std::unique_lock l(qlock);
7c673cae
FG
1798 while (!q.empty() && &q.front() != txc)
1799 qcond.wait(l);
1800 }
1801
1802 bool _is_all_kv_submitted() {
11fdf7f2
TL
1803 // caller must hold qlock & q.empty() must not empty
1804 ceph_assert(!q.empty());
7c673cae
FG
1805 TransContext *txc = &q.back();
1806 if (txc->state >= TransContext::STATE_KV_SUBMITTED) {
1807 return true;
1808 }
1809 return false;
1810 }
1811
11fdf7f2
TL
1812 void flush() {
1813 std::unique_lock l(qlock);
1814 while (true) {
1815 // set flag before the check because the condition
1816 // may become true outside qlock, and we need to make
1817 // sure those threads see waiters and signal qcond.
1818 ++kv_submitted_waiters;
1819 if (q.empty() || _is_all_kv_submitted()) {
1820 --kv_submitted_waiters;
1821 return;
1822 }
1823 qcond.wait(l);
1824 --kv_submitted_waiters;
1825 }
1826 }
1827
1828 void flush_all_but_last() {
1829 std::unique_lock l(qlock);
1830 ceph_assert(q.size() >= 1);
7c673cae
FG
1831 while (true) {
1832 // set flag before the check because the condition
1833 // may become true outside qlock, and we need to make
1834 // sure those threads see waiters and signal qcond.
1835 ++kv_submitted_waiters;
11fdf7f2
TL
1836 if (q.size() <= 1) {
1837 --kv_submitted_waiters;
7c673cae 1838 return;
11fdf7f2
TL
1839 } else {
1840 auto it = q.rbegin();
1841 it++;
1842 if (it->state >= TransContext::STATE_KV_SUBMITTED) {
eafe8130 1843 --kv_submitted_waiters;
11fdf7f2
TL
1844 return;
1845 }
7c673cae
FG
1846 }
1847 qcond.wait(l);
1848 --kv_submitted_waiters;
1849 }
1850 }
1851
11fdf7f2
TL
1852 bool flush_commit(Context *c) {
1853 std::lock_guard l(qlock);
7c673cae
FG
1854 if (q.empty()) {
1855 return true;
1856 }
1857 TransContext *txc = &q.back();
1858 if (txc->state >= TransContext::STATE_KV_DONE) {
1859 return true;
1860 }
1861 txc->oncommits.push_back(c);
1862 return false;
1863 }
1864 };
1865
1866 typedef boost::intrusive::list<
1867 OpSequencer,
1868 boost::intrusive::member_hook<
1869 OpSequencer,
1870 boost::intrusive::list_member_hook<>,
1871 &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
1872
1873 struct KVSyncThread : public Thread {
1874 BlueStore *store;
1875 explicit KVSyncThread(BlueStore *s) : store(s) {}
1876 void *entry() override {
1877 store->_kv_sync_thread();
1878 return NULL;
1879 }
1880 };
31f18b77
FG
1881 struct KVFinalizeThread : public Thread {
1882 BlueStore *store;
1883 explicit KVFinalizeThread(BlueStore *s) : store(s) {}
1884 void *entry() override {
1885 store->_kv_finalize_thread();
1886 return NULL;
1887 }
1888 };
7c673cae
FG
1889
1890 struct DBHistogram {
1891 struct value_dist {
1892 uint64_t count;
1893 uint32_t max_len;
1894 };
1895
1896 struct key_dist {
1897 uint64_t count;
1898 uint32_t max_len;
1899 map<int, struct value_dist> val_map; ///< slab id to count and max length of the values in that slab
1900 };
1901
1902 map<string, map<int, struct key_dist> > key_hist;
1903 map<int, uint64_t> value_hist;
1904 int get_key_slab(size_t sz);
1905 string get_key_slab_to_range(int slab);
1906 int get_value_slab(size_t sz);
1907 string get_value_slab_to_range(int slab);
1908 void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
1909 const string &prefix, size_t key_size, size_t value_size);
1910 void dump(Formatter *f);
1911 };
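  // A minimal usage sketch for DBHistogram (names such as prefix, key_size,
  // value_size and f are placeholders; the real driver is expected to be
  // generate_db_histogram() further below):
  //
  //   DBHistogram hist;
  //   // for each key/value pair observed under a given column-family prefix:
  //   hist.update_hist_entry(hist.key_hist, prefix, key_size, value_size);
  //   hist.dump(f);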
1912
1913 // --------------------------------------------------------
1914 // members
1915private:
1916 BlueFS *bluefs = nullptr;
1917 unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
1918 bool bluefs_single_shared_device = true;
11fdf7f2
TL
1919 mono_time bluefs_last_balance;
1920 utime_t next_dump_on_bluefs_alloc_failure;
7c673cae
FG
1921
1922 KeyValueDB *db = nullptr;
1923 BlockDevice *bdev = nullptr;
1924 std::string freelist_type;
1925 FreelistManager *fm = nullptr;
1926 Allocator *alloc = nullptr;
1927 uuid_d fsid;
1928 int path_fd = -1; ///< open handle to $path
1929 int fsid_fd = -1; ///< open handle (locked) to $path/fsid
1930 bool mounted = false;
1931
1932 RWLock coll_lock = {"BlueStore::coll_lock"}; ///< rwlock to protect coll_map
31f18b77 1933 mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
eafe8130 1934 bool collections_had_errors = false;
11fdf7f2 1935 map<coll_t,CollectionRef> new_coll_map;
7c673cae
FG
1936
1937 vector<Cache*> cache_shards;
1938
11fdf7f2
TL
1939 /// protect zombie_osr_set
1940 ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
1941 std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections
7c673cae
FG
1942
1943 std::atomic<uint64_t> nid_last = {0};
1944 std::atomic<uint64_t> nid_max = {0};
1945 std::atomic<uint64_t> blobid_last = {0};
1946 std::atomic<uint64_t> blobid_max = {0};
1947
1948 Throttle throttle_bytes; ///< submit to commit
1949 Throttle throttle_deferred_bytes; ///< submit to deferred complete
1950
1951 interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
1952 interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
1953
11fdf7f2 1954 ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
7c673cae
FG
1955 std::atomic<uint64_t> deferred_seq = {0};
1956 deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
1957 int deferred_queue_size = 0; ///< num txc's queued across all osrs
1958 atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
11fdf7f2 1959 Finisher deferred_finisher, finisher;
7c673cae
FG
1960
1961 KVSyncThread kv_sync_thread;
11fdf7f2
TL
1962 ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
1963 ceph::condition_variable kv_cond;
3efd9988 1964 bool _kv_only = false;
31f18b77 1965 bool kv_sync_started = false;
7c673cae 1966 bool kv_stop = false;
31f18b77
FG
1967 bool kv_finalize_started = false;
1968 bool kv_finalize_stop = false;
7c673cae
FG
1969 deque<TransContext*> kv_queue; ///< ready, already submitted
1970 deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
1971 deque<TransContext*> kv_committing; ///< currently syncing
1972 deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
7c673cae 1973
31f18b77 1974 KVFinalizeThread kv_finalize_thread;
11fdf7f2
TL
1975 ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
1976 ceph::condition_variable kv_finalize_cond;
31f18b77
FG
1977 deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
1978 deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
1979
7c673cae
FG
1980 PerfCounters *logger = nullptr;
1981
7c673cae
FG
1982 list<CollectionRef> removed_collections;
1983
1984 RWLock debug_read_error_lock = {"BlueStore::debug_read_error_lock"};
1985 set<ghobject_t> debug_data_error_objects;
1986 set<ghobject_t> debug_mdata_error_objects;
1987
1988 std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
1989
1990 uint64_t block_size = 0; ///< block size of block device (power of 2)
1991 uint64_t block_mask = 0; ///< mask to get just the block offset
1992 size_t block_size_order = 0; ///< bits to shift to get block size
1993
1994 uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2)
7c673cae 1995 ///< bits for min_alloc_size
224ce89b 1996 uint8_t min_alloc_size_order = 0;
7c673cae
FG
1997 static_assert(std::numeric_limits<uint8_t>::max() >
1998 std::numeric_limits<decltype(min_alloc_size)>::digits,
1999 "not enough bits for min_alloc_size");
2000
7c673cae
FG
2001 ///< maximum allocation unit (power of 2)
2002 std::atomic<uint64_t> max_alloc_size = {0};
2003
224ce89b
WB
2004 ///< number threshold for forced deferred writes
2005 std::atomic<int> deferred_batch_ops = {0};
2006
2007 ///< size threshold for forced deferred writes
2008 std::atomic<uint64_t> prefer_deferred_size = {0};
2009
7c673cae
FG
2010 ///< approx cost per io, in bytes
2011 std::atomic<uint64_t> throttle_cost_per_io = {0};
2012
224ce89b
WB
2013 std::atomic<Compressor::CompressionMode> comp_mode =
2014 {Compressor::COMP_NONE}; ///< compression mode
7c673cae
FG
2015 CompressorRef compressor;
2016 std::atomic<uint64_t> comp_min_blob_size = {0};
2017 std::atomic<uint64_t> comp_max_blob_size = {0};
2018
2019 std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
2020
31f18b77
FG
2021 uint64_t kv_ios = 0;
2022 uint64_t kv_throttle_costs = 0;
2023
7c673cae 2024 // cache trim control
91327a77
AA
2025 uint64_t cache_size = 0; ///< total cache size
2026 double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
2027 double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
2028 double cache_data_ratio = 0; ///< cache ratio dedicated to object data
2029 bool cache_autotune = false; ///< cache autotune setting
91327a77
AA
2030 double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
2031 uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
2032 uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
2033 double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
11fdf7f2 2034 uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
91327a77 2035 double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
92f5a8d4 2036 std::atomic<uint32_t> config_changed = {0}; ///< Counter to determine if there is a configuration change.
11fdf7f2
TL
2037
2038 typedef map<uint64_t, volatile_statfs> osd_pools_map;
2039
2040 ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
31f18b77 2041 volatile_statfs vstatfs;
11fdf7f2
TL
2042 osd_pools_map osd_pools; // protected by vstatfs_lock as well
2043
2044 bool per_pool_stat_collection = true;
7c673cae
FG
2045
2046 struct MempoolThread : public Thread {
91327a77 2047 public:
7c673cae 2048 BlueStore *store;
91327a77 2049
11fdf7f2
TL
2050 ceph::condition_variable cond;
2051 ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
7c673cae 2052 bool stop = false;
91327a77 2053 uint64_t autotune_cache_size = 0;
11fdf7f2 2054 std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
eafe8130 2055 std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
91327a77
AA
2056
2057 struct MempoolCache : public PriorityCache::PriCache {
2058 BlueStore *store;
11fdf7f2
TL
2059 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
2060 int64_t committed_bytes = 0;
91327a77
AA
2061 double cache_ratio = 0;
2062
2063 MempoolCache(BlueStore *s) : store(s) {};
2064
2065 virtual uint64_t _get_used_bytes() const = 0;
2066
2067 virtual int64_t request_cache_bytes(
11fdf7f2 2068 PriorityCache::Priority pri, uint64_t total_cache) const {
91327a77
AA
2069 int64_t assigned = get_cache_bytes(pri);
2070
2071 switch (pri) {
eafe8130
TL
2072 // All cache items are currently shoved into the PRI1 priority
2073 case PriorityCache::Priority::PRI1:
91327a77 2074 {
11fdf7f2 2075 int64_t request = _get_used_bytes();
91327a77
AA
2076 return (request > assigned) ? request - assigned : 0;
2077 }
2078 default:
2079 break;
2080 }
2081 return -EOPNOTSUPP;
2082 }
2083
2084 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
2085 return cache_bytes[pri];
2086 }
2087 virtual int64_t get_cache_bytes() const {
2088 int64_t total = 0;
2089
2090 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
2091 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
2092 total += get_cache_bytes(pri);
2093 }
2094 return total;
2095 }
2096 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2097 cache_bytes[pri] = bytes;
2098 }
2099 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2100 cache_bytes[pri] += bytes;
2101 }
11fdf7f2
TL
2102 virtual int64_t commit_cache_size(uint64_t total_cache) {
2103 committed_bytes = PriorityCache::get_chunk(
2104 get_cache_bytes(), total_cache);
2105 return committed_bytes;
2106 }
2107 virtual int64_t get_committed_size() const {
2108 return committed_bytes;
91327a77
AA
2109 }
2110 virtual double get_cache_ratio() const {
2111 return cache_ratio;
2112 }
2113 virtual void set_cache_ratio(double ratio) {
2114 cache_ratio = ratio;
2115 }
2116 virtual string get_cache_name() const = 0;
2117 };
2118
2119 struct MetaCache : public MempoolCache {
2120 MetaCache(BlueStore *s) : MempoolCache(s) {};
2121
2122 virtual uint64_t _get_used_bytes() const {
2123 return mempool::bluestore_cache_other::allocated_bytes() +
2124 mempool::bluestore_cache_onode::allocated_bytes();
2125 }
2126
2127 virtual string get_cache_name() const {
2128 return "BlueStore Meta Cache";
2129 }
2130
2131 uint64_t _get_num_onodes() const {
2132 uint64_t onode_num =
2133 mempool::bluestore_cache_onode::allocated_items();
2134 return (2 > onode_num) ? 2 : onode_num;
2135 }
2136
2137 double get_bytes_per_onode() const {
2138 return (double)_get_used_bytes() / (double)_get_num_onodes();
2139 }
11fdf7f2
TL
2140 };
2141 std::shared_ptr<MetaCache> meta_cache;
91327a77
AA
2142
2143 struct DataCache : public MempoolCache {
2144 DataCache(BlueStore *s) : MempoolCache(s) {};
2145
2146 virtual uint64_t _get_used_bytes() const {
2147 uint64_t bytes = 0;
2148 for (auto i : store->cache_shards) {
2149 bytes += i->_get_buffer_bytes();
2150 }
2151 return bytes;
2152 }
2153 virtual string get_cache_name() const {
2154 return "BlueStore Data Cache";
2155 }
11fdf7f2
TL
2156 };
2157 std::shared_ptr<DataCache> data_cache;
91327a77 2158
7c673cae
FG
2159 public:
2160 explicit MempoolThread(BlueStore *s)
2161 : store(s),
11fdf7f2
TL
2162 meta_cache(new MetaCache(s)),
2163 data_cache(new DataCache(s)) {}
91327a77 2164
7c673cae
FG
2165 void *entry() override;
2166 void init() {
11fdf7f2 2167 ceph_assert(stop == false);
7c673cae
FG
2168 create("bstore_mempool");
2169 }
2170 void shutdown() {
11fdf7f2 2171 lock.lock();
7c673cae 2172 stop = true;
11fdf7f2
TL
2173 cond.notify_all();
2174 lock.unlock();
7c673cae
FG
2175 join();
2176 }
91327a77
AA
2177
2178 private:
2179 void _adjust_cache_settings();
2180 void _trim_shards(bool interval_stats);
2181 void _tune_cache_size(bool interval_stats);
11fdf7f2
TL
2182 void _balance_cache(
2183 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches);
2184 void _balance_cache_pri(
2185 int64_t *mem_avail,
2186 const std::list<std::shared_ptr<PriorityCache::PriCache>>& caches,
2187 PriorityCache::Priority pri);
92f5a8d4 2188 void _update_cache_settings();
7c673cae
FG
2189 } mempool_thread;
2190
2191 // --------------------------------------------------------
2192 // private methods
2193
2194 void _init_logger();
2195 void _shutdown_logger();
2196 int _reload_logger();
2197
2198 int _open_path();
2199 void _close_path();
2200 int _open_fsid(bool create);
2201 int _lock_fsid();
2202 int _read_fsid(uuid_d *f);
2203 int _write_fsid();
2204 void _close_fsid();
2205 void _set_alloc_sizes();
2206 void _set_blob_size();
1adf2230 2207 void _set_finisher_num();
92f5a8d4 2208 void _update_osd_memory_options();
7c673cae
FG
2209
2210 int _open_bdev(bool create);
11fdf7f2
TL
2211 // Verifies that disk space is enough for reserved + min bluefs
2212 // and alters the latter if needed.
2213 // Depends on min_alloc_size, hence should be called after
2214 // its initialization (and outside of _open_bdev)
2215 void _validate_bdev();
7c673cae 2216 void _close_bdev();
11fdf7f2
TL
2217
2218 int _minimal_open_bluefs(bool create);
2219 void _minimal_close_bluefs();
2220 int _open_bluefs(bool create);
2221 void _close_bluefs();
2222
2223 // Limited (u)mount intended for BlueFS operations only
2224 int _mount_for_bluefs();
2225 void _umount_for_bluefs();
2226
2227
2228 int _is_bluefs(bool create, bool* ret);
2229 /*
2230 * opens both DB and dependent super_meta, FreelistManager and allocator
2231 * in the proper order
2232 */
2233 int _open_db_and_around(bool read_only);
2234 void _close_db_and_around();
2235
2236 // updates legacy bluefs-related records in the DB to a state valid for
2237 // downgrades from nautilus.
2238 void _sync_bluefs_and_fm();
2239
2240 /*
2241 * @warning to_repair_db means that we open this db to repair it and will
2242 * not hold rocksdb's file lock.
2243 */
2244 int _open_db(bool create,
2245 bool to_repair_db=false,
2246 bool read_only = false);
7c673cae 2247 void _close_db();
11fdf7f2 2248 int _open_fm(KeyValueDB::Transaction t);
7c673cae
FG
2249 void _close_fm();
2250 int _open_alloc();
2251 void _close_alloc();
eafe8130
TL
2252 int _open_collections();
2253 void _fsck_collections(int64_t* errors);
7c673cae
FG
2254 void _close_collections();
2255
2256 int _setup_block_symlink_or_file(string name, string path, uint64_t size,
2257 bool create);
2258
7c673cae 2259public:
3efd9988
FG
2260 static int _write_bdev_label(CephContext* cct,
2261 string path, bluestore_bdev_label_t label);
7c673cae
FG
2262 static int _read_bdev_label(CephContext* cct, string path,
2263 bluestore_bdev_label_t *label);
2264private:
2265 int _check_or_set_bdev_label(string path, uint64_t size, string desc,
2266 bool create);
2267
2268 int _open_super_meta();
2269
224ce89b 2270 void _open_statfs();
11fdf7f2 2271 void _get_statfs_overall(struct store_statfs_t *buf);
31f18b77 2272
11fdf7f2
TL
2273 void _dump_alloc_on_failure();
2274
2275 int64_t _get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total);
2276 int _balance_bluefs_freespace();
7c673cae
FG
2277
2278 CollectionRef _get_collection(const coll_t& cid);
2279 void _queue_reap_collection(CollectionRef& c);
2280 void _reap_collections();
2281 void _update_cache_logger();
2282
2283 void _assign_nid(TransContext *txc, OnodeRef o);
2284 uint64_t _assign_blobid(TransContext *txc);
2285
81eedcae
TL
2286 template <int LogLevelV>
2287 friend void _dump_onode(CephContext *cct, const Onode& o);
2288 template <int LogLevelV>
2289 friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
2290 template <int LogLevelV>
2291 friend void _dump_transaction(CephContext *cct, Transaction *t);
7c673cae 2292
11fdf7f2
TL
2293 TransContext *_txc_create(Collection *c, OpSequencer *osr,
2294 list<Context*> *on_commits);
7c673cae
FG
2295 void _txc_update_store_statfs(TransContext *txc);
2296 void _txc_add_transaction(TransContext *txc, Transaction *t);
2297 void _txc_calc_cost(TransContext *txc);
2298 void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
2299 void _txc_state_proc(TransContext *txc);
2300 void _txc_aio_submit(TransContext *txc);
2301public:
2302 void txc_aio_finish(void *p) {
2303 _txc_state_proc(static_cast<TransContext*>(p));
2304 }
2305private:
2306 void _txc_finish_io(TransContext *txc);
2307 void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
2308 void _txc_applied_kv(TransContext *txc);
2309 void _txc_committed_kv(TransContext *txc);
2310 void _txc_finish(TransContext *txc);
2311 void _txc_release_alloc(TransContext *txc);
2312
11fdf7f2
TL
2313 void _osr_attach(Collection *c);
2314 void _osr_register_zombie(OpSequencer *osr);
2315 void _osr_drain(OpSequencer *osr);
7c673cae
FG
2316 void _osr_drain_preceding(TransContext *txc);
2317 void _osr_drain_all();
7c673cae 2318
31f18b77
FG
2319 void _kv_start();
2320 void _kv_stop();
7c673cae 2321 void _kv_sync_thread();
31f18b77 2322 void _kv_finalize_thread();
7c673cae
FG
2323
2324 bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
2325 void _deferred_queue(TransContext *txc);
3efd9988 2326public:
224ce89b 2327 void deferred_try_submit();
3efd9988 2328private:
224ce89b 2329 void _deferred_submit_unlock(OpSequencer *osr);
7c673cae
FG
2330 void _deferred_aio_finish(OpSequencer *osr);
2331 int _deferred_replay();
2332
2333public:
2334 using mempool_dynamic_bitset =
2335 boost::dynamic_bitset<uint64_t,
2336 mempool::bluestore_fsck::pool_allocator<uint64_t>>;
eafe8130
TL
2337 using per_pool_statfs =
2338 mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;
2339
2340 enum FSCKDepth {
2341 FSCK_REGULAR,
2342 FSCK_DEEP,
2343 FSCK_SHALLOW
2344 };
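  // FSCK_SHALLOW backs quick_fix(); FSCK_REGULAR and FSCK_DEEP back the
  // public fsck()/repair() entry points below, with FSCK_DEEP additionally
  // reading object data and verifying it against the stored checksums.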
7c673cae
FG
2345
2346private:
2347 int _fsck_check_extents(
11fdf7f2 2348 const coll_t& cid,
7c673cae
FG
2349 const ghobject_t& oid,
2350 const PExtentVector& extents,
2351 bool compressed,
2352 mempool_dynamic_bitset &used_blocks,
b32b8144 2353 uint64_t granularity,
11fdf7f2 2354 BlueStoreRepairer* repairer,
eafe8130
TL
2355 store_statfs_t& expected_statfs,
2356 FSCKDepth depth);
7c673cae 2357
11fdf7f2
TL
2358 void _fsck_check_pool_statfs(
2359 per_pool_statfs& expected_pool_statfs,
eafe8130
TL
2360 int64_t& errors,
2361 int64_t &warnings,
11fdf7f2
TL
2362 BlueStoreRepairer* repairer);
2363
eafe8130
TL
2364 int _fsck(FSCKDepth depth, bool repair);
2365 int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
2366
7c673cae
FG
2367 void _buffer_cache_write(
2368 TransContext *txc,
2369 BlobRef b,
2370 uint64_t offset,
2371 bufferlist& bl,
2372 unsigned flags) {
2373 b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
2374 flags);
2375 txc->shared_blobs_written.insert(b->shared_blob);
2376 }
2377
2378 int _collection_list(
2379 Collection *c, const ghobject_t& start, const ghobject_t& end,
2380 int max, vector<ghobject_t> *ls, ghobject_t *next);
2381
2382 template <typename T, typename F>
2383 T select_option(const std::string& opt_name, T val1, F f) {
2384 //NB: opt_name reserved for future use
2385 boost::optional<T> val2 = f();
2386 if (val2) {
2387 return *val2;
2388 }
2389 return val1;
2390 }
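  // Usage sketch for select_option (global_default and per_pool_value are
  // hypothetical names): return the per-pool override when the functor
  // yields one, otherwise fall back to the global default:
  //
  //   uint64_t blob_size = select_option("max_blob_size", global_default,
  //     [&]() -> boost::optional<uint64_t> {
  //       if (per_pool_value > 0) return per_pool_value; // override present
  //       return boost::optional<uint64_t>();            // use the default
  //     });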
2391
2392 void _apply_padding(uint64_t head_pad,
2393 uint64_t tail_pad,
7c673cae
FG
2394 bufferlist& padded);
2395
11fdf7f2
TL
2396 void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
2397
7c673cae
FG
2398 // -- ondisk version ---
2399public:
2400 const int32_t latest_ondisk_format = 2; ///< our version
2401 const int32_t min_readable_ondisk_format = 1; ///< what we can read
2402 const int32_t min_compat_ondisk_format = 2; ///< who can read us
2403
2404private:
2405 int32_t ondisk_format = 0; ///< value detected on mount
2406
2407 int _upgrade_super(); ///< upgrade (called during open_super)
11fdf7f2 2408 uint64_t _get_ondisk_reserved() const;
7c673cae
FG
2409 void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
2410
2411 // --- public interface ---
2412public:
2413 BlueStore(CephContext *cct, const string& path);
2414 BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
2415 ~BlueStore() override;
2416
2417 string get_type() override {
2418 return "bluestore";
2419 }
2420
2421 bool needs_journal() override { return false; };
2422 bool wants_journal() override { return false; };
2423 bool allows_journal() override { return false; };
2424
11fdf7f2
TL
2425 int get_devices(set<string> *ls) override;
2426
31f18b77 2427 bool is_rotational() override;
d2e6a577 2428 bool is_journal_rotational() override;
31f18b77 2429
224ce89b
WB
2430 string get_default_device_class() override {
2431 string device_class;
2432 map<string, string> metadata;
2433 collect_metadata(&metadata);
2434 auto it = metadata.find("bluestore_bdev_type");
2435 if (it != metadata.end()) {
2436 device_class = it->second;
2437 }
2438 return device_class;
2439 }
2440
11fdf7f2
TL
2441 int get_numa_node(
2442 int *numa_node,
2443 set<int> *nodes,
2444 set<string> *failed) override;
2445
7c673cae
FG
2446 static int get_block_device_fsid(CephContext* cct, const string& path,
2447 uuid_d *fsid);
2448
2449 bool test_mount_in_use() override;
2450
2451private:
11fdf7f2 2452 int _mount(bool kv_only, bool open_db=true);
7c673cae
FG
2453public:
2454 int mount() override {
2455 return _mount(false);
2456 }
2457 int umount() override;
2458
11fdf7f2
TL
2459 int start_kv_only(KeyValueDB **pdb, bool open_db=true) {
2460 int r = _mount(true, open_db);
7c673cae
FG
2461 if (r < 0)
2462 return r;
2463 *pdb = db;
2464 return 0;
2465 }
2466
3efd9988
FG
2467 int write_meta(const std::string& key, const std::string& value) override;
2468 int read_meta(const std::string& key, std::string *value) override;
2469
eafe8130
TL
2470 int cold_open();
2471 int cold_close();
3efd9988
FG
2472
2473 int fsck(bool deep) override {
eafe8130 2474 return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
3efd9988
FG
2475 }
2476 int repair(bool deep) override {
eafe8130
TL
2477 return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
2478 }
2479 int quick_fix() override {
2480 return _fsck(FSCK_SHALLOW, true);
3efd9988 2481 }
7c673cae
FG
2482
2483 void set_cache_shards(unsigned num) override;
11fdf7f2
TL
2484 void dump_cache_stats(Formatter *f) override {
2485 int onode_count = 0, buffers_bytes = 0;
2486 for (auto i: cache_shards) {
2487 onode_count += i->_get_num_onodes();
2488 buffers_bytes += i->_get_buffer_bytes();
2489 }
2490 f->dump_int("bluestore_onode", onode_count);
2491 f->dump_int("bluestore_buffers", buffers_bytes);
2492 }
2493 void dump_cache_stats(ostream& ss) override {
2494 int onode_count = 0, buffers_bytes = 0;
2495 for (auto i: cache_shards) {
2496 onode_count += i->_get_num_onodes();
2497 buffers_bytes += i->_get_buffer_bytes();
2498 }
2499 ss << "bluestore_onode: " << onode_count;
2500 ss << "bluestore_buffers: " << buffers_bytes;
2501 }
7c673cae
FG
2502
2503 int validate_hobject_key(const hobject_t &obj) const override {
2504 return 0;
2505 }
2506 unsigned get_max_attr_name_length() override {
2507 return 256; // arbitrary; there is no real limit internally
2508 }
2509
2510 int mkfs() override;
2511 int mkjournal() override {
2512 return 0;
2513 }
2514
2515 void get_db_statistics(Formatter *f) override;
2516 void generate_db_histogram(Formatter *f) override;
31f18b77 2517 void _flush_cache();
11fdf7f2 2518 int flush_cache(ostream *os = NULL) override;
7c673cae
FG
2519 void dump_perf_counters(Formatter *f) override {
2520 f->open_object_section("perf_counters");
2521 logger->dump_formatted(f, false);
2522 f->close_section();
2523 }
2524
11fdf7f2
TL
2525 int add_new_bluefs_device(int id, const string& path);
2526 int migrate_to_existing_bluefs_device(const set<int>& devs_source,
2527 int id);
2528 int migrate_to_new_bluefs_device(const set<int>& devs_source,
2529 int id,
2530 const string& path);
2531 int expand_devices(ostream& out);
2532 string get_device_path(unsigned id);
7c673cae
FG
2533
2534public:
11fdf7f2
TL
2535 int statfs(struct store_statfs_t *buf,
2536 osd_alert_list_t* alerts = nullptr) override;
2537 int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf) override;
7c673cae
FG
2538
2539 void collect_metadata(map<string,string> *pm) override;
2540
7c673cae
FG
2541 bool exists(CollectionHandle &c, const ghobject_t& oid) override;
2542 int set_collection_opts(
11fdf7f2 2543 CollectionHandle& c,
7c673cae 2544 const pool_opts_t& opts) override;
7c673cae
FG
2545 int stat(
2546 CollectionHandle &c,
2547 const ghobject_t& oid,
2548 struct stat *st,
2549 bool allow_eio = false) override;
7c673cae
FG
2550 int read(
2551 CollectionHandle &c,
2552 const ghobject_t& oid,
2553 uint64_t offset,
2554 size_t len,
2555 bufferlist& bl,
224ce89b 2556 uint32_t op_flags = 0) override;
7c673cae
FG
2557 int _do_read(
2558 Collection *c,
2559 OnodeRef o,
2560 uint64_t offset,
2561 size_t len,
2562 bufferlist& bl,
f64942e4
AA
2563 uint32_t op_flags = 0,
2564 uint64_t retry_count = 0);
7c673cae
FG
2565
2566private:
2567 int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
2568 uint64_t offset, size_t len, interval_set<uint64_t>& destset);
2569public:
7c673cae
FG
2570 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2571 uint64_t offset, size_t len, bufferlist& bl) override;
7c673cae
FG
2572 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2573 uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
2574
2575
7c673cae
FG
2576 int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
2577 bufferptr& value) override;
2578
7c673cae
FG
2579 int getattrs(CollectionHandle &c, const ghobject_t& oid,
2580 map<string,bufferptr>& aset) override;
2581
2582 int list_collections(vector<coll_t>& ls) override;
2583
2584 CollectionHandle open_collection(const coll_t &c) override;
11fdf7f2
TL
2585 CollectionHandle create_new_collection(const coll_t& cid) override;
2586 void set_collection_commit_queue(const coll_t& cid,
2587 ContextQueue *commit_queue) override;
7c673cae
FG
2588
2589 bool collection_exists(const coll_t& c) override;
11fdf7f2
TL
2590 int collection_empty(CollectionHandle& c, bool *empty) override;
2591 int collection_bits(CollectionHandle& c) override;
7c673cae 2592
7c673cae
FG
2593 int collection_list(CollectionHandle &c,
2594 const ghobject_t& start,
2595 const ghobject_t& end,
2596 int max,
2597 vector<ghobject_t> *ls, ghobject_t *next) override;
2598
7c673cae
FG
2599 int omap_get(
2600 CollectionHandle &c, ///< [in] Collection containing oid
2601 const ghobject_t &oid, ///< [in] Object containing omap
2602 bufferlist *header, ///< [out] omap header
2603 map<string, bufferlist> *out ///< [out] Key to value map
2604 ) override;
2605
2606 /// Get omap header
7c673cae
FG
2607 int omap_get_header(
2608 CollectionHandle &c, ///< [in] Collection containing oid
2609 const ghobject_t &oid, ///< [in] Object containing omap
2610 bufferlist *header, ///< [out] omap header
2611 bool allow_eio = false ///< [in] don't assert on eio
2612 ) override;
2613
2614 /// Get keys defined on oid
7c673cae
FG
2615 int omap_get_keys(
2616 CollectionHandle &c, ///< [in] Collection containing oid
2617 const ghobject_t &oid, ///< [in] Object containing omap
2618 set<string> *keys ///< [out] Keys defined on oid
2619 ) override;
2620
2621 /// Get key values
7c673cae
FG
2622 int omap_get_values(
2623 CollectionHandle &c, ///< [in] Collection containing oid
2624 const ghobject_t &oid, ///< [in] Object containing omap
2625 const set<string> &keys, ///< [in] Keys to get
2626 map<string, bufferlist> *out ///< [out] Returned keys and values
2627 ) override;
2628
2629 /// Filters keys into out which are defined on oid
7c673cae
FG
2630 int omap_check_keys(
2631 CollectionHandle &c, ///< [in] Collection containing oid
2632 const ghobject_t &oid, ///< [in] Object containing omap
2633 const set<string> &keys, ///< [in] Keys to check
2634 set<string> *out ///< [out] Subset of keys defined on oid
2635 ) override;
2636
7c673cae
FG
2637 ObjectMap::ObjectMapIterator get_omap_iterator(
2638 CollectionHandle &c, ///< [in] collection
2639 const ghobject_t &oid ///< [in] object
2640 ) override;
2641
2642 void set_fsid(uuid_d u) override {
2643 fsid = u;
2644 }
2645 uuid_d get_fsid() override {
2646 return fsid;
2647 }
2648
2649 uint64_t estimate_objects_overhead(uint64_t num_objects) override {
2650 return num_objects * 300; //assuming per-object overhead is 300 bytes
2651 }
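  // e.g. estimate_objects_overhead(1000000) ~= 300 MB of estimated
  // metadata overhead, given the fixed 300-byte-per-object assumption above.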
2652
2653 struct BSPerfTracker {
11fdf7f2
TL
2654 PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
2655 PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
7c673cae
FG
2656
2657 objectstore_perf_stat_t get_cur_stats() const {
2658 objectstore_perf_stat_t ret;
11fdf7f2
TL
2659 ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
2660 ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
7c673cae
FG
2661 return ret;
2662 }
2663
2664 void update_from_perfcounters(PerfCounters &logger);
2665 } perf_tracker;
2666
2667 objectstore_perf_stat_t get_cur_stats() override {
2668 perf_tracker.update_from_perfcounters(*logger);
2669 return perf_tracker.get_cur_stats();
2670 }
2671 const PerfCounters* get_perf_counters() const override {
2672 return logger;
2673 }
2674
2675 int queue_transactions(
11fdf7f2 2676 CollectionHandle& ch,
7c673cae
FG
2677 vector<Transaction>& tls,
2678 TrackedOpRef op = TrackedOpRef(),
2679 ThreadPool::TPHandle *handle = NULL) override;
2680
2681 // error injection
2682 void inject_data_error(const ghobject_t& o) override {
2683 RWLock::WLocker l(debug_read_error_lock);
2684 debug_data_error_objects.insert(o);
2685 }
2686 void inject_mdata_error(const ghobject_t& o) override {
2687 RWLock::WLocker l(debug_read_error_lock);
2688 debug_mdata_error_objects.insert(o);
2689 }
11fdf7f2
TL
2690
2691 /// methods to inject various errors fsck can repair
2692 void inject_broken_shared_blob_key(const string& key,
2693 const bufferlist& bl);
2694 void inject_leaked(uint64_t len);
2695 void inject_false_free(coll_t cid, ghobject_t oid);
2696 void inject_statfs(const string& key, const store_statfs_t& new_statfs);
eafe8130 2697 void inject_global_statfs(const store_statfs_t& new_statfs);
11fdf7f2
TL
2698 void inject_misreference(coll_t cid1, ghobject_t oid1,
2699 coll_t cid2, ghobject_t oid2,
2700 uint64_t offset);
2701
224ce89b 2702 void compact() override {
11fdf7f2 2703 ceph_assert(db);
224ce89b
WB
2704 db->compact();
2705 }
28e407b8
AA
2706 bool has_builtin_csum() const override {
2707 return true;
2708 }
2709
11fdf7f2
TL
2710 /*
2711 Allocate space for BlueFS from slow device.
2712 Either automatically applies the allocated extents to the underlying
2713 BlueFS (extents == nullptr) or just returns them in the provided non-null extents vector.
2714 */
2715 int allocate_bluefs_freespace(
2716 uint64_t min_size,
2717 uint64_t size,
2718 PExtentVector* extents);
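  // Calling sketch (sizes are hypothetical): pass nullptr to have the
  // allocated extents applied to BlueFS directly, or a vector to receive
  // them instead:
  //
  //   PExtentVector ext;
  //   allocate_bluefs_freespace(min_size, size, &ext);    // caller keeps extents
  //   allocate_bluefs_freespace(min_size, size, nullptr); // applied to BlueFS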
2719
494da23a
TL
2720 inline void log_latency(const char* name,
2721 int idx,
2722 const ceph::timespan& lat,
2723 double lat_threshold,
2724 const char* info = "") const;
2725
2726 inline void log_latency_fn(const char* name,
2727 int idx,
2728 const ceph::timespan& lat,
2729 double lat_threshold,
2730 std::function<string (const ceph::timespan& lat)> fn) const;
11fdf7f2 2731
7c673cae
FG
2732private:
2733 bool _debug_data_eio(const ghobject_t& o) {
2734 if (!cct->_conf->bluestore_debug_inject_read_err) {
2735 return false;
2736 }
2737 RWLock::RLocker l(debug_read_error_lock);
2738 return debug_data_error_objects.count(o);
2739 }
2740 bool _debug_mdata_eio(const ghobject_t& o) {
2741 if (!cct->_conf->bluestore_debug_inject_read_err) {
2742 return false;
2743 }
2744 RWLock::RLocker l(debug_read_error_lock);
2745 return debug_mdata_error_objects.count(o);
2746 }
2747 void _debug_obj_on_delete(const ghobject_t& o) {
2748 if (cct->_conf->bluestore_debug_inject_read_err) {
2749 RWLock::WLocker l(debug_read_error_lock);
2750 debug_data_error_objects.erase(o);
2751 debug_mdata_error_objects.erase(o);
2752 }
2753 }
11fdf7f2
TL
2754private:
2755 ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
2756 string failed_cmode;
2757 set<string> failed_compressors;
2758 string spillover_alert;
81eedcae
TL
2759 string legacy_statfs_alert;
2760 string disk_size_mismatch_alert;
11fdf7f2
TL
2761
2762 void _log_alerts(osd_alert_list_t& alerts);
2763 bool _set_compression_alert(bool cmode, const char* s) {
2764 std::lock_guard l(qlock);
2765 if (cmode) {
2766 bool ret = failed_cmode.empty();
2767 failed_cmode = s;
2768 return ret;
2769 }
2770 return failed_compressors.emplace(s).second;
2771 }
2772 void _clear_compression_alert() {
2773 std::lock_guard l(qlock);
2774 failed_compressors.clear();
2775 failed_cmode.clear();
2776 }
2777
2778 void _set_spillover_alert(const string& s) {
2779 std::lock_guard l(qlock);
2780 spillover_alert = s;
2781 }
2782 void _clear_spillover_alert() {
2783 std::lock_guard l(qlock);
2784 spillover_alert.clear();
2785 }
7c673cae 2786
81eedcae
TL
2787 void _check_legacy_statfs_alert();
2788 void _set_disk_size_mismatch_alert(const string& s) {
2789 std::lock_guard l(qlock);
2790 disk_size_mismatch_alert = s;
2791 }
2792
7c673cae
FG
2793private:
2794
2795 // --------------------------------------------------------
2796 // read processing internal methods
2797 int _verify_csum(
2798 OnodeRef& o,
2799 const bluestore_blob_t* blob,
2800 uint64_t blob_xoffset,
2801 const bufferlist& bl,
2802 uint64_t logical_offset) const;
2803 int _decompress(bufferlist& source, bufferlist* result);
2804
2805
2806 // --------------------------------------------------------
2807 // write ops
2808
2809 struct WriteContext {
2810 bool buffered = false; ///< buffered write
2811 bool compress = false; ///< compressed write
2812 uint64_t target_blob_size = 0; ///< target (max) blob size
2813 unsigned csum_order = 0; ///< target checksum chunk order
2814
2815 old_extent_map_t old_extents; ///< must deref these blobs
eafe8130 2816 interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
7c673cae
FG
2817
2818 struct write_item {
2819 uint64_t logical_offset; ///< write logical offset
2820 BlobRef b;
2821 uint64_t blob_length;
2822 uint64_t b_off;
2823 bufferlist bl;
2824 uint64_t b_off0; ///< original offset in a blob prior to padding
2825 uint64_t length0; ///< original data length prior to padding
2826
2827 bool mark_unused;
2828 bool new_blob; ///< whether new blob was created
2829
3efd9988
FG
2830 bool compressed = false;
2831 bufferlist compressed_bl;
2832 size_t compressed_len = 0;
2833
7c673cae
FG
2834 write_item(
2835 uint64_t logical_offs,
2836 BlobRef b,
2837 uint64_t blob_len,
2838 uint64_t o,
2839 bufferlist& bl,
2840 uint64_t o0,
2841 uint64_t l0,
2842 bool _mark_unused,
2843 bool _new_blob)
2844 :
2845 logical_offset(logical_offs),
2846 b(b),
2847 blob_length(blob_len),
2848 b_off(o),
2849 bl(bl),
2850 b_off0(o0),
2851 length0(l0),
2852 mark_unused(_mark_unused),
2853 new_blob(_new_blob) {}
2854 };
2855 vector<write_item> writes; ///< blobs we're writing
2856
2857 /// partial clone of the context
2858 void fork(const WriteContext& other) {
2859 buffered = other.buffered;
2860 compress = other.compress;
2861 target_blob_size = other.target_blob_size;
2862 csum_order = other.csum_order;
2863 }
2864 void write(
2865 uint64_t loffs,
2866 BlobRef b,
2867 uint64_t blob_len,
2868 uint64_t o,
2869 bufferlist& bl,
2870 uint64_t o0,
2871 uint64_t len0,
2872 bool _mark_unused,
2873 bool _new_blob) {
2874 writes.emplace_back(loffs,
2875 b,
2876 blob_len,
2877 o,
2878 bl,
2879 o0,
2880 len0,
2881 _mark_unused,
2882 _new_blob);
2883 }
2884 /// Checks for writes to the same pextent within a blob
2885 bool has_conflict(
2886 BlobRef b,
2887 uint64_t loffs,
2888 uint64_t loffs_end,
2889 uint64_t min_alloc_size);
2890 };
2891
2892 void _do_write_small(
2893 TransContext *txc,
2894 CollectionRef &c,
2895 OnodeRef o,
2896 uint64_t offset, uint64_t length,
2897 bufferlist::iterator& blp,
2898 WriteContext *wctx);
2899 void _do_write_big(
2900 TransContext *txc,
2901 CollectionRef &c,
2902 OnodeRef o,
2903 uint64_t offset, uint64_t length,
2904 bufferlist::iterator& blp,
2905 WriteContext *wctx);
2906 int _do_alloc_write(
2907 TransContext *txc,
2908 CollectionRef c,
2909 OnodeRef o,
2910 WriteContext *wctx);
2911 void _wctx_finish(
2912 TransContext *txc,
2913 CollectionRef& c,
2914 OnodeRef o,
31f18b77
FG
2915 WriteContext *wctx,
2916 set<SharedBlob*> *maybe_unshared_blobs=0);
7c673cae 2917
7c673cae
FG
2918 int _write(TransContext *txc,
2919 CollectionRef& c,
2920 OnodeRef& o,
2921 uint64_t offset, size_t len,
2922 bufferlist& bl,
2923 uint32_t fadvise_flags);
2924 void _pad_zeros(bufferlist *bl, uint64_t *offset,
2925 uint64_t chunk_size);
2926
31f18b77
FG
2927 void _choose_write_options(CollectionRef& c,
2928 OnodeRef o,
2929 uint32_t fadvise_flags,
2930 WriteContext *wctx);
2931
2932 int _do_gc(TransContext *txc,
2933 CollectionRef& c,
2934 OnodeRef o,
31f18b77
FG
2935 const WriteContext& wctx,
2936 uint64_t *dirty_start,
2937 uint64_t *dirty_end);
2938
7c673cae
FG
2939 int _do_write(TransContext *txc,
2940 CollectionRef &c,
2941 OnodeRef o,
2942 uint64_t offset, uint64_t length,
2943 bufferlist& bl,
2944 uint32_t fadvise_flags);
2945 void _do_write_data(TransContext *txc,
2946 CollectionRef& c,
2947 OnodeRef o,
2948 uint64_t offset,
2949 uint64_t length,
2950 bufferlist& bl,
2951 WriteContext *wctx);
2952
2953 int _touch(TransContext *txc,
2954 CollectionRef& c,
2955 OnodeRef& o);
2956 int _do_zero(TransContext *txc,
2957 CollectionRef& c,
2958 OnodeRef& o,
2959 uint64_t offset, size_t len);
2960 int _zero(TransContext *txc,
2961 CollectionRef& c,
2962 OnodeRef& o,
2963 uint64_t offset, size_t len);
2964 void _do_truncate(TransContext *txc,
2965 CollectionRef& c,
2966 OnodeRef o,
31f18b77
FG
2967 uint64_t offset,
2968 set<SharedBlob*> *maybe_unshared_blobs=0);
35e4c445 2969 int _truncate(TransContext *txc,
7c673cae
FG
2970 CollectionRef& c,
2971 OnodeRef& o,
2972 uint64_t offset);
2973 int _remove(TransContext *txc,
2974 CollectionRef& c,
2975 OnodeRef& o);
2976 int _do_remove(TransContext *txc,
2977 CollectionRef& c,
2978 OnodeRef o);
2979 int _setattr(TransContext *txc,
2980 CollectionRef& c,
2981 OnodeRef& o,
2982 const string& name,
2983 bufferptr& val);
2984 int _setattrs(TransContext *txc,
2985 CollectionRef& c,
2986 OnodeRef& o,
2987 const map<string,bufferptr>& aset);
2988 int _rmattr(TransContext *txc,
2989 CollectionRef& c,
2990 OnodeRef& o,
2991 const string& name);
2992 int _rmattrs(TransContext *txc,
2993 CollectionRef& c,
2994 OnodeRef& o);
11fdf7f2 2995 void _do_omap_clear(TransContext *txc, const string& prefix, uint64_t id);
7c673cae
FG
2996 int _omap_clear(TransContext *txc,
2997 CollectionRef& c,
2998 OnodeRef& o);
2999 int _omap_setkeys(TransContext *txc,
3000 CollectionRef& c,
3001 OnodeRef& o,
3002 bufferlist& bl);
3003 int _omap_setheader(TransContext *txc,
3004 CollectionRef& c,
3005 OnodeRef& o,
3006 bufferlist& header);
3007 int _omap_rmkeys(TransContext *txc,
3008 CollectionRef& c,
3009 OnodeRef& o,
3010 bufferlist& bl);
3011 int _omap_rmkey_range(TransContext *txc,
3012 CollectionRef& c,
3013 OnodeRef& o,
3014 const string& first, const string& last);
3015 int _set_alloc_hint(
3016 TransContext *txc,
3017 CollectionRef& c,
3018 OnodeRef& o,
3019 uint64_t expected_object_size,
3020 uint64_t expected_write_size,
3021 uint32_t flags);
3022 int _do_clone_range(TransContext *txc,
3023 CollectionRef& c,
3024 OnodeRef& oldo,
3025 OnodeRef& newo,
3026 uint64_t srcoff, uint64_t length, uint64_t dstoff);
3027 int _clone(TransContext *txc,
3028 CollectionRef& c,
3029 OnodeRef& oldo,
3030 OnodeRef& newo);
3031 int _clone_range(TransContext *txc,
3032 CollectionRef& c,
3033 OnodeRef& oldo,
3034 OnodeRef& newo,
3035 uint64_t srcoff, uint64_t length, uint64_t dstoff);
3036 int _rename(TransContext *txc,
3037 CollectionRef& c,
3038 OnodeRef& oldo,
3039 OnodeRef& newo,
3040 const ghobject_t& new_oid);
3041 int _create_collection(TransContext *txc, const coll_t &cid,
3042 unsigned bits, CollectionRef *c);
3043 int _remove_collection(TransContext *txc, const coll_t &cid,
3044 CollectionRef *c);
11fdf7f2 3045 void _do_remove_collection(TransContext *txc, CollectionRef *c);
7c673cae
FG
3046 int _split_collection(TransContext *txc,
3047 CollectionRef& c,
3048 CollectionRef& d,
3049 unsigned bits, int rem);
11fdf7f2
TL
3050 int _merge_collection(TransContext *txc,
3051 CollectionRef *c,
3052 CollectionRef& d,
3053 unsigned bits);
3054
3055private:
3056 std::atomic<uint64_t> out_of_sync_fm = {0};
3057 // --------------------------------------------------------
3058 // BlueFSDeviceExpander implementation
3059 uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
3060 uint64_t bluefs_total) override {
3061 auto delta = _get_bluefs_size_delta(bluefs_free, bluefs_total);
3062 return delta > 0 ? delta : 0;
3063 }
3064 int allocate_freespace(
3065 uint64_t min_size,
3066 uint64_t size,
3067 PExtentVector& extents) override {
3068 return allocate_bluefs_freespace(min_size, size, &extents);
3069 };
eafe8130
TL
3070 size_t available_freespace(uint64_t alloc_size) override;
3071
3072public:
3073 struct sb_info_t {
3074 coll_t cid;
3075 int64_t pool_id = INT64_MIN;
3076 list<ghobject_t> oids;
3077 BlueStore::SharedBlobRef sb;
3078 bluestore_extent_ref_map_t ref_map;
3079 bool compressed = false;
3080 bool passed = false;
3081 bool updated = false;
3082 };
3083 typedef btree::btree_set<
3084 uint64_t, std::less<uint64_t>,
3085 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
3086
3087 typedef mempool::bluestore_fsck::map<uint64_t, sb_info_t> sb_info_map_t;
3088 struct FSCK_ObjectCtx {
3089 int64_t& errors;
3090 int64_t& warnings;
3091 uint64_t& num_objects;
3092 uint64_t& num_extents;
3093 uint64_t& num_blobs;
3094 uint64_t& num_sharded_objects;
3095 uint64_t& num_spanning_blobs;
3096
3097 mempool_dynamic_bitset* used_blocks;
3098 uint64_t_btree_t* used_omap_head;
3099 uint64_t_btree_t* used_per_pool_omap_head;
3100 uint64_t_btree_t* used_pgmeta_omap_head;
3101
3102 ceph::mutex* sb_info_lock;
3103 sb_info_map_t& sb_info;
3104
3105 store_statfs_t& expected_store_statfs;
3106 per_pool_statfs& expected_pool_statfs;
3107 BlueStoreRepairer* repairer;
3108
3109 FSCK_ObjectCtx(int64_t& e,
3110 int64_t& w,
3111 uint64_t& _num_objects,
3112 uint64_t& _num_extents,
3113 uint64_t& _num_blobs,
3114 uint64_t& _num_sharded_objects,
3115 uint64_t& _num_spanning_blobs,
3116 mempool_dynamic_bitset* _ub,
3117 uint64_t_btree_t* _used_omap_head,
3118 uint64_t_btree_t* _used_per_pool_omap_head,
3119 uint64_t_btree_t* _used_pgmeta_omap_head,
3120 ceph::mutex* _sb_info_lock,
3121 sb_info_map_t& _sb_info,
3122 store_statfs_t& _store_statfs,
3123 per_pool_statfs& _pool_statfs,
3124 BlueStoreRepairer* _repairer) :
3125 errors(e),
3126 warnings(w),
3127 num_objects(_num_objects),
3128 num_extents(_num_extents),
3129 num_blobs(_num_blobs),
3130 num_sharded_objects(_num_sharded_objects),
3131 num_spanning_blobs(_num_spanning_blobs),
3132 used_blocks(_ub),
3133 used_omap_head(_used_omap_head),
3134 used_per_pool_omap_head(_used_per_pool_omap_head),
3135 used_pgmeta_omap_head(_used_pgmeta_omap_head),
3136 sb_info_lock(_sb_info_lock),
3137 sb_info(_sb_info),
3138 expected_store_statfs(_store_statfs),
3139 expected_pool_statfs(_pool_statfs),
3140 repairer(_repairer) {
3141 }
3142 };
3143
3144 OnodeRef fsck_check_objects_shallow(
3145 FSCKDepth depth,
3146 int64_t pool_id,
3147 CollectionRef c,
3148 const ghobject_t& oid,
3149 const string& key,
3150 const bufferlist& value,
3151 mempool::bluestore_fsck::list<string>& expecting_shards,
3152 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
3153 const BlueStore::FSCK_ObjectCtx& ctx);
3154
3155private:
3156 void _fsck_check_objects(FSCKDepth depth,
3157 FSCK_ObjectCtx& ctx);
7c673cae
FG
3158};
3159
11fdf7f2
TL
3160inline ostream& operator<<(ostream& out, const BlueStore::volatile_statfs& s) {
3161 return out
3162 << " allocated:"
3163 << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
3164 << " stored:"
3165 << s.values[BlueStore::volatile_statfs::STATFS_STORED]
3166 << " compressed:"
3167 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
3168 << " compressed_orig:"
3169 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
3170 << " compressed_alloc:"
3171 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
7c673cae
FG
3172}
3173
3174static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
3175 o->get();
3176}
3177static inline void intrusive_ptr_release(BlueStore::Onode *o) {
3178 o->put();
3179}
3180
3181static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
3182 o->get();
3183}
3184static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
3185 o->put();
3186}
3187
11fdf7f2
TL
3188class BlueStoreRepairer
3189{
3190public:
3191 // to simplify future potential migration to mempools
3192 using fsck_interval = interval_set<uint64_t>;
3193
3194 // Structure to track which pextents are used by a specific cid/oid.
3195 // As with a Bloom filter, only positive and false-positive matches
3196 // are possible.
3197 // Maintains two lists of bloom filters, one for cids and one for oids,
3198 // where each list entry is a BF for a specific disk pextent range.
3199 // The extent length covered per filter is computed on init.
3200 // Allows filtering out 'uninteresting' pextents to speed up subsequent
3201 // 'is_used' access.
3202 struct StoreSpaceTracker {
3203 const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
3204 const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
3205 const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrary selected
3206 static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
3207
3208 typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
3209 bloom_vector collections_bfs;
3210 bloom_vector objects_bfs;
3211
3212 bool was_filtered_out = false;
3213 uint64_t granularity = 0; // extent length for a single filter
3214
3215 StoreSpaceTracker() {
3216 }
3217 StoreSpaceTracker(const StoreSpaceTracker& from) :
3218 collections_bfs(from.collections_bfs),
3219 objects_bfs(from.objects_bfs),
3220 granularity(from.granularity) {
3221 }
3222
3223 void init(uint64_t total,
3224 uint64_t min_alloc_size,
3225 uint64_t mem_cap = DEF_MEM_CAP) {
3226 ceph_assert(!granularity); // not initialized yet
3227 ceph_assert(min_alloc_size && isp2(min_alloc_size));
3228 ceph_assert(mem_cap);
3229
3230 total = round_up_to(total, min_alloc_size);
3231 granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
3232
3233 if (!granularity) {
3234 granularity = min_alloc_size;
3235 } else {
3236 granularity = round_up_to(granularity, min_alloc_size);
3237 }
3238
3239 uint64_t entries = round_up_to(total, granularity) / granularity;
3240 collections_bfs.resize(entries,
3241 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3242 BLOOM_FILTER_TABLE_SIZE,
3243 0,
3244 BLOOM_FILTER_EXPECTED_COUNT));
3245 objects_bfs.resize(entries,
3246 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3247 BLOOM_FILTER_TABLE_SIZE,
3248 0,
3249 BLOOM_FILTER_EXPECTED_COUNT));
3250 }
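    // Worked example for init() (numbers are hypothetical): with
    // total = 1 TiB, mem_cap = 128 MiB and BLOOM_FILTER_TABLE_SIZE = 32,
    // granularity = 2^40 * 32 * 2 / 2^27 = 512 KiB (then rounded up to
    // min_alloc_size), i.e. one collection/object filter pair per 512 KiB
    // of disk space.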
3251 inline uint32_t get_hash(const coll_t& cid) const {
3252 return cid.hash_to_shard(1);
3253 }
3254 inline void set_used(uint64_t offset, uint64_t len,
3255 const coll_t& cid, const ghobject_t& oid) {
3256 ceph_assert(granularity); // initialized
3257
3258 // can't call this func after filter_out has been applied
3259 ceph_assert(!was_filtered_out);
3260 if (!len) {
3261 return;
3262 }
3263 auto pos = offset / granularity;
3264 auto end_pos = (offset + len - 1) / granularity;
3265 while (pos <= end_pos) {
3266 collections_bfs[pos].insert(get_hash(cid));
3267 objects_bfs[pos].insert(oid.hobj.get_hash());
3268 ++pos;
3269 }
3270 }
3271 // filter out entries unrelated to the specified (broken) extents;
3272 // 'is_used' calls are only permitted after that
3273 size_t filter_out(const fsck_interval& extents);
3274
3275 // determines if the collection is present after filtering-out
3276 inline bool is_used(const coll_t& cid) const {
3277 ceph_assert(was_filtered_out);
3278 for(auto& bf : collections_bfs) {
3279 if (bf.contains(get_hash(cid))) {
3280 return true;
3281 }
3282 }
3283 return false;
3284 }
3285 // determines if the object is present after filtering-out
3286 inline bool is_used(const ghobject_t& oid) const {
3287 ceph_assert(was_filtered_out);
3288 for(auto& bf : objects_bfs) {
3289 if (bf.contains(oid.hobj.get_hash())) {
3290 return true;
3291 }
3292 }
3293 return false;
3294 }
3295 // determines if the collection is present before filtering-out
3296 inline bool is_used(const coll_t& cid, uint64_t offs) const {
3297 ceph_assert(granularity); // initialized
3298 ceph_assert(!was_filtered_out);
3299 auto &bf = collections_bfs[offs / granularity];
3300 if (bf.contains(get_hash(cid))) {
3301 return true;
3302 }
3303 return false;
3304 }
3305 // determines if the object is present before filtering-out
3306 inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
3307 ceph_assert(granularity); // initialized
3308 ceph_assert(!was_filtered_out);
3309 auto &bf = objects_bfs[offs / granularity];
3310 if (bf.contains(oid.hobj.get_hash())) {
3311 return true;
3312 }
3313 return false;
3314 }
3315 };
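  // Typical fsck flow (sketch): record every pextent observed for each
  // cid/oid via set_used(), call filter_out() with the suspicious extents,
  // and only then consult the post-filter is_used(cid)/is_used(oid)
  // overloads; the offset-taking overloads are for use before filter_out(),
  // as the asserts above enforce.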
3316public:
3317
3318 bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
3319 bool fix_shared_blob(KeyValueDB *db,
3320 uint64_t sbid,
3321 const bufferlist* bl);
3322 bool fix_statfs(KeyValueDB *db, const string& key,
3323 const store_statfs_t& new_statfs);
3324
3325 bool fix_leaked(KeyValueDB *db,
3326 FreelistManager* fm,
3327 uint64_t offset, uint64_t len);
3328 bool fix_false_free(KeyValueDB *db,
3329 FreelistManager* fm,
3330 uint64_t offset, uint64_t len);
3331 bool fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag);
3332
3333 void init(uint64_t total_space, uint64_t lres_tracking_unit_size);
3334
3335 bool preprocess_misreference(KeyValueDB *db);
3336
3337 unsigned apply(KeyValueDB* db);
3338
3339 void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
3340 misreferenced_extents.union_insert(offs, len);
3341 if (inc_error) {
3342 ++to_repair_cnt;
3343 }
3344 }
eafe8130
TL
3345 void inc_repaired() {
3346 ++to_repair_cnt;
3347 }
11fdf7f2
TL
3348
3349 StoreSpaceTracker& get_space_usage_tracker() {
3350 return space_usage_tracker;
3351 }
3352 const fsck_interval& get_misreferences() const {
3353 return misreferenced_extents;
3354 }
3355 KeyValueDB::Transaction get_fix_misreferences_txn() {
3356 return fix_misreferences_txn;
3357 }
3358
3359private:
3360 unsigned to_repair_cnt = 0;
3361 KeyValueDB::Transaction fix_fm_leaked_txn;
3362 KeyValueDB::Transaction fix_fm_false_free_txn;
3363 KeyValueDB::Transaction remove_key_txn;
3364 KeyValueDB::Transaction fix_statfs_txn;
3365 KeyValueDB::Transaction fix_shared_blob_txn;
3366
3367 KeyValueDB::Transaction fix_misreferences_txn;
3368
3369 StoreSpaceTracker space_usage_tracker;
3370
3371 // non-shared extents with multiple references
3372 fsck_interval misreferenced_extents;
3373
3374};
7c673cae 3375#endif