// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_OSD_BLUESTORE_H
#define CEPH_OSD_BLUESTORE_H

#include "acconfig.h"

#include <unistd.h>

#include <atomic>
#include <chrono>
#include <ratio>
#include <mutex>
#include <condition_variable>

#include <boost/intrusive/list.hpp>
#include <boost/intrusive/unordered_set.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/functional/hash.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/circular_buffer.hpp>

#include "include/cpp-btree/btree_set.h"

#include "include/ceph_assert.h"
#include "include/unordered_map.h"
#include "include/mempool.h"
#include "include/hash.h"
#include "common/bloom_filter.hpp"
#include "common/Finisher.h"
#include "common/ceph_mutex.h"
#include "common/Throttle.h"
#include "common/perf_counters.h"
#include "common/PriorityCache.h"
#include "compressor/Compressor.h"
#include "os/ObjectStore.h"

#include "bluestore_types.h"
#include "BlockDevice.h"
#include "BlueFS.h"
#include "common/EventTrace.h"

class Allocator;
class FreelistManager;
class BlueStoreRepairer;

//#define DEBUG_CACHE
//#define DEBUG_DEFERRED



// constants for Buffer::optimize()
#define MAX_BUFFER_SLOP_RATIO_DEN 8  // so actually 1/N
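// A minimal arithmetic sketch of the 1/N threshold above, assuming a cached
// buffer of 4096 bytes: Buffer::maybe_rebuild() (below) compacts the
// bufferlist when it holds more than one segment, or when the front
// segment's wasted() bytes exceed 4096 / MAX_BUFFER_SLOP_RATIO_DEN = 512,
// i.e. when more than 1/8 of the cached length is slop.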


enum {
  l_bluestore_first = 732430,
  l_bluestore_kv_flush_lat,
  l_bluestore_kv_commit_lat,
  l_bluestore_kv_sync_lat,
  l_bluestore_kv_final_lat,
  l_bluestore_state_prepare_lat,
  l_bluestore_state_aio_wait_lat,
  l_bluestore_state_io_done_lat,
  l_bluestore_state_kv_queued_lat,
  l_bluestore_state_kv_committing_lat,
  l_bluestore_state_kv_done_lat,
  l_bluestore_state_deferred_queued_lat,
  l_bluestore_state_deferred_aio_wait_lat,
  l_bluestore_state_deferred_cleanup_lat,
  l_bluestore_state_finishing_lat,
  l_bluestore_state_done_lat,
  l_bluestore_throttle_lat,
  l_bluestore_submit_lat,
  l_bluestore_commit_lat,
  l_bluestore_read_lat,
  l_bluestore_read_onode_meta_lat,
  l_bluestore_read_wait_aio_lat,
  l_bluestore_compress_lat,
  l_bluestore_decompress_lat,
  l_bluestore_csum_lat,
  l_bluestore_compress_success_count,
  l_bluestore_compress_rejected_count,
  l_bluestore_write_pad_bytes,
  l_bluestore_deferred_write_ops,
  l_bluestore_deferred_write_bytes,
  l_bluestore_write_penalty_read_ops,
  l_bluestore_allocated,
  l_bluestore_stored,
  l_bluestore_compressed,
  l_bluestore_compressed_allocated,
  l_bluestore_compressed_original,
  l_bluestore_onodes,
  l_bluestore_pinned_onodes,
  l_bluestore_onode_hits,
  l_bluestore_onode_misses,
  l_bluestore_onode_shard_hits,
  l_bluestore_onode_shard_misses,
  l_bluestore_extents,
  l_bluestore_blobs,
  l_bluestore_buffers,
  l_bluestore_buffer_bytes,
  l_bluestore_buffer_hit_bytes,
  l_bluestore_buffer_miss_bytes,
  l_bluestore_write_big,
  l_bluestore_write_big_bytes,
  l_bluestore_write_big_blobs,
  l_bluestore_write_small,
  l_bluestore_write_small_bytes,
  l_bluestore_write_small_unused,
  l_bluestore_write_small_deferred,
  l_bluestore_write_small_pre_read,
  l_bluestore_write_small_new,
  l_bluestore_txc,
  l_bluestore_onode_reshard,
  l_bluestore_blob_split,
  l_bluestore_extent_compress,
  l_bluestore_gc_merged,
  l_bluestore_read_eio,
  l_bluestore_reads_with_retries,
  l_bluestore_fragmentation,
  l_bluestore_omap_seek_to_first_lat,
  l_bluestore_omap_upper_bound_lat,
  l_bluestore_omap_lower_bound_lat,
  l_bluestore_omap_next_lat,
  l_bluestore_clist_lat,
  l_bluestore_last
};

#define META_POOL_ID ((uint64_t)-1ull)

class BlueStore : public ObjectStore,
                  public BlueFSDeviceExpander,
                  public md_config_obs_t {
  // -----------------------------------------------------
  // types
public:
  // config observer
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) override;

  // handler for discard event
  void handle_discard(interval_set<uint64_t>& to_release);

  void _set_csum();
  void _set_compression();
  void _set_throttle_params();
  int _set_cache_sizes();
  void _set_max_defer_interval() {
    max_defer_interval =
      cct->_conf.get_val<double>("bluestore_max_defer_interval");
  }

  class TransContext;

  typedef map<uint64_t, bufferlist> ready_regions_t;


  struct BufferSpace;
  struct Collection;
  typedef boost::intrusive_ptr<Collection> CollectionRef;

  struct AioContext {
    virtual void aio_finish(BlueStore *store) = 0;
    virtual ~AioContext() {}
  };

  /// cached buffer
  struct Buffer {
    MEMPOOL_CLASS_HELPERS();

    enum {
      STATE_EMPTY,    ///< empty buffer -- used for cache history
      STATE_CLEAN,    ///< clean data that is up to date
      STATE_WRITING,  ///< data that is being written (io not yet complete)
    };
    static const char *get_state_name(int s) {
      switch (s) {
      case STATE_EMPTY: return "empty";
      case STATE_CLEAN: return "clean";
      case STATE_WRITING: return "writing";
      default: return "???";
      }
    }
    enum {
      FLAG_NOCACHE = 1,  ///< trim when done WRITING (do not become CLEAN)
      // NOTE: fix operator<< when you define a second flag
    };
    static const char *get_flag_name(int s) {
      switch (s) {
      case FLAG_NOCACHE: return "nocache";
      default: return "???";
      }
    }

    BufferSpace *space;
    uint16_t state;              ///< STATE_*
    uint16_t cache_private = 0;  ///< opaque (to us) value used by Cache impl
    uint32_t flags;              ///< FLAG_*
    uint64_t seq;
    uint32_t offset, length;
    bufferlist data;

    boost::intrusive::list_member_hook<> lru_item;
    boost::intrusive::list_member_hook<> state_item;

    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, bufferlist& b,
           unsigned f = 0)
      : space(space), state(s), flags(f), seq(q), offset(o),
        length(b.length()), data(b) {}

    bool is_empty() const {
      return state == STATE_EMPTY;
    }
    bool is_clean() const {
      return state == STATE_CLEAN;
    }
    bool is_writing() const {
      return state == STATE_WRITING;
    }

    uint32_t end() const {
      return offset + length;
    }

    void truncate(uint32_t newlen) {
      ceph_assert(newlen < length);
      if (data.length()) {
        bufferlist t;
        t.substr_of(data, 0, newlen);
        data.claim(t);
      }
      length = newlen;
    }
    void maybe_rebuild() {
      if (data.length() &&
          (data.get_num_buffers() > 1 ||
           data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
        data.rebuild();
      }
    }

    void dump(Formatter *f) const {
      f->dump_string("state", get_state_name(state));
      f->dump_unsigned("seq", seq);
      f->dump_unsigned("offset", offset);
      f->dump_unsigned("length", length);
      f->dump_unsigned("data_length", data.length());
    }
  };

  struct BufferCacheShard;

  /// map logical extent range (object) onto buffers
  struct BufferSpace {
    enum {
      BYPASS_CLEAN_CACHE = 0x1,  // bypass clean cache
    };

    typedef boost::intrusive::list<
      Buffer,
      boost::intrusive::member_hook<
        Buffer,
        boost::intrusive::list_member_hook<>,
        &Buffer::state_item> > state_list_t;

    mempool::bluestore_cache_meta::map<uint32_t, std::unique_ptr<Buffer>>
      buffer_map;

    // we use a bare intrusive list here instead of std::map because
    // it uses less memory and we expect this to be very small (very
    // few IOs in flight to the same Blob at the same time).
    state_list_t writing;  ///< writing buffers, sorted by seq, ascending

    ~BufferSpace() {
      ceph_assert(buffer_map.empty());
      ceph_assert(writing.empty());
    }

    void _add_buffer(BufferCacheShard* cache, Buffer *b, int level, Buffer *near) {
      cache->_audit("_add_buffer start");
      buffer_map[b->offset].reset(b);
      if (b->is_writing()) {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
        if (writing.empty() || writing.rbegin()->seq <= b->seq) {
          writing.push_back(*b);
        } else {
          auto it = writing.begin();
          while (it->seq < b->seq) {
            ++it;
          }

          ceph_assert(it->seq >= b->seq);
          // note that this will insert b before it
          // hence the order is maintained
          writing.insert(it, *b);
        }
      } else {
        b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
        cache->_add(b, level, near);
      }
      cache->_audit("_add_buffer end");
    }
    void _rm_buffer(BufferCacheShard* cache, Buffer *b) {
      _rm_buffer(cache, buffer_map.find(b->offset));
    }
    void _rm_buffer(BufferCacheShard* cache,
                    map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
      ceph_assert(p != buffer_map.end());
      cache->_audit("_rm_buffer start");
      if (p->second->is_writing()) {
        writing.erase(writing.iterator_to(*p->second));
      } else {
        cache->_rm(p->second.get());
      }
      buffer_map.erase(p);
      cache->_audit("_rm_buffer end");
    }

    map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
      uint32_t offset) {
      auto i = buffer_map.lower_bound(offset);
      if (i != buffer_map.begin()) {
        --i;
        if (i->first + i->second->length <= offset)
          ++i;
      }
      return i;
    }
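    // A small illustrative sketch with assumed values: if buffer_map holds
    // buffers at offsets 0 (length 0x1000) and 0x2000 (length 0x1000), then
    // _data_lower_bound(0x0800) steps back to the buffer at 0 because it
    // still overlaps the query offset, while _data_lower_bound(0x1800) keeps
    // the iterator at 0x2000 since the buffer at 0 ends at 0x1000.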

    // must be called under protection of the Cache lock
    void _clear(BufferCacheShard* cache);

    // return value is the highest cache_private of a trimmed buffer, or 0.
    int discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) {
      std::lock_guard l(cache->lock);
      int ret = _discard(cache, offset, length);
      cache->_trim();
      return ret;
    }
    int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length);

    void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
               unsigned flags) {
      std::lock_guard l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
                             flags);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
      cache->_trim();
    }
    void _finish_write(BufferCacheShard* cache, uint64_t seq);
    void did_read(BufferCacheShard* cache, uint32_t offset, bufferlist& bl) {
      std::lock_guard l(cache->lock);
      Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
      b->cache_private = _discard(cache, offset, bl.length());
      _add_buffer(cache, b, 1, nullptr);
      cache->_trim();
    }

    void read(BufferCacheShard* cache, uint32_t offset, uint32_t length,
              BlueStore::ready_regions_t& res,
              interval_set<uint32_t>& res_intervals,
              int flags = 0);

    void truncate(BufferCacheShard* cache, uint32_t offset) {
      discard(cache, offset, (uint32_t)-1 - offset);
    }

    void split(BufferCacheShard* cache, size_t pos, BufferSpace &r);

    void dump(BufferCacheShard* cache, Formatter *f) const {
      std::lock_guard l(cache->lock);
      f->open_array_section("buffers");
      for (auto& i : buffer_map) {
        f->open_object_section("buffer");
        ceph_assert(i.first == i.second->offset);
        i.second->dump(f);
        f->close_section();
      }
      f->close_section();
    }
  };
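
  // A rough usage sketch of the buffer cache above (hedged; the actual call
  // sites live in BlueStore.cc, not in this header): the write path calls
  // BufferSpace::write() with the transaction sequence number, so the data
  // sits on the `writing` list until the kv commit calls _finish_write(seq),
  // which either keeps the buffer as STATE_CLEAN or drops it when
  // FLAG_NOCACHE was set; a read that had to go to disk calls did_read() to
  // cache the result, and later read() calls serve overlapping regions
  // straight from buffer_map without touching the device.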

  struct SharedBlobSet;

  /// in-memory shared blob state (incl cached buffers)
  struct SharedBlob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};  ///< reference count
    bool loaded = false;

    CollectionRef coll;
    union {
      uint64_t sbid_unloaded;               ///< sbid if persistent isn't loaded
      bluestore_shared_blob_t *persistent;  ///< persistent part of the shared blob if any
    };
    BufferSpace bc;  ///< buffer cache

    SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
      if (get_cache()) {
        get_cache()->add_blob();
      }
    }
    SharedBlob(uint64_t i, Collection *_coll);
    ~SharedBlob();

    uint64_t get_sbid() const {
      return loaded ? persistent->sbid : sbid_unloaded;
    }

    friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
    friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }

    void dump(Formatter* f) const;
    friend ostream& operator<<(ostream& out, const SharedBlob& sb);

    void get() {
      ++nref;
    }
    void put();

    /// get logical references
    void get_ref(uint64_t offset, uint32_t length);

    /// put logical references, and get back any released extents
    void put_ref(uint64_t offset, uint32_t length,
                 PExtentVector *r, bool *unshare);

    void finish_write(uint64_t seq);

    friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
      return l.get_sbid() == r.get_sbid();
    }
    inline BufferCacheShard* get_cache() {
      return coll ? coll->cache : nullptr;
    }
    inline SharedBlobSet* get_parent() {
      return coll ? &(coll->shared_blob_set) : nullptr;
    }
    inline bool is_loaded() const {
      return loaded;
    }

  };
  typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;

  /// a lookup table of SharedBlobs
  struct SharedBlobSet {
    /// protect lookup, insertion, removal
    ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");

    // we use a bare pointer because we don't want to affect the ref
    // count
    mempool::bluestore_cache_meta::unordered_map<uint64_t,SharedBlob*> sb_map;

    SharedBlobRef lookup(uint64_t sbid) {
      std::lock_guard l(lock);
      auto p = sb_map.find(sbid);
      if (p == sb_map.end() ||
          p->second->nref == 0) {
        return nullptr;
      }
      return p->second;
    }

    void add(Collection* coll, SharedBlob *sb) {
      std::lock_guard l(lock);
      sb_map[sb->get_sbid()] = sb;
      sb->coll = coll;
    }

    bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
      std::lock_guard l(lock);
      ceph_assert(sb->get_parent() == this);
      if (verify_nref_is_zero && sb->nref != 0) {
        return false;
      }
      // only remove if it still points to us
      auto p = sb_map.find(sb->get_sbid());
      if (p != sb_map.end() &&
          p->second == sb) {
        sb_map.erase(p);
      }
      return true;
    }

    bool empty() {
      std::lock_guard l(lock);
      return sb_map.empty();
    }

    template <int LogLevelV>
    void dump(CephContext *cct);
  };

//#define CACHE_BLOB_BL  // not sure if this is a win yet or not... :/

  /// in-memory blob metadata and associated cached buffers (if any)
  struct Blob {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref = {0};    ///< reference count
    int16_t id = -1;               ///< id, for spanning blobs only, >= 0
    int16_t last_encoded_id = -1;  ///< (ephemeral) used during encoding only
    SharedBlobRef shared_blob;     ///< shared blob state (if any)

  private:
    mutable bluestore_blob_t blob;  ///< decoded blob metadata
#ifdef CACHE_BLOB_BL
    mutable bufferlist blob_bl;     ///< cached encoded blob, blob is dirty if empty
#endif
    /// refs from this shard.  ephemeral if id<0, persisted if spanning.
    bluestore_blob_use_tracker_t used_in_blob;

  public:

    friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
    friend void intrusive_ptr_release(Blob *b) { b->put(); }

    void dump(Formatter* f) const;
    friend ostream& operator<<(ostream& out, const Blob &b);

    const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
      return used_in_blob;
    }
    bool is_referenced() const {
      return used_in_blob.is_not_empty();
    }
    uint32_t get_referenced_bytes() const {
      return used_in_blob.get_referenced_bytes();
    }

    bool is_spanning() const {
      return id >= 0;
    }

    bool can_split() const {
      std::lock_guard l(shared_blob->get_cache()->lock);
      // splitting a BufferSpace writing list is too hard; don't try.
      return shared_blob->bc.writing.empty() &&
             used_in_blob.can_split() &&
             get_blob().can_split();
    }

    bool can_split_at(uint32_t blob_offset) const {
      return used_in_blob.can_split_at(blob_offset) &&
             get_blob().can_split_at(blob_offset);
    }

    bool can_reuse_blob(uint32_t min_alloc_size,
                        uint32_t target_blob_size,
                        uint32_t b_offset,
                        uint32_t *length0);

    void dup(Blob& o) {
      o.shared_blob = shared_blob;
      o.blob = blob;
#ifdef CACHE_BLOB_BL
      o.blob_bl = blob_bl;
#endif
    }

    inline const bluestore_blob_t& get_blob() const {
      return blob;
    }
    inline bluestore_blob_t& dirty_blob() {
#ifdef CACHE_BLOB_BL
      blob_bl.clear();
#endif
      return blob;
    }

    /// discard buffers for unallocated regions
    void discard_unallocated(Collection *coll);

    /// get logical references
    void get_ref(Collection *coll, uint32_t offset, uint32_t length);
    /// put logical references, and get back any released extents
    bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
                 PExtentVector *r);

    /// split the blob
    void split(Collection *coll, uint32_t blob_offset, Blob *o);

    void get() {
      ++nref;
    }
    void put() {
      if (--nref == 0)
        delete this;
    }


#ifdef CACHE_BLOB_BL
    void _encode() const {
      if (blob_bl.length() == 0) {
        encode(blob, blob_bl);
      } else {
        ceph_assert(blob_bl.length());
      }
    }
    void bound_encode(
      size_t& p,
      bool include_ref_map) const {
      _encode();
      p += blob_bl.length();
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      bufferlist::contiguous_appender& p,
      bool include_ref_map) const {
      _encode();
      p.append(blob_bl);
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection */*coll*/,
      bufferptr::const_iterator& p,
      bool include_ref_map) {
      const char *start = p.get_pos();
      denc(blob, p);
      const char *end = p.get_pos();
      blob_bl.clear();
      blob_bl.append(start, end - start);
      if (include_ref_map) {
        used_in_blob.decode(p);
      }
    }
#else
    void bound_encode(
      size_t& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.bound_encode(p);
      }
    }
    void encode(
      bufferlist::contiguous_appender& p,
      uint64_t struct_v,
      uint64_t sbid,
      bool include_ref_map) const {
      denc(blob, p, struct_v);
      if (blob.is_shared()) {
        denc(sbid, p);
      }
      if (include_ref_map) {
        used_in_blob.encode(p);
      }
    }
    void decode(
      Collection *coll,
      bufferptr::const_iterator& p,
      uint64_t struct_v,
      uint64_t* sbid,
      bool include_ref_map);
#endif
  };
  typedef boost::intrusive_ptr<Blob> BlobRef;
  typedef mempool::bluestore_cache_meta::map<int,BlobRef> blob_map_t;

  /// a logical extent, pointing to (some portion of) a blob
  typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase;  // making an alias to avoid build warnings
  struct Extent : public ExtentBase {
    MEMPOOL_CLASS_HELPERS();

    uint32_t logical_offset = 0;  ///< logical offset
    uint32_t blob_offset = 0;     ///< blob offset
    uint32_t length = 0;          ///< length
    BlobRef blob;                 ///< the blob with our data

    /// ctor for lookup only
    explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
    /// ctor for delayed initialization (see decode_some())
    explicit Extent() : ExtentBase() {
    }
    /// ctor for general usage
    Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : ExtentBase(),
        logical_offset(lo), blob_offset(o), length(l) {
      assign_blob(b);
    }
    ~Extent() {
      if (blob) {
        blob->shared_blob->get_cache()->rm_extent();
      }
    }

    void dump(Formatter* f) const;

    void assign_blob(const BlobRef& b) {
      ceph_assert(!blob);
      blob = b;
      blob->shared_blob->get_cache()->add_extent();
    }

    // comparators for intrusive_set
    friend bool operator<(const Extent &a, const Extent &b) {
      return a.logical_offset < b.logical_offset;
    }
    friend bool operator>(const Extent &a, const Extent &b) {
      return a.logical_offset > b.logical_offset;
    }
    friend bool operator==(const Extent &a, const Extent &b) {
      return a.logical_offset == b.logical_offset;
    }

    uint32_t blob_start() const {
      return logical_offset - blob_offset;
    }

    uint32_t blob_end() const {
      return blob_start() + blob->get_blob().get_logical_length();
    }

    uint32_t logical_end() const {
      return logical_offset + length;
    }

    // return true if any piece of the blob is out of
    // the given range [o, o + l].
    bool blob_escapes_range(uint32_t o, uint32_t l) const {
      return blob_start() < o || blob_end() > o + l;
    }
  };
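  // A worked example with assumed numbers: an Extent with
  // logical_offset=0x3000, blob_offset=0x1000, length=0x1000 on a blob whose
  // logical length is 0x4000 has blob_start() = 0x2000 and
  // blob_end() = 0x6000, so blob_escapes_range(0x3000, 0x1000) is true --
  // parts of the blob lie outside the queried [0x3000, 0x4000] range.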
  typedef boost::intrusive::set<Extent> extent_map_t;


  friend ostream& operator<<(ostream& out, const Extent& e);

  struct OldExtent {
    boost::intrusive::list_member_hook<> old_extent_item;
    Extent e;
    PExtentVector r;
    bool blob_empty;  // flag to track the last removed extent that makes blob
                      // empty - required to update compression stat properly
    OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
      : e(lo, o, l, b), blob_empty(false) {
    }
    static OldExtent* create(CollectionRef c,
                             uint32_t lo,
                             uint32_t o,
                             uint32_t l,
                             BlobRef& b);
  };
  typedef boost::intrusive::list<
    OldExtent,
    boost::intrusive::member_hook<
      OldExtent,
      boost::intrusive::list_member_hook<>,
      &OldExtent::old_extent_item> > old_extent_map_t;

  struct Onode;

  /// a sharded extent map, mapping offsets to lextents to blobs
  struct ExtentMap {
    Onode *onode;
    extent_map_t extent_map;       ///< map of Extents to Blobs
    blob_map_t spanning_blob_map;  ///< blobs that span shards
    typedef boost::intrusive_ptr<Onode> OnodeRef;

    struct Shard {
      bluestore_onode_t::shard_info *shard_info = nullptr;
      unsigned extents = 0;  ///< count extents in this shard
      bool loaded = false;   ///< true if shard is loaded
      bool dirty = false;    ///< true if shard is dirty and needs reencoding
    };
    mempool::bluestore_cache_meta::vector<Shard> shards;  ///< shards

    bufferlist inline_bl;  ///< cached encoded map, if unsharded; empty=>dirty

    uint32_t needs_reshard_begin = 0;
    uint32_t needs_reshard_end = 0;

    void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
             uint64_t&, uint64_t&, uint64_t&);

    bool needs_reshard() const {
      return needs_reshard_end > needs_reshard_begin;
    }
    void clear_needs_reshard() {
      needs_reshard_begin = needs_reshard_end = 0;
    }
    void request_reshard(uint32_t begin, uint32_t end) {
      if (begin < needs_reshard_begin) {
        needs_reshard_begin = begin;
      }
      if (end > needs_reshard_end) {
        needs_reshard_end = end;
      }
    }

    struct DeleteDisposer {
      void operator()(Extent *e) { delete e; }
    };

    ExtentMap(Onode *o);
    ~ExtentMap() {
      extent_map.clear_and_dispose(DeleteDisposer());
    }

    void clear() {
      extent_map.clear_and_dispose(DeleteDisposer());
      shards.clear();
      inline_bl.clear();
      clear_needs_reshard();
    }

    void dump(Formatter* f) const;

    bool encode_some(uint32_t offset, uint32_t length, bufferlist& bl,
                     unsigned *pn);
    unsigned decode_some(bufferlist& bl);

    void bound_encode_spanning_blobs(size_t& p);
    void encode_spanning_blobs(bufferlist::contiguous_appender& p);
    void decode_spanning_blobs(bufferptr::const_iterator& p);

    BlobRef get_spanning_blob(int id) {
      auto p = spanning_blob_map.find(id);
      ceph_assert(p != spanning_blob_map.end());
      return p->second;
    }

    void update(KeyValueDB::Transaction t, bool force);
    decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
    void reshard(
      KeyValueDB *db,
      KeyValueDB::Transaction t);

    /// initialize Shards from the onode
    void init_shards(bool loaded, bool dirty);

    /// return index of shard containing offset
    /// or -1 if not found
    int seek_shard(uint32_t offset) {
      size_t end = shards.size();
      size_t mid, left = 0;
      size_t right = end;  // one past the right end

      while (left < right) {
        mid = left + (right - left) / 2;
        if (offset >= shards[mid].shard_info->offset) {
          size_t next = mid + 1;
          if (next >= end || offset < shards[next].shard_info->offset)
            return mid;
          // continue to search forwards
          left = next;
        } else {
          // continue to search backwards
          right = mid;
        }
      }

      return -1;  // not found
    }
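    // A worked example with assumed shard offsets: for shards starting at
    // {0, 0x10000, 0x20000}, seek_shard(0x18000) probes mid=1, sees
    // 0x18000 >= 0x10000 and 0x18000 < 0x20000, and returns index 1;
    // seek_shard(0x30000) returns 2 (the last shard); if the first shard
    // started at a non-zero offset, any smaller offset would return -1.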

    /// check if a range spans a shard
    bool spans_shard(uint32_t offset, uint32_t length) {
      if (shards.empty()) {
        return false;
      }
      int s = seek_shard(offset);
      ceph_assert(s >= 0);
      if (s == (int)shards.size() - 1) {
        return false;  // last shard
      }
      if (offset + length <= shards[s+1].shard_info->offset) {
        return false;
      }
      return true;
    }

    /// ensure that a range of the map is loaded
    void fault_range(KeyValueDB *db,
                     uint32_t offset, uint32_t length);

    /// ensure a range of the map is marked dirty
    void dirty_range(uint32_t offset, uint32_t length);

    /// for seek_lextent test
    extent_map_t::iterator find(uint64_t offset);

    /// seek to the first lextent including or after offset
    extent_map_t::iterator seek_lextent(uint64_t offset);
    extent_map_t::const_iterator seek_lextent(uint64_t offset) const;

    /// add a new Extent
    void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
      extent_map.insert(*new Extent(lo, o, l, b));
    }

    /// remove (and delete) an Extent
    void rm(extent_map_t::iterator p) {
      extent_map.erase_and_dispose(p, DeleteDisposer());
    }

    bool has_any_lextents(uint64_t offset, uint64_t length);

    /// consolidate adjacent lextents in extent_map
    int compress_extent_map(uint64_t offset, uint64_t length);

    /// punch a logical hole.  add lextents to deref to target list.
    void punch_hole(CollectionRef &c,
                    uint64_t offset, uint64_t length,
                    old_extent_map_t *old_extents);

    /// put new lextent into lextent_map overwriting existing ones if
    /// any and update references accordingly
    Extent *set_lextent(CollectionRef &c,
                        uint64_t logical_offset,
                        uint64_t offset, uint64_t length,
                        BlobRef b,
                        old_extent_map_t *old_extents);

    /// split a blob (and referring extents)
    BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
  };

  /// Compressed Blob Garbage collector
  /*
    The primary idea of the collector is to estimate the difference between
    the allocation units (AUs) currently occupied by compressed blobs and the
    new AUs that would be required to store the same data uncompressed.
    The estimation is performed over protrusive extents within a logical
    range determined by the concatenation of the old_extents collection and
    the specific (current) write request.
    The root cause for using old_extents is the need to handle blob ref
    counts properly: old extents still hold blob refs, and hence we need to
    traverse the collection to determine whether a blob is to be released.
    Protrusive extents are extents that fall into the blob set in action
    (i.e. below the logical range from above) but are not removed completely
    by the current write.
    E.g. for
      extent1 <loffs = 100, boffs = 100, len = 100> ->
        blob1<compressed, len_on_disk=4096, logical_len=8192>
      extent2 <loffs = 200, boffs = 200, len = 100> ->
        blob2<raw, len_on_disk=4096, llen=4096>
      extent3 <loffs = 300, boffs = 300, len = 100> ->
        blob1<compressed, len_on_disk=4096, llen=8192>
      extent4 <loffs = 4096, boffs = 0, len = 100> ->
        blob3<raw, len_on_disk=4096, llen=4096>
    and write(300~100),
    the protrusive extents are within the ranges <0~300, 400~8192-400>.
    In this case the existing AUs that might be removed due to GC (i.e. blob1)
    use 2x4K bytes, while the new AUs expected after GC = 0 since extent1 is
    to be merged into blob2.  Hence we should collect.
  */
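  // A hedged sketch of how the class below is meant to be driven (the actual
  // call site is in the write path in BlueStore.cc, not in this header): the
  // caller constructs a GarbageCollector, invokes estimate() with the write's
  // offset/length, the onode's ExtentMap, the old_extents gathered by
  // punch_hole()/set_lextent(), and min_alloc_size; a positive return value
  // suggests allocation units can be saved, and get_extents_to_collect()
  // then yields the logical ranges to rewrite uncompressed.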
  class GarbageCollector
  {
  public:
    /// return amount of allocation units that might be saved due to GC
    int64_t estimate(
      uint64_t offset,
      uint64_t length,
      const ExtentMap& extent_map,
      const old_extent_map_t& old_extents,
      uint64_t min_alloc_size);

    /// return a collection of extents to perform GC on
    const interval_set<uint64_t>& get_extents_to_collect() const {
      return extents_to_collect;
    }
    GarbageCollector(CephContext* _cct) : cct(_cct) {}

  private:
    struct BlobInfo {
      uint64_t referenced_bytes = 0;     ///< amount of bytes referenced in blob
      int64_t expected_allocations = 0;  ///< new alloc units required
                                         ///< in case of gc fulfilled
      bool collect_candidate = false;    ///< indicate if blob has any extents
                                         ///< eligible for GC.
      extent_map_t::const_iterator first_lextent;  ///< points to the first
                                                   ///< lextent referring to
                                                   ///< the blob if any.
                                                   ///< collect_candidate flag
                                                   ///< determines the validity
      extent_map_t::const_iterator last_lextent;   ///< points to the last
                                                   ///< lextent referring to
                                                   ///< the blob if any.

      BlobInfo(uint64_t ref_bytes) :
        referenced_bytes(ref_bytes) {
      }
    };
    CephContext* cct;
    map<Blob*, BlobInfo> affected_blobs;  ///< compressed blobs and their ref_map
                                          ///< copies that are affected by the
                                          ///< specific write

    /// protrusive extents that should be collected if GC takes place
    interval_set<uint64_t> extents_to_collect;

    boost::optional<uint64_t > used_alloc_unit;  ///< last processed allocation
                                                 ///< unit when traversing
                                                 ///< protrusive extents.
                                                 ///< Other extents mapped to
                                                 ///< this AU to be ignored
                                                 ///< (except the case where an
                                                 ///< uncompressed extent follows
                                                 ///< a compressed one - see below).
    BlobInfo* blob_info_counted = nullptr;  ///< set if previous allocation unit
                                            ///< caused expected_allocations
                                            ///< counter increment at this blob.
                                            ///< if an uncompressed extent follows,
                                            ///< a decrement for the
                                            ///< expected_allocations counter
                                            ///< is needed
    int64_t expected_allocations = 0;   ///< new alloc units required in case
                                        ///< of gc fulfilled
    int64_t expected_for_release = 0;   ///< alloc units currently used by
                                        ///< compressed blobs that might be
                                        ///< gone after GC

  protected:
    void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
                                    uint64_t start_offset,
                                    uint64_t end_offset,
                                    uint64_t start_touch_offset,
                                    uint64_t end_touch_offset,
                                    uint64_t min_alloc_size);
  };

  struct OnodeSpace;
  /// an in-memory object
  struct Onode {
    MEMPOOL_CLASS_HELPERS();

    std::atomic_int nref;  ///< reference count
    Collection *c;
    ghobject_t oid;

    /// key under PREFIX_OBJ where we are stored
    mempool::bluestore_cache_meta::string key;

    boost::intrusive::list_member_hook<> lru_item;

    bluestore_onode_t onode;  ///< metadata stored as value in kv store
    bool exists;              ///< true if object logically exists
    bool cached;              ///< Onode is logically in the cache
                              ///  (it can be pinned and hence physically out
                              ///  of it at the moment though)
    bool pinned;              ///< Onode is pinned
                              ///  (or should be pinned when cached)
    ExtentMap extent_map;

    // track txc's that have not been committed to kv store (and whose
    // effects cannot be read via the kvdb read methods)
    std::atomic<int> flushing_count = {0};
    std::atomic<int> waiting_count = {0};
    /// protect flush_txns
    ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
    ceph::condition_variable flush_cond;  ///< wait here for uncommitted txns

    Onode(Collection *c, const ghobject_t& o,
          const mempool::bluestore_cache_meta::string& k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        cached(false),
        pinned(false),
        extent_map(this) {
    }
    Onode(Collection* c, const ghobject_t& o,
          const std::string& k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        cached(false),
        pinned(false),
        extent_map(this) {
    }
    Onode(Collection* c, const ghobject_t& o,
          const char* k)
      : nref(0),
        c(c),
        oid(o),
        key(k),
        exists(false),
        cached(false),
        pinned(false),
        extent_map(this) {
    }

    static Onode* decode(
      CollectionRef c,
      const ghobject_t& oid,
      const string& key,
      const bufferlist& v);

    void dump(Formatter* f) const;

    void flush();
    void get();
    void put();

    inline bool put_cache() {
      ceph_assert(!cached);
      cached = true;
      return !pinned;
    }
    inline bool pop_cache() {
      ceph_assert(cached);
      cached = false;
      return !pinned;
    }

    const string& get_omap_prefix();
    void get_omap_header(string *out);
    void get_omap_key(const string& key, string *out);
    void rewrite_omap_key(const string& old, string *out);
    void get_omap_tail(string *out);
    void decode_omap_key(const string& key, string *user_key);
  };
  typedef boost::intrusive_ptr<Onode> OnodeRef;

  /// A generic Cache Shard
  struct CacheShard {
    CephContext *cct;
    PerfCounters *logger;

    /// protect lru and other structures
    ceph::recursive_mutex lock = {
      ceph::make_recursive_mutex("BlueStore::CacheShard::lock") };

    std::atomic<uint64_t> max = {0};
    std::atomic<uint64_t> num = {0};

    CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {}
    virtual ~CacheShard() {}

    void set_max(uint64_t max_) {
      max = max_;
    }

    uint64_t _get_num() {
      return num;
    }

    virtual void _trim_to(uint64_t new_size) = 0;
    void _trim() {
      if (cct->_conf->objectstore_blackhole) {
        // do not trim if we are throwing away IOs a layer down
        return;
      }
      _trim_to(max);
    }

    void trim() {
      std::lock_guard l(lock);
      _trim();
    }
    void flush() {
      std::lock_guard l(lock);
      // we should not be shutting down after the blackhole is enabled
      assert(!cct->_conf->objectstore_blackhole);
      _trim_to(0);
    }

#ifdef DEBUG_CACHE
    virtual void _audit(const char *s) = 0;
#else
    void _audit(const char *s) { /* no-op */ }
#endif
  };

  /// A generic onode Cache Shard
  struct OnodeCacheShard : public CacheShard {
    std::atomic<uint64_t> num_pinned = {0};

    std::array<std::pair<ghobject_t, mono_clock::time_point>, 64> dumped_onodes;

    virtual void _pin(Onode* o) = 0;
    virtual void _unpin(Onode* o) = 0;

  public:
    OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
    static OnodeCacheShard *create(CephContext* cct, string type,
                                   PerfCounters *logger);
    virtual void _add(Onode* o, int level) = 0;
    virtual void _rm(Onode* o) = 0;

    void pin(Onode* o, std::function<bool ()> validator) {
      std::lock_guard l(lock);
      if (validator()) {
        _pin(o);
      }
    }

    void unpin(Onode* o, std::function<bool()> validator) {
      std::lock_guard l(lock);
      if (validator()) {
        _unpin(o);
      }
    }

    virtual void move_pinned(OnodeCacheShard *to, Onode *o) = 0;
    virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0;
    bool empty() {
      return _get_num() == 0;
    }
  };

  /// A generic buffer Cache Shard
  struct BufferCacheShard : public CacheShard {
    std::atomic<uint64_t> num_extents = {0};
    std::atomic<uint64_t> num_blobs = {0};
    uint64_t buffer_bytes = 0;

  public:
    BufferCacheShard(CephContext* cct) : CacheShard(cct) {}
    static BufferCacheShard *create(CephContext* cct, string type,
                                    PerfCounters *logger);
    virtual void _add(Buffer *b, int level, Buffer *near) = 0;
    virtual void _rm(Buffer *b) = 0;
    virtual void _move(BufferCacheShard *src, Buffer *b) = 0;
    virtual void _touch(Buffer *b) = 0;
    virtual void _adjust_size(Buffer *b, int64_t delta) = 0;

    uint64_t _get_bytes() {
      return buffer_bytes;
    }

    void add_extent() {
      ++num_extents;
    }
    void rm_extent() {
      --num_extents;
    }

    void add_blob() {
      ++num_blobs;
    }
    void rm_blob() {
      --num_blobs;
    }

    virtual void add_stats(uint64_t *extents,
                           uint64_t *blobs,
                           uint64_t *buffers,
                           uint64_t *bytes) = 0;

    bool empty() {
      std::lock_guard l(lock);
      return _get_bytes() == 0;
    }
  };

  struct OnodeSpace {
    OnodeCacheShard *cache;

  private:
    /// forward lookups
    mempool::bluestore_cache_meta::unordered_map<ghobject_t,OnodeRef> onode_map;

    friend class Collection;  // for split_cache()

    friend struct LruOnodeCacheShard;
    void _remove(const ghobject_t& oid);
  public:
    OnodeSpace(OnodeCacheShard *c) : cache(c) {}
    ~OnodeSpace() {
      clear();
    }

    OnodeRef add(const ghobject_t& oid, OnodeRef& o);
    OnodeRef lookup(const ghobject_t& o);
    void rename(OnodeRef& o, const ghobject_t& old_oid,
                const ghobject_t& new_oid,
                const mempool::bluestore_cache_meta::string& new_okey);
    void clear();
    bool empty();

    template <int LogLevelV>
    void dump(CephContext *cct);

    /// return true if f true for any item
    bool map_any(std::function<bool(OnodeRef)> f);
  };

  class OpSequencer;
  using OpSequencerRef = ceph::ref_t<OpSequencer>;

  struct Collection : public CollectionImpl {
    BlueStore *store;
    OpSequencerRef osr;
    BufferCacheShard *cache;  ///< our cache shard
    bluestore_cnode_t cnode;
    ceph::shared_mutex lock =
      ceph::make_shared_mutex("BlueStore::Collection::lock", true, false);

    bool exists;

    SharedBlobSet shared_blob_set;  ///< open SharedBlobs

    // cache onodes on a per-collection basis to avoid lock
    // contention.
    OnodeSpace onode_map;

    // pool options
    pool_opts_t pool_opts;
    ContextQueue *commit_queue;

    OnodeCacheShard* get_onode_cache() const {
      return onode_map.cache;
    }
    OnodeRef get_onode(const ghobject_t& oid, bool create, bool is_createop=false);

    // the terminology is confusing here, sorry!
    //
    //  blob_t     shared_blob_t
    //  !shared    unused                -> open
    //  shared     !loaded               -> open + shared
    //  shared     loaded                -> open + shared + loaded
    //
    // i.e.,
    //  open = SharedBlob is instantiated
    //  shared = blob_t shared flag is set; SharedBlob is hashed.
    //  loaded = SharedBlob::shared_blob_t is loaded from kv store
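    //
    // A hedged sketch of the transitions in terms of the helpers declared
    // below: new_blob() yields an open SharedBlob; make_blob_shared() moves
    // it to open + shared (and the SharedBlobSet hashes it);
    // load_shared_blob() reads the persistent shared_blob_t from the kv
    // store, reaching open + shared + loaded; make_blob_unshared() walks
    // back to plain open.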
    void open_shared_blob(uint64_t sbid, BlobRef b);
    void load_shared_blob(SharedBlobRef sb);
    void make_blob_shared(uint64_t sbid, BlobRef b);
    uint64_t make_blob_unshared(SharedBlob *sb);

    BlobRef new_blob() {
      BlobRef b = new Blob();
      b->shared_blob = new SharedBlob(this);
      return b;
    }

    bool contains(const ghobject_t& oid) {
      if (cid.is_meta())
        return oid.hobj.pool == -1;
      spg_t spgid;
      if (cid.is_pg(&spgid))
        return
          spgid.pgid.contains(cnode.bits, oid) &&
          oid.shard_id == spgid.shard;
      return false;
    }

    int64_t pool() const {
      return cid.pool();
    }

    void split_cache(Collection *dest);

    bool flush_commit(Context *c) override;
    void flush() override;
    void flush_all_but_last();

    Collection(BlueStore *ns, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t c);
  };

  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
    CollectionRef c;
    OnodeRef o;
    KeyValueDB::Iterator it;
    string head, tail;

    string _stringify() const;

  public:
    OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
    int seek_to_first() override;
    int upper_bound(const string &after) override;
    int lower_bound(const string &to) override;
    bool valid() override;
    int next() override;
    string key() override;
    bufferlist value() override;
    std::string tail_key() {
      return tail;
    }

    int status() override {
      return 0;
    }
  };

  struct volatile_statfs {
    enum {
      STATFS_ALLOCATED = 0,
      STATFS_STORED,
      STATFS_COMPRESSED_ORIGINAL,
      STATFS_COMPRESSED,
      STATFS_COMPRESSED_ALLOCATED,
      STATFS_LAST
    };
    int64_t values[STATFS_LAST];
    volatile_statfs() {
      memset(this, 0, sizeof(volatile_statfs));
    }
    void reset() {
      *this = volatile_statfs();
    }
    void publish(store_statfs_t* buf) const {
      buf->allocated = allocated();
      buf->data_stored = stored();
      buf->data_compressed = compressed();
      buf->data_compressed_original = compressed_original();
      buf->data_compressed_allocated = compressed_allocated();
    }

    volatile_statfs& operator+=(const volatile_statfs& other) {
      for (size_t i = 0; i < STATFS_LAST; ++i) {
        values[i] += other.values[i];
      }
      return *this;
    }
    int64_t& allocated() {
      return values[STATFS_ALLOCATED];
    }
    int64_t& stored() {
      return values[STATFS_STORED];
    }
    int64_t& compressed_original() {
      return values[STATFS_COMPRESSED_ORIGINAL];
    }
    int64_t& compressed() {
      return values[STATFS_COMPRESSED];
    }
    int64_t& compressed_allocated() {
      return values[STATFS_COMPRESSED_ALLOCATED];
    }
    int64_t allocated() const {
      return values[STATFS_ALLOCATED];
    }
    int64_t stored() const {
      return values[STATFS_STORED];
    }
    int64_t compressed_original() const {
      return values[STATFS_COMPRESSED_ORIGINAL];
    }
    int64_t compressed() const {
      return values[STATFS_COMPRESSED];
    }
    int64_t compressed_allocated() const {
      return values[STATFS_COMPRESSED_ALLOCATED];
    }
    volatile_statfs& operator=(const store_statfs_t& st) {
      values[STATFS_ALLOCATED] = st.allocated;
      values[STATFS_STORED] = st.data_stored;
      values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
      values[STATFS_COMPRESSED] = st.data_compressed;
      values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
      return *this;
    }
    bool is_empty() {
      return values[STATFS_ALLOCATED] == 0 &&
        values[STATFS_STORED] == 0 &&
        values[STATFS_COMPRESSED] == 0 &&
        values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
        values[STATFS_COMPRESSED_ALLOCATED] == 0;
    }
    void decode(bufferlist::const_iterator& it) {
      using ceph::decode;
      for (size_t i = 0; i < STATFS_LAST; i++) {
        decode(values[i], it);
      }
    }

    void encode(bufferlist& bl) {
      using ceph::encode;
      for (size_t i = 0; i < STATFS_LAST; i++) {
        encode(values[i], bl);
      }
    }
  };
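
  // A hedged sketch of how this accumulator is typically used (the actual
  // bookkeeping happens in BlueStore.cc): a transaction adjusts its
  // TransContext::statfs_delta, e.g. statfs_delta.allocated() += newly
  // allocated bytes and statfs_delta.stored() += logical bytes written;
  // deltas are summed with operator+=, serialized with encode()/decode()
  // for persistence, and publish() copies the totals into a store_statfs_t
  // for statfs reporting.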
1502
11fdf7f2 1503 struct TransContext final : public AioContext {
31f18b77
FG
1504 MEMPOOL_CLASS_HELPERS();
1505
7c673cae
FG
1506 typedef enum {
1507 STATE_PREPARE,
1508 STATE_AIO_WAIT,
1509 STATE_IO_DONE,
1510 STATE_KV_QUEUED, // queued for kv_sync_thread submission
1511 STATE_KV_SUBMITTED, // submitted to kv; not yet synced
1512 STATE_KV_DONE,
1513 STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
1514 STATE_DEFERRED_CLEANUP, // remove deferred kv record
1515 STATE_DEFERRED_DONE,
1516 STATE_FINISHING,
1517 STATE_DONE,
1518 } state_t;
1519
1520 state_t state = STATE_PREPARE;
1521
1522 const char *get_state_name() {
1523 switch (state) {
1524 case STATE_PREPARE: return "prepare";
1525 case STATE_AIO_WAIT: return "aio_wait";
1526 case STATE_IO_DONE: return "io_done";
1527 case STATE_KV_QUEUED: return "kv_queued";
1528 case STATE_KV_SUBMITTED: return "kv_submitted";
1529 case STATE_KV_DONE: return "kv_done";
1530 case STATE_DEFERRED_QUEUED: return "deferred_queued";
1531 case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
1532 case STATE_DEFERRED_DONE: return "deferred_done";
1533 case STATE_FINISHING: return "finishing";
1534 case STATE_DONE: return "done";
1535 }
1536 return "???";
1537 }
1538
9f95a23c 1539#if defined(WITH_LTTNG)
7c673cae
FG
1540 const char *get_state_latency_name(int state) {
1541 switch (state) {
1542 case l_bluestore_state_prepare_lat: return "prepare";
1543 case l_bluestore_state_aio_wait_lat: return "aio_wait";
1544 case l_bluestore_state_io_done_lat: return "io_done";
1545 case l_bluestore_state_kv_queued_lat: return "kv_queued";
1546 case l_bluestore_state_kv_committing_lat: return "kv_committing";
1547 case l_bluestore_state_kv_done_lat: return "kv_done";
1548 case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
1549 case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
1550 case l_bluestore_state_finishing_lat: return "finishing";
1551 case l_bluestore_state_done_lat: return "done";
1552 }
1553 return "???";
1554 }
1555#endif
1556
11fdf7f2
TL
1557 CollectionRef ch;
1558 OpSequencerRef osr; // this should be ch->osr
7c673cae
FG
1559 boost::intrusive::list_member_hook<> sequencer_item;
1560
9f95a23c 1561 uint64_t bytes = 0, ios = 0, cost = 0;
7c673cae
FG
1562
1563 set<OnodeRef> onodes; ///< these need to be updated/written
1564 set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
1565 set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
1566 set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
1567
1568 KeyValueDB::Transaction t; ///< then we will commit this
7c673cae
FG
1569 list<Context*> oncommits; ///< more commit completions
1570 list<CollectionRef> removed_collections; ///< colls we removed
1571
1572 boost::intrusive::list_member_hook<> deferred_queue_item;
1573 bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
1574
1575 interval_set<uint64_t> allocated, released;
11fdf7f2
TL
1576 volatile_statfs statfs_delta; ///< overall store statistics delta
1577 uint64_t osd_pool_id = META_POOL_ID; ///< osd pool id we're operating on
1578
7c673cae
FG
1579 IOContext ioc;
1580 bool had_ios = false; ///< true if we submitted IOs before our kv txn
1581
7c673cae 1582 uint64_t seq = 0;
9f95a23c
TL
1583 mono_clock::time_point start;
1584 mono_clock::time_point last_stamp;
7c673cae
FG
1585
1586 uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated
1587 uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated
1588
9f95a23c
TL
1589#if defined(WITH_LTTNG)
1590 bool tracing = false;
1591#endif
1592
11fdf7f2
TL
1593 explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
1594 list<Context*> *on_commits)
1595 : ch(c),
1596 osr(o),
7c673cae 1597 ioc(cct, this),
9f95a23c 1598 start(mono_clock::now()) {
7c673cae 1599 last_stamp = start;
11fdf7f2
TL
1600 if (on_commits) {
1601 oncommits.swap(*on_commits);
1602 }
7c673cae
FG
1603 }
1604 ~TransContext() {
1605 delete deferred_txn;
1606 }
1607
1608 void write_onode(OnodeRef &o) {
1609 onodes.insert(o);
1610 }
1611 void write_shared_blob(SharedBlobRef &sb) {
1612 shared_blobs.insert(sb);
1613 }
31f18b77
FG
1614 void unshare_blob(SharedBlob *sb) {
1615 shared_blobs.erase(sb);
1616 }
1617
7c673cae
FG
1618 /// note we logically modified object (when onode itself is unmodified)
1619 void note_modified_object(OnodeRef &o) {
1620 // onode itself isn't written, though
1621 modified_objects.insert(o);
1622 }
a8e16298 1623 void note_removed_object(OnodeRef& o) {
7c673cae 1624 onodes.erase(o);
a8e16298 1625 modified_objects.insert(o);
7c673cae
FG
1626 }
1627
1628 void aio_finish(BlueStore *store) override {
1629 store->txc_aio_finish(this);
1630 }
1631 };
1632
9f95a23c
TL
1633 class BlueStoreThrottle {
1634#if defined(WITH_LTTNG)
1635 const std::chrono::time_point<mono_clock> time_base = mono_clock::now();
1636
1637 // Time of last chosen io (microseconds)
1638 std::atomic<uint64_t> previous_emitted_tp_time_mono_mcs = {0};
1639 std::atomic<uint64_t> ios_started_since_last_traced = {0};
1640 std::atomic<uint64_t> ios_completed_since_last_traced = {0};
1641
1642 std::atomic_uint pending_kv_ios = {0};
1643 std::atomic_uint pending_deferred_ios = {0};
1644
1645 // Min period between trace points (microseconds)
1646 std::atomic<uint64_t> trace_period_mcs = {0};
1647
1648 bool should_trace(
1649 uint64_t *started,
1650 uint64_t *completed) {
1651 uint64_t min_period_mcs = trace_period_mcs.load(
1652 std::memory_order_relaxed);
1653
1654 if (min_period_mcs == 0) {
1655 *started = 1;
1656 *completed = ios_completed_since_last_traced.exchange(0);
1657 return true;
1658 } else {
1659 ios_started_since_last_traced++;
1660 auto now_mcs = ceph::to_microseconds<uint64_t>(
1661 mono_clock::now() - time_base);
1662 uint64_t previous_mcs = previous_emitted_tp_time_mono_mcs;
1663 uint64_t period_mcs = now_mcs - previous_mcs;
1664 if (period_mcs > min_period_mcs) {
1665 if (previous_emitted_tp_time_mono_mcs.compare_exchange_strong(
1666 previous_mcs, now_mcs)) {
1667 // This would be racy at a sufficiently extreme trace rate, but isn't
1668 // worth the overhead of doing it more carefully.
1669 *started = ios_started_since_last_traced.exchange(0);
1670 *completed = ios_completed_since_last_traced.exchange(0);
1671 return true;
1672 }
1673 }
1674 return false;
1675 }
1676 }
1677#endif
1678
1679#if defined(WITH_LTTNG)
1680 void emit_initial_tracepoint(
1681 KeyValueDB &db,
1682 TransContext &txc,
1683 mono_clock::time_point);
1684#else
1685 void emit_initial_tracepoint(
1686 KeyValueDB &db,
1687 TransContext &txc,
1688 mono_clock::time_point) {}
1689#endif
1690
1691 Throttle throttle_bytes; ///< submit to commit
1692 Throttle throttle_deferred_bytes; ///< submit to deferred complete
1693
1694 public:
1695 BlueStoreThrottle(CephContext *cct) :
1696 throttle_bytes(cct, "bluestore_throttle_bytes", 0),
1697 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0)
1698 {
1699 reset_throttle(cct->_conf);
1700 }
1701
1702#if defined(WITH_LTTNG)
1703 void complete_kv(TransContext &txc);
1704 void complete(TransContext &txc);
1705#else
1706 void complete_kv(TransContext &txc) {}
1707 void complete(TransContext &txc) {}
1708#endif
1709
1710 mono_clock::duration log_state_latency(
1711 TransContext &txc, PerfCounters *logger, int state);
1712 bool try_start_transaction(
1713 KeyValueDB &db,
1714 TransContext &txc,
1715 mono_clock::time_point);
1716 void finish_start_transaction(
1717 KeyValueDB &db,
1718 TransContext &txc,
1719 mono_clock::time_point);
1720 void release_kv_throttle(uint64_t cost) {
1721 throttle_bytes.put(cost);
1722 }
1723 void release_deferred_throttle(uint64_t cost) {
1724 throttle_deferred_bytes.put(cost);
1725 }
1726 bool should_submit_deferred() {
1727 return throttle_deferred_bytes.past_midpoint();
1728 }
1729 void reset_throttle(const ConfigProxy &conf) {
1730 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
1731 throttle_deferred_bytes.reset_max(
1732 conf->bluestore_throttle_bytes +
1733 conf->bluestore_throttle_deferred_bytes);
1734#if defined(WITH_LTTNG)
1735 double rate = conf.get_val<double>("bluestore_throttle_trace_rate");
1736 trace_period_mcs = rate > 0 ? floor((1/rate) * 1000000.0) : 0;
1737#endif
1738 }
1739 } throttle;
1740
7c673cae
FG
1741 typedef boost::intrusive::list<
1742 TransContext,
1743 boost::intrusive::member_hook<
1744 TransContext,
1745 boost::intrusive::list_member_hook<>,
1746 &TransContext::deferred_queue_item> > deferred_queue_t;
1747
11fdf7f2 1748 struct DeferredBatch final : public AioContext {
7c673cae
FG
1749 OpSequencer *osr;
1750 struct deferred_io {
1751 bufferlist bl; ///< data
1752 uint64_t seq; ///< deferred transaction seq
1753 };
1754 map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
1755 deferred_queue_t txcs; ///< txcs in this batch
1756 IOContext ioc; ///< our aios
1757 /// bytes of pending io for each deferred seq (may be 0)
1758 map<uint64_t,int> seq_bytes;
1759
1760 void _discard(CephContext *cct, uint64_t offset, uint64_t length);
1761 void _audit(CephContext *cct);
1762
1763 DeferredBatch(CephContext *cct, OpSequencer *osr)
1764 : osr(osr), ioc(cct, this) {}
1765
1766 /// prepare a write
1767 void prepare_write(CephContext *cct,
1768 uint64_t seq, uint64_t offset, uint64_t length,
1769 bufferlist::const_iterator& p);
1770
1771 void aio_finish(BlueStore *store) override {
1772 store->_deferred_aio_finish(osr);
1773 }
1774 };
1775
11fdf7f2 1776 class OpSequencer : public RefCountedObject {
7c673cae 1777 public:
11fdf7f2
TL
1778 ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
1779 ceph::condition_variable qcond;
7c673cae
FG
1780 typedef boost::intrusive::list<
1781 TransContext,
1782 boost::intrusive::member_hook<
1783 TransContext,
1784 boost::intrusive::list_member_hook<>,
1785 &TransContext::sequencer_item> > q_list_t;
1786 q_list_t q; ///< transactions
1787
1788 boost::intrusive::list_member_hook<> deferred_osr_queue_item;
1789
1790 DeferredBatch *deferred_running = nullptr;
1791 DeferredBatch *deferred_pending = nullptr;
1792
7c673cae 1793 BlueStore *store;
11fdf7f2 1794 coll_t cid;
7c673cae
FG
1795
1796 uint64_t last_seq = 0;
1797
1798 std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
1799
1800 std::atomic_int kv_committing_serially = {0};
1801
1802 std::atomic_int kv_submitted_waiters = {0};
1803
11fdf7f2 1804 std::atomic_bool zombie = {false}; ///< in zombie_osr set (collection going away)
7c673cae 1805
9f95a23c
TL
1806 const uint32_t sequencer_id;
1807
1808 uint32_t get_sequencer_id() const {
1809 return sequencer_id;
7c673cae
FG
1810 }
1811
1812 void queue_new(TransContext *txc) {
11fdf7f2 1813 std::lock_guard l(qlock);
7c673cae
FG
1814 txc->seq = ++last_seq;
1815 q.push_back(*txc);
1816 }
1817
1818 void drain() {
11fdf7f2 1819 std::unique_lock l(qlock);
7c673cae
FG
1820 while (!q.empty())
1821 qcond.wait(l);
1822 }
1823
1824 void drain_preceding(TransContext *txc) {
11fdf7f2 1825 std::unique_lock l(qlock);
9f95a23c 1826 while (&q.front() != txc)
7c673cae
FG
1827 qcond.wait(l);
1828 }
1829
1830 bool _is_all_kv_submitted() {
11fdf7f2
TL
1831 // caller must hold qlock and q must not be empty
1832 ceph_assert(!q.empty());
7c673cae
FG
1833 TransContext *txc = &q.back();
1834 if (txc->state >= TransContext::STATE_KV_SUBMITTED) {
1835 return true;
1836 }
1837 return false;
1838 }
1839
11fdf7f2
TL
1840 void flush() {
1841 std::unique_lock l(qlock);
1842 while (true) {
1843 // set flag before the check because the condition
1844 // may become true outside qlock, and we need to make
1845 // sure those threads see waiters and signal qcond.
1846 ++kv_submitted_waiters;
1847 if (q.empty() || _is_all_kv_submitted()) {
1848 --kv_submitted_waiters;
1849 return;
1850 }
1851 qcond.wait(l);
1852 --kv_submitted_waiters;
1853 }
1854 }
1855
1856 void flush_all_but_last() {
1857 std::unique_lock l(qlock);
1858 ceph_assert(q.size() >= 1);
7c673cae
FG
1859 while (true) {
1860 // set flag before the check because the condition
1861 // may become true outside qlock, and we need to make
1862 // sure those threads see waiters and signal qcond.
1863 ++kv_submitted_waiters;
11fdf7f2
TL
1864 if (q.size() <= 1) {
1865 --kv_submitted_waiters;
7c673cae 1866 return;
11fdf7f2
TL
1867 } else {
1868 auto it = q.rbegin();
1869 it++;
1870 if (it->state >= TransContext::STATE_KV_SUBMITTED) {
eafe8130 1871 --kv_submitted_waiters;
11fdf7f2
TL
1872 return;
1873 }
7c673cae
FG
1874 }
1875 qcond.wait(l);
1876 --kv_submitted_waiters;
1877 }
1878 }
1879
11fdf7f2
TL
1880 bool flush_commit(Context *c) {
1881 std::lock_guard l(qlock);
7c673cae
FG
1882 if (q.empty()) {
1883 return true;
1884 }
1885 TransContext *txc = &q.back();
1886 if (txc->state >= TransContext::STATE_KV_DONE) {
1887 return true;
1888 }
1889 txc->oncommits.push_back(c);
1890 return false;
1891 }
9f95a23c
TL
1892 private:
1893 FRIEND_MAKE_REF(OpSequencer);
1894 OpSequencer(BlueStore *store, uint32_t sequencer_id, const coll_t& c)
1895 : RefCountedObject(store->cct),
1896 store(store), cid(c), sequencer_id(sequencer_id) {
1897 }
1898 ~OpSequencer() {
1899 ceph_assert(q.empty());
1900 }
7c673cae
FG
1901 };
1902
1903 typedef boost::intrusive::list<
1904 OpSequencer,
1905 boost::intrusive::member_hook<
1906 OpSequencer,
1907 boost::intrusive::list_member_hook<>,
1908 &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
1909
1910 struct KVSyncThread : public Thread {
1911 BlueStore *store;
1912 explicit KVSyncThread(BlueStore *s) : store(s) {}
1913 void *entry() override {
1914 store->_kv_sync_thread();
1915 return NULL;
1916 }
1917 };
31f18b77
FG
1918 struct KVFinalizeThread : public Thread {
1919 BlueStore *store;
1920 explicit KVFinalizeThread(BlueStore *s) : store(s) {}
1921 void *entry() {
1922 store->_kv_finalize_thread();
1923 return NULL;
1924 }
1925 };
7c673cae
FG
1926
1927 struct DBHistogram {
1928 struct value_dist {
1929 uint64_t count;
1930 uint32_t max_len;
1931 };
1932
1933 struct key_dist {
1934 uint64_t count;
1935 uint32_t max_len;
1936 map<int, struct value_dist> val_map; ///< value slab id to count and max value length
1937 };
1938
1939 map<string, map<int, struct key_dist> > key_hist;
1940 map<int, uint64_t> value_hist;
1941 int get_key_slab(size_t sz);
1942 string get_key_slab_to_range(int slab);
1943 int get_value_slab(size_t sz);
1944 string get_value_slab_to_range(int slab);
1945 void update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
1946 const string &prefix, size_t key_size, size_t value_size);
1947 void dump(Formatter *f);
1948 };
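  // Hedged sketch of the intended DBHistogram flow (the real bucketing and
  // iteration live in BlueStore.cc; the loop below is hypothetical):
  //
  //   DBHistogram hist;
  //   for each key/value pair visited while walking the KV store:
  //     hist.update_hist_entry(hist.key_hist, prefix, key_size, value_size);
  //     hist.value_hist[hist.get_value_slab(value_size)]++;
  //   hist.dump(f);  // emit per-prefix, per-slab counts via the Formatter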
1949
1950 // --------------------------------------------------------
1951 // members
1952private:
1953 BlueFS *bluefs = nullptr;
9f95a23c 1954 bluefs_layout_t bluefs_layout;
11fdf7f2
TL
1955 mono_time bluefs_last_balance;
1956 utime_t next_dump_on_bluefs_alloc_failure;
7c673cae
FG
1957
1958 KeyValueDB *db = nullptr;
1959 BlockDevice *bdev = nullptr;
1960 std::string freelist_type;
1961 FreelistManager *fm = nullptr;
1962 Allocator *alloc = nullptr;
1963 uuid_d fsid;
1964 int path_fd = -1; ///< open handle to $path
1965 int fsid_fd = -1; ///< open handle (locked) to $path/fsid
1966 bool mounted = false;
1967
9f95a23c 1968 ceph::shared_mutex coll_lock = ceph::make_shared_mutex("BlueStore::coll_lock"); ///< rwlock to protect coll_map
31f18b77 1969 mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
eafe8130 1970 bool collections_had_errors = false;
11fdf7f2 1971 map<coll_t,CollectionRef> new_coll_map;
7c673cae 1972
9f95a23c
TL
1973 vector<OnodeCacheShard*> onode_cache_shards;
1974 vector<BufferCacheShard*> buffer_cache_shards;
7c673cae 1975
11fdf7f2
TL
1976 /// protect zombie_osr_set
1977 ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
9f95a23c 1978 uint32_t next_sequencer_id = 0;
11fdf7f2 1979 std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections
7c673cae
FG
1980
1981 std::atomic<uint64_t> nid_last = {0};
1982 std::atomic<uint64_t> nid_max = {0};
1983 std::atomic<uint64_t> blobid_last = {0};
1984 std::atomic<uint64_t> blobid_max = {0};
1985
7c673cae
FG
1986 interval_set<uint64_t> bluefs_extents; ///< block extents owned by bluefs
1987 interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
1988
11fdf7f2 1989 ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
7c673cae
FG
1990 std::atomic<uint64_t> deferred_seq = {0};
1991 deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
1992 int deferred_queue_size = 0; ///< num txc's queued across all osrs
1993 atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
9f95a23c
TL
1994 Finisher finisher;
1995 utime_t deferred_last_submitted = utime_t();
7c673cae
FG
1996
1997 KVSyncThread kv_sync_thread;
11fdf7f2
TL
1998 ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
1999 ceph::condition_variable kv_cond;
3efd9988 2000 bool _kv_only = false;
31f18b77 2001 bool kv_sync_started = false;
7c673cae 2002 bool kv_stop = false;
31f18b77
FG
2003 bool kv_finalize_started = false;
2004 bool kv_finalize_stop = false;
7c673cae
FG
2005 deque<TransContext*> kv_queue; ///< ready, already submitted
2006 deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
2007 deque<TransContext*> kv_committing; ///< currently syncing
2008 deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
9f95a23c 2009 bool kv_sync_in_progress = false;
7c673cae 2010
31f18b77 2011 KVFinalizeThread kv_finalize_thread;
11fdf7f2
TL
2012 ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
2013 ceph::condition_variable kv_finalize_cond;
31f18b77
FG
2014 deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
2015 deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
9f95a23c 2016 bool kv_finalize_in_progress = false;
31f18b77 2017
7c673cae
FG
2018 PerfCounters *logger = nullptr;
2019
7c673cae
FG
2020 list<CollectionRef> removed_collections;
2021
9f95a23c
TL
2022 ceph::shared_mutex debug_read_error_lock =
2023 ceph::make_shared_mutex("BlueStore::debug_read_error_lock");
7c673cae
FG
2024 set<ghobject_t> debug_data_error_objects;
2025 set<ghobject_t> debug_mdata_error_objects;
2026
2027 std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
2028
2029 uint64_t block_size = 0; ///< block size of block device (power of 2)
2030 uint64_t block_mask = 0; ///< mask to get just the block offset
2031 size_t block_size_order = 0; ///< bits to shift to get block size
2032
9f95a23c 2033 uint64_t min_alloc_size; ///< minimum allocation unit (power of 2)
7c673cae 2034 ///< bits for min_alloc_size
224ce89b 2035 uint8_t min_alloc_size_order = 0;
7c673cae
FG
2036 static_assert(std::numeric_limits<uint8_t>::max() >
2037 std::numeric_limits<decltype(min_alloc_size)>::digits,
2038 "not enough bits for min_alloc_size");
2039
9f95a23c
TL
2040 bool per_pool_omap = false;
2041
7c673cae
FG
2042 ///< maximum allocation unit (power of 2)
2043 std::atomic<uint64_t> max_alloc_size = {0};
2044
224ce89b
WB
2045 ///< number threshold for forced deferred writes
2046 std::atomic<int> deferred_batch_ops = {0};
2047
2048 ///< size threshold for forced deferred writes
2049 std::atomic<uint64_t> prefer_deferred_size = {0};
2050
7c673cae
FG
2051 ///< approx cost per io, in bytes
2052 std::atomic<uint64_t> throttle_cost_per_io = {0};
2053
224ce89b
WB
2054 std::atomic<Compressor::CompressionMode> comp_mode =
2055 {Compressor::COMP_NONE}; ///< compression mode
7c673cae
FG
2056 CompressorRef compressor;
2057 std::atomic<uint64_t> comp_min_blob_size = {0};
2058 std::atomic<uint64_t> comp_max_blob_size = {0};
2059
2060 std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
2061
31f18b77
FG
2062 uint64_t kv_ios = 0;
2063 uint64_t kv_throttle_costs = 0;
2064
7c673cae 2065 // cache trim control
91327a77
AA
2066 uint64_t cache_size = 0; ///< total cache size
2067 double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
2068 double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
2069 double cache_data_ratio = 0; ///< cache ratio dedicated to object data
2070 bool cache_autotune = false; ///< cache autotune setting
91327a77
AA
2071 double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
2072 uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
2073 uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
2074 double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
11fdf7f2 2075 uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
91327a77 2076 double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
9f95a23c 2077 double max_defer_interval = 0; ///< Max time to wait between deferred submits
92f5a8d4 2078 std::atomic<uint32_t> config_changed = {0}; ///< Counter to determine if there is a configuration change.
11fdf7f2
TL
2079
2080 typedef map<uint64_t, volatile_statfs> osd_pools_map;
2081
2082 ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
31f18b77 2083 volatile_statfs vstatfs;
11fdf7f2
TL
2084 osd_pools_map osd_pools; // protected by vstatfs_lock as well
2085
2086 bool per_pool_stat_collection = true;
7c673cae
FG
2087
2088 struct MempoolThread : public Thread {
91327a77 2089 public:
7c673cae 2090 BlueStore *store;
91327a77 2091
11fdf7f2
TL
2092 ceph::condition_variable cond;
2093 ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
7c673cae 2094 bool stop = false;
11fdf7f2 2095 std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
eafe8130 2096 std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
91327a77
AA
2097
2098 struct MempoolCache : public PriorityCache::PriCache {
2099 BlueStore *store;
11fdf7f2
TL
2100 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
2101 int64_t committed_bytes = 0;
91327a77
AA
2102 double cache_ratio = 0;
2103
2104 MempoolCache(BlueStore *s) : store(s) {};
2105
2106 virtual uint64_t _get_used_bytes() const = 0;
2107
2108 virtual int64_t request_cache_bytes(
11fdf7f2 2109 PriorityCache::Priority pri, uint64_t total_cache) const {
91327a77
AA
2110 int64_t assigned = get_cache_bytes(pri);
2111
2112 switch (pri) {
eafe8130
TL
2113 // All cache items are currently shoved into the PRI1 priority
2114 case PriorityCache::Priority::PRI1:
91327a77 2115 {
11fdf7f2 2116 int64_t request = _get_used_bytes();
91327a77
AA
2117 return (request > assigned) ? request - assigned : 0;
2118 }
2119 default:
2120 break;
2121 }
2122 return -EOPNOTSUPP;
2123 }
2124
2125 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
2126 return cache_bytes[pri];
2127 }
2128 virtual int64_t get_cache_bytes() const {
2129 int64_t total = 0;
2130
2131 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
2132 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
2133 total += get_cache_bytes(pri);
2134 }
2135 return total;
2136 }
2137 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2138 cache_bytes[pri] = bytes;
2139 }
2140 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2141 cache_bytes[pri] += bytes;
2142 }
11fdf7f2
TL
2143 virtual int64_t commit_cache_size(uint64_t total_cache) {
2144 committed_bytes = PriorityCache::get_chunk(
2145 get_cache_bytes(), total_cache);
2146 return committed_bytes;
2147 }
2148 virtual int64_t get_committed_size() const {
2149 return committed_bytes;
91327a77
AA
2150 }
2151 virtual double get_cache_ratio() const {
2152 return cache_ratio;
2153 }
2154 virtual void set_cache_ratio(double ratio) {
2155 cache_ratio = ratio;
2156 }
2157 virtual string get_cache_name() const = 0;
2158 };
2159
2160 struct MetaCache : public MempoolCache {
2161 MetaCache(BlueStore *s) : MempoolCache(s) {};
2162
2163 virtual uint64_t _get_used_bytes() const {
f91f0fd5
TL
2164 return mempool::bluestore_Buffer::allocated_bytes() +
2165 mempool::bluestore_Blob::allocated_bytes() +
2166 mempool::bluestore_Extent::allocated_bytes() +
2167 mempool::bluestore_cache_meta::allocated_bytes() +
2168 mempool::bluestore_cache_other::allocated_bytes() +
2169 mempool::bluestore_cache_onode::allocated_bytes() +
2170 mempool::bluestore_SharedBlob::allocated_bytes() +
2171 mempool::bluestore_inline_bl::allocated_bytes();
91327a77
AA
2172 }
2173
2174 virtual string get_cache_name() const {
2175 return "BlueStore Meta Cache";
2176 }
2177
2178 uint64_t _get_num_onodes() const {
2179 uint64_t onode_num =
2180 mempool::bluestore_cache_onode::allocated_items();
2181 return (2 > onode_num) ? 2 : onode_num;
2182 }
2183
2184 double get_bytes_per_onode() const {
2185 return (double)_get_used_bytes() / (double)_get_num_onodes();
2186 }
11fdf7f2
TL
2187 };
2188 std::shared_ptr<MetaCache> meta_cache;
91327a77
AA
2189
2190 struct DataCache : public MempoolCache {
2191 DataCache(BlueStore *s) : MempoolCache(s) {};
2192
2193 virtual uint64_t _get_used_bytes() const {
2194 uint64_t bytes = 0;
9f95a23c
TL
2195 for (auto i : store->buffer_cache_shards) {
2196 bytes += i->_get_bytes();
91327a77
AA
2197 }
2198 return bytes;
2199 }
2200 virtual string get_cache_name() const {
2201 return "BlueStore Data Cache";
2202 }
11fdf7f2
TL
2203 };
2204 std::shared_ptr<DataCache> data_cache;
91327a77 2205
7c673cae
FG
2206 public:
2207 explicit MempoolThread(BlueStore *s)
2208 : store(s),
11fdf7f2
TL
2209 meta_cache(new MetaCache(s)),
2210 data_cache(new DataCache(s)) {}
91327a77 2211
7c673cae
FG
2212 void *entry() override;
2213 void init() {
11fdf7f2 2214 ceph_assert(stop == false);
7c673cae
FG
2215 create("bstore_mempool");
2216 }
2217 void shutdown() {
11fdf7f2 2218 lock.lock();
7c673cae 2219 stop = true;
11fdf7f2
TL
2220 cond.notify_all();
2221 lock.unlock();
7c673cae
FG
2222 join();
2223 }
91327a77
AA
2224
2225 private:
2226 void _adjust_cache_settings();
92f5a8d4 2227 void _update_cache_settings();
9f95a23c 2228 void _resize_shards(bool interval_stats);
7c673cae
FG
2229 } mempool_thread;
2230
2231 // --------------------------------------------------------
2232 // private methods
2233
2234 void _init_logger();
2235 void _shutdown_logger();
2236 int _reload_logger();
2237
2238 int _open_path();
2239 void _close_path();
2240 int _open_fsid(bool create);
2241 int _lock_fsid();
2242 int _read_fsid(uuid_d *f);
2243 int _write_fsid();
2244 void _close_fsid();
2245 void _set_alloc_sizes();
2246 void _set_blob_size();
1adf2230 2247 void _set_finisher_num();
9f95a23c 2248 void _set_per_pool_omap();
92f5a8d4 2249 void _update_osd_memory_options();
7c673cae
FG
2250
2251 int _open_bdev(bool create);
11fdf7f2
TL
2252 // Verifies that disk space is sufficient for reserved + min bluefs
2253 // and alters the latter if needed.
2254 // Depends on min_alloc_size, hence should be called after
2255 // its initialization (and outside of _open_bdev).
2256 void _validate_bdev();
7c673cae 2257 void _close_bdev();
11fdf7f2
TL
2258
2259 int _minimal_open_bluefs(bool create);
2260 void _minimal_close_bluefs();
2261 int _open_bluefs(bool create);
1911f103 2262 void _close_bluefs(bool cold_close);
11fdf7f2
TL
2263
2264 // Limited (u)mount intended for BlueFS operations only
2265 int _mount_for_bluefs();
2266 void _umount_for_bluefs();
2267
2268
2269 int _is_bluefs(bool create, bool* ret);
2270 /*
2271 * opens both the DB and its dependent super_meta, FreelistManager and allocator
2272 * in the proper order
2273 */
2274 int _open_db_and_around(bool read_only);
1911f103 2275 void _close_db_and_around(bool read_only);
11fdf7f2
TL
2276
2277 // Updates legacy bluefs-related records in the DB to a state valid for
2278 // downgrades from Nautilus.
2279 void _sync_bluefs_and_fm();
2280
2281 /*
2282 * @warning to_repair_db means that we open this db to repair it and will
2283 * not hold rocksdb's file lock.
2284 */
2285 int _open_db(bool create,
2286 bool to_repair_db=false,
2287 bool read_only = false);
1911f103
TL
2288 void _close_db(bool read_only);
2289 int _open_fm(KeyValueDB::Transaction t, bool read_only);
7c673cae 2290 void _close_fm();
1911f103
TL
2291 int _write_out_fm_meta(uint64_t target_size,
2292 bool update_root_size = false,
2293 bluestore_bdev_label_t* res_label = nullptr);
7c673cae
FG
2294 int _open_alloc();
2295 void _close_alloc();
eafe8130
TL
2296 int _open_collections();
2297 void _fsck_collections(int64_t* errors);
7c673cae
FG
2298 void _close_collections();
2299
2300 int _setup_block_symlink_or_file(string name, string path, uint64_t size,
2301 bool create);
2302
7c673cae 2303public:
9f95a23c
TL
2304 utime_t get_deferred_last_submitted() {
2305 std::lock_guard l(deferred_lock);
2306 return deferred_last_submitted;
2307 }
2308
3efd9988
FG
2309 static int _write_bdev_label(CephContext* cct,
2310 string path, bluestore_bdev_label_t label);
7c673cae
FG
2311 static int _read_bdev_label(CephContext* cct, string path,
2312 bluestore_bdev_label_t *label);
2313private:
2314 int _check_or_set_bdev_label(string path, uint64_t size, string desc,
2315 bool create);
2316
2317 int _open_super_meta();
2318
224ce89b 2319 void _open_statfs();
11fdf7f2 2320 void _get_statfs_overall(struct store_statfs_t *buf);
31f18b77 2321
11fdf7f2
TL
2322 void _dump_alloc_on_failure();
2323
2324 int64_t _get_bluefs_size_delta(uint64_t bluefs_free, uint64_t bluefs_total);
2325 int _balance_bluefs_freespace();
7c673cae
FG
2326
2327 CollectionRef _get_collection(const coll_t& cid);
2328 void _queue_reap_collection(CollectionRef& c);
2329 void _reap_collections();
2330 void _update_cache_logger();
2331
2332 void _assign_nid(TransContext *txc, OnodeRef o);
2333 uint64_t _assign_blobid(TransContext *txc);
2334
81eedcae
TL
2335 template <int LogLevelV>
2336 friend void _dump_onode(CephContext *cct, const Onode& o);
2337 template <int LogLevelV>
2338 friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
2339 template <int LogLevelV>
2340 friend void _dump_transaction(CephContext *cct, Transaction *t);
7c673cae 2341
11fdf7f2
TL
2342 TransContext *_txc_create(Collection *c, OpSequencer *osr,
2343 list<Context*> *on_commits);
7c673cae
FG
2344 void _txc_update_store_statfs(TransContext *txc);
2345 void _txc_add_transaction(TransContext *txc, Transaction *t);
2346 void _txc_calc_cost(TransContext *txc);
2347 void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
2348 void _txc_state_proc(TransContext *txc);
2349 void _txc_aio_submit(TransContext *txc);
2350public:
2351 void txc_aio_finish(void *p) {
2352 _txc_state_proc(static_cast<TransContext*>(p));
2353 }
2354private:
2355 void _txc_finish_io(TransContext *txc);
2356 void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
9f95a23c 2357 void _txc_apply_kv(TransContext *txc, bool sync_submit_transaction);
7c673cae
FG
2358 void _txc_committed_kv(TransContext *txc);
2359 void _txc_finish(TransContext *txc);
2360 void _txc_release_alloc(TransContext *txc);
2361
11fdf7f2
TL
2362 void _osr_attach(Collection *c);
2363 void _osr_register_zombie(OpSequencer *osr);
2364 void _osr_drain(OpSequencer *osr);
7c673cae
FG
2365 void _osr_drain_preceding(TransContext *txc);
2366 void _osr_drain_all();
7c673cae 2367
31f18b77
FG
2368 void _kv_start();
2369 void _kv_stop();
7c673cae 2370 void _kv_sync_thread();
31f18b77 2371 void _kv_finalize_thread();
7c673cae 2372
9f95a23c 2373 bluestore_deferred_op_t *_get_deferred_op(TransContext *txc);
7c673cae 2374 void _deferred_queue(TransContext *txc);
3efd9988 2375public:
224ce89b 2376 void deferred_try_submit();
3efd9988 2377private:
224ce89b 2378 void _deferred_submit_unlock(OpSequencer *osr);
7c673cae
FG
2379 void _deferred_aio_finish(OpSequencer *osr);
2380 int _deferred_replay();
2381
2382public:
2383 using mempool_dynamic_bitset =
2384 boost::dynamic_bitset<uint64_t,
2385 mempool::bluestore_fsck::pool_allocator<uint64_t>>;
eafe8130
TL
2386 using per_pool_statfs =
2387 mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;
2388
2389 enum FSCKDepth {
2390 FSCK_REGULAR,
2391 FSCK_DEEP,
2392 FSCK_SHALLOW
2393 };
9f95a23c
TL
2394 enum {
2395 MAX_FSCK_ERROR_LINES = 100,
2396 };
7c673cae
FG
2397
2398private:
2399 int _fsck_check_extents(
11fdf7f2 2400 const coll_t& cid,
7c673cae
FG
2401 const ghobject_t& oid,
2402 const PExtentVector& extents,
2403 bool compressed,
2404 mempool_dynamic_bitset &used_blocks,
b32b8144 2405 uint64_t granularity,
11fdf7f2 2406 BlueStoreRepairer* repairer,
eafe8130
TL
2407 store_statfs_t& expected_statfs,
2408 FSCKDepth depth);
7c673cae 2409
11fdf7f2
TL
2410 void _fsck_check_pool_statfs(
2411 per_pool_statfs& expected_pool_statfs,
eafe8130
TL
2412 int64_t& errors,
2413 int64_t &warnings,
11fdf7f2
TL
2414 BlueStoreRepairer* repairer);
2415
eafe8130
TL
2416 int _fsck(FSCKDepth depth, bool repair);
2417 int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
2418
7c673cae
FG
2419 void _buffer_cache_write(
2420 TransContext *txc,
2421 BlobRef b,
2422 uint64_t offset,
2423 bufferlist& bl,
2424 unsigned flags) {
2425 b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
2426 flags);
2427 txc->shared_blobs_written.insert(b->shared_blob);
2428 }
2429
2430 int _collection_list(
2431 Collection *c, const ghobject_t& start, const ghobject_t& end,
f91f0fd5 2432 int max, bool legacy, vector<ghobject_t> *ls, ghobject_t *next);
7c673cae
FG
2433
2434 template <typename T, typename F>
2435 T select_option(const std::string& opt_name, T val1, F f) {
2436 // NB: opt_name is reserved for future use
2437 boost::optional<T> val2 = f();
2438 if (val2) {
2439 return *val2;
2440 }
2441 return val1;
2442 }
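  // Illustrative use of select_option() (a hedged sketch; the lambda and the
  // per-pool values below are hypothetical, not actual BlueStore state):
  //
  //   uint64_t csum_order = select_option(
  //     "csum_order", default_csum_order,
  //     [&]() -> boost::optional<uint64_t> {
  //       if (have_pool_override)
  //         return pool_csum_order;              // override wins when present
  //       return boost::optional<uint64_t>();    // fall back to default_csum_order
  //     });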
2443
2444 void _apply_padding(uint64_t head_pad,
2445 uint64_t tail_pad,
7c673cae
FG
2446 bufferlist& padded);
2447
11fdf7f2
TL
2448 void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
2449
7c673cae
FG
2450 // -- ondisk version ---
2451public:
1911f103 2452 const int32_t latest_ondisk_format = 4; ///< our version
7c673cae 2453 const int32_t min_readable_ondisk_format = 1; ///< what we can read
9f95a23c 2454 const int32_t min_compat_ondisk_format = 3; ///< who can read us
7c673cae
FG
2455
2456private:
2457 int32_t ondisk_format = 0; ///< value detected on mount
2458
2459 int _upgrade_super(); ///< upgrade (called during open_super)
11fdf7f2 2460 uint64_t _get_ondisk_reserved() const;
7c673cae
FG
2461 void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
2462
2463 // --- public interface ---
2464public:
2465 BlueStore(CephContext *cct, const string& path);
2466 BlueStore(CephContext *cct, const string& path, uint64_t min_alloc_size); // Ctor for UT only
2467 ~BlueStore() override;
2468
2469 string get_type() override {
2470 return "bluestore";
2471 }
2472
2473 bool needs_journal() override { return false; };
2474 bool wants_journal() override { return false; };
2475 bool allows_journal() override { return false; };
2476
9f95a23c
TL
2477 uint64_t get_min_alloc_size() const override {
2478 return min_alloc_size;
2479 }
2480
11fdf7f2
TL
2481 int get_devices(set<string> *ls) override;
2482
31f18b77 2483 bool is_rotational() override;
d2e6a577 2484 bool is_journal_rotational() override;
31f18b77 2485
224ce89b
WB
2486 string get_default_device_class() override {
2487 string device_class;
2488 map<string, string> metadata;
2489 collect_metadata(&metadata);
2490 auto it = metadata.find("bluestore_bdev_type");
2491 if (it != metadata.end()) {
2492 device_class = it->second;
2493 }
2494 return device_class;
2495 }
2496
11fdf7f2
TL
2497 int get_numa_node(
2498 int *numa_node,
2499 set<int> *nodes,
2500 set<string> *failed) override;
2501
7c673cae
FG
2502 static int get_block_device_fsid(CephContext* cct, const string& path,
2503 uuid_d *fsid);
2504
2505 bool test_mount_in_use() override;
2506
2507private:
11fdf7f2 2508 int _mount(bool kv_only, bool open_db=true);
7c673cae
FG
2509public:
2510 int mount() override {
2511 return _mount(false);
2512 }
2513 int umount() override;
2514
11fdf7f2
TL
2515 int start_kv_only(KeyValueDB **pdb, bool open_db=true) {
2516 int r = _mount(true, open_db);
7c673cae
FG
2517 if (r < 0)
2518 return r;
2519 *pdb = db;
2520 return 0;
2521 }
2522
3efd9988
FG
2523 int write_meta(const std::string& key, const std::string& value) override;
2524 int read_meta(const std::string& key, std::string *value) override;
2525
eafe8130
TL
2526 int cold_open();
2527 int cold_close();
3efd9988
FG
2528
2529 int fsck(bool deep) override {
eafe8130 2530 return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
3efd9988
FG
2531 }
2532 int repair(bool deep) override {
eafe8130
TL
2533 return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
2534 }
2535 int quick_fix() override {
2536 return _fsck(FSCK_SHALLOW, true);
3efd9988 2537 }
7c673cae
FG
2538
2539 void set_cache_shards(unsigned num) override;
11fdf7f2
TL
2540 void dump_cache_stats(Formatter *f) override {
2541 int onode_count = 0, buffers_bytes = 0;
9f95a23c
TL
2542 for (auto i: onode_cache_shards) {
2543 onode_count += i->_get_num();
2544 }
2545 for (auto i: buffer_cache_shards) {
2546 buffers_bytes += i->_get_bytes();
11fdf7f2
TL
2547 }
2548 f->dump_int("bluestore_onode", onode_count);
2549 f->dump_int("bluestore_buffers", buffers_bytes);
2550 }
2551 void dump_cache_stats(ostream& ss) override {
2552 int onode_count = 0, buffers_bytes = 0;
9f95a23c
TL
2553 for (auto i: onode_cache_shards) {
2554 onode_count += i->_get_num();
2555 }
2556 for (auto i: buffer_cache_shards) {
2557 buffers_bytes += i->_get_bytes();
11fdf7f2
TL
2558 }
2559 ss << "bluestore_onode: " << onode_count;
2560 ss << "bluestore_buffers: " << buffers_bytes;
2561 }
7c673cae
FG
2562
2563 int validate_hobject_key(const hobject_t &obj) const override {
2564 return 0;
2565 }
2566 unsigned get_max_attr_name_length() override {
2567 return 256; // arbitrary; there is no real limit internally
2568 }
2569
2570 int mkfs() override;
2571 int mkjournal() override {
2572 return 0;
2573 }
2574
2575 void get_db_statistics(Formatter *f) override;
2576 void generate_db_histogram(Formatter *f) override;
f6b5b4d7 2577 void _shutdown_cache();
11fdf7f2 2578 int flush_cache(ostream *os = NULL) override;
7c673cae
FG
2579 void dump_perf_counters(Formatter *f) override {
2580 f->open_object_section("perf_counters");
2581 logger->dump_formatted(f, false);
2582 f->close_section();
2583 }
2584
11fdf7f2
TL
2585 int add_new_bluefs_device(int id, const string& path);
2586 int migrate_to_existing_bluefs_device(const set<int>& devs_source,
2587 int id);
2588 int migrate_to_new_bluefs_device(const set<int>& devs_source,
2589 int id,
2590 const string& path);
2591 int expand_devices(ostream& out);
2592 string get_device_path(unsigned id);
7c673cae 2593
1911f103
TL
2594 int dump_bluefs_sizes(ostream& out);
2595
7c673cae 2596public:
11fdf7f2
TL
2597 int statfs(struct store_statfs_t *buf,
2598 osd_alert_list_t* alerts = nullptr) override;
9f95a23c
TL
2599 int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
2600 bool *per_pool_omap) override;
7c673cae
FG
2601
2602 void collect_metadata(map<string,string> *pm) override;
2603
7c673cae
FG
2604 bool exists(CollectionHandle &c, const ghobject_t& oid) override;
2605 int set_collection_opts(
11fdf7f2 2606 CollectionHandle& c,
7c673cae 2607 const pool_opts_t& opts) override;
7c673cae
FG
2608 int stat(
2609 CollectionHandle &c,
2610 const ghobject_t& oid,
2611 struct stat *st,
2612 bool allow_eio = false) override;
7c673cae
FG
2613 int read(
2614 CollectionHandle &c,
2615 const ghobject_t& oid,
2616 uint64_t offset,
2617 size_t len,
2618 bufferlist& bl,
224ce89b 2619 uint32_t op_flags = 0) override;
9f95a23c
TL
2620
2621private:
2622
2623 // --------------------------------------------------------
2624 // intermediate data structures used while reading
2625 struct region_t {
2626 uint64_t logical_offset;
2627 uint64_t blob_xoffset; //region offset within the blob
2628 uint64_t length;
2629
2630 // used later in read process
2631 uint64_t front = 0;
2632
2633 region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
2634 : logical_offset(offset),
2635 blob_xoffset(b_offs),
2636 length(len),
2637 front(front){}
2638 region_t(const region_t& from)
2639 : logical_offset(from.logical_offset),
2640 blob_xoffset(from.blob_xoffset),
2641 length(from.length),
2642 front(from.front){}
2643
2644 friend ostream& operator<<(ostream& out, const region_t& r) {
2645 return out << "0x" << std::hex << r.logical_offset << ":"
2646 << r.blob_xoffset << "~" << r.length << std::dec;
2647 }
2648 };
2649
2650 // merged blob read request
2651 struct read_req_t {
2652 uint64_t r_off = 0;
2653 uint64_t r_len = 0;
2654 bufferlist bl;
2655 std::list<region_t> regs; // original read regions
2656
2657 read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
2658
2659 friend ostream& operator<<(ostream& out, const read_req_t& r) {
2660 out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
2661 for (const auto& reg : r.regs)
2662 out << reg;
2663 return out << "]}" << std::dec;
2664 }
2665 };
2666
2667 typedef list<read_req_t> regions2read_t;
2668 typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
2669
2670 void _read_cache(
2671 OnodeRef o,
2672 uint64_t offset,
2673 size_t length,
2674 int read_cache_policy,
2675 ready_regions_t& ready_regions,
2676 blobs2read_t& blobs2read);
2677
2678
2679 int _prepare_read_ioc(
2680 blobs2read_t& blobs2read,
2681 vector<bufferlist>* compressed_blob_bls,
2682 IOContext* ioc);
2683
2684 int _generate_read_result_bl(
2685 OnodeRef o,
2686 uint64_t offset,
2687 size_t length,
2688 ready_regions_t& ready_regions,
2689 vector<bufferlist>& compressed_blob_bls,
2690 blobs2read_t& blobs2read,
2691 bool buffered,
2692 bool* csum_error,
2693 bufferlist& bl);
2694
7c673cae
FG
2695 int _do_read(
2696 Collection *c,
2697 OnodeRef o,
2698 uint64_t offset,
2699 size_t len,
2700 bufferlist& bl,
f64942e4
AA
2701 uint32_t op_flags = 0,
2702 uint64_t retry_count = 0);
7c673cae 2703
9f95a23c
TL
2704 int _do_readv(
2705 Collection *c,
2706 OnodeRef o,
2707 const interval_set<uint64_t>& m,
2708 bufferlist& bl,
2709 uint32_t op_flags = 0,
2710 uint64_t retry_count = 0);
2711
7c673cae
FG
2712 int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
2713 uint64_t offset, size_t len, interval_set<uint64_t>& destset);
2714public:
7c673cae
FG
2715 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2716 uint64_t offset, size_t len, bufferlist& bl) override;
7c673cae
FG
2717 int fiemap(CollectionHandle &c, const ghobject_t& oid,
2718 uint64_t offset, size_t len, map<uint64_t, uint64_t>& destmap) override;
2719
9f95a23c
TL
2720 int readv(
2721 CollectionHandle &c_,
2722 const ghobject_t& oid,
2723 interval_set<uint64_t>& m,
2724 bufferlist& bl,
2725 uint32_t op_flags) override;
2726
2727 int dump_onode(CollectionHandle &c, const ghobject_t& oid,
2728 const string& section_name, Formatter *f) override;
7c673cae 2729
7c673cae
FG
2730 int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
2731 bufferptr& value) override;
2732
7c673cae
FG
2733 int getattrs(CollectionHandle &c, const ghobject_t& oid,
2734 map<string,bufferptr>& aset) override;
2735
2736 int list_collections(vector<coll_t>& ls) override;
2737
2738 CollectionHandle open_collection(const coll_t &c) override;
11fdf7f2
TL
2739 CollectionHandle create_new_collection(const coll_t& cid) override;
2740 void set_collection_commit_queue(const coll_t& cid,
2741 ContextQueue *commit_queue) override;
7c673cae
FG
2742
2743 bool collection_exists(const coll_t& c) override;
11fdf7f2
TL
2744 int collection_empty(CollectionHandle& c, bool *empty) override;
2745 int collection_bits(CollectionHandle& c) override;
7c673cae 2746
7c673cae
FG
2747 int collection_list(CollectionHandle &c,
2748 const ghobject_t& start,
2749 const ghobject_t& end,
2750 int max,
2751 vector<ghobject_t> *ls, ghobject_t *next) override;
2752
f91f0fd5
TL
2753 int collection_list_legacy(CollectionHandle &c,
2754 const ghobject_t& start,
2755 const ghobject_t& end,
2756 int max,
2757 vector<ghobject_t> *ls,
2758 ghobject_t *next) override;
2759
7c673cae
FG
2760 int omap_get(
2761 CollectionHandle &c, ///< [in] Collection containing oid
2762 const ghobject_t &oid, ///< [in] Object containing omap
2763 bufferlist *header, ///< [out] omap header
2764 map<string, bufferlist> *out ///< [out] Key to value map
2765 ) override;
9f95a23c
TL
2766 int _omap_get(
2767 Collection *c, ///< [in] Collection containing oid
2768 const ghobject_t &oid, ///< [in] Object containing omap
2769 bufferlist *header, ///< [out] omap header
2770 map<string, bufferlist> *out ///< [out] Key to value map
2771 );
2772 int _onode_omap_get(
2773 const OnodeRef &o, ///< [in] Object containing omap
2774 bufferlist *header, ///< [out] omap header
2775 map<string, bufferlist> *out ///< [out] Key to value map
2776 );
2777
7c673cae
FG
2778
2779 /// Get omap header
7c673cae
FG
2780 int omap_get_header(
2781 CollectionHandle &c, ///< [in] Collection containing oid
2782 const ghobject_t &oid, ///< [in] Object containing omap
2783 bufferlist *header, ///< [out] omap header
2784 bool allow_eio = false ///< [in] don't assert on eio
2785 ) override;
2786
2787 /// Get keys defined on oid
7c673cae
FG
2788 int omap_get_keys(
2789 CollectionHandle &c, ///< [in] Collection containing oid
2790 const ghobject_t &oid, ///< [in] Object containing omap
2791 set<string> *keys ///< [out] Keys defined on oid
2792 ) override;
2793
2794 /// Get key values
7c673cae
FG
2795 int omap_get_values(
2796 CollectionHandle &c, ///< [in] Collection containing oid
2797 const ghobject_t &oid, ///< [in] Object containing omap
2798 const set<string> &keys, ///< [in] Keys to get
2799 map<string, bufferlist> *out ///< [out] Returned keys and values
2800 ) override;
2801
9f95a23c
TL
2802#ifdef WITH_SEASTAR
2803 int omap_get_values(
2804 CollectionHandle &c, ///< [in] Collection containing oid
2805 const ghobject_t &oid, ///< [in] Object containing omap
2806 const std::optional<string> &start_after, ///< [in] Keys to get
2807 map<string, bufferlist> *out ///< [out] Returned keys and values
2808 ) override;
2809#endif
2810
7c673cae 2811 /// Filters keys into out which are defined on oid
7c673cae
FG
2812 int omap_check_keys(
2813 CollectionHandle &c, ///< [in] Collection containing oid
2814 const ghobject_t &oid, ///< [in] Object containing omap
2815 const set<string> &keys, ///< [in] Keys to check
2816 set<string> *out ///< [out] Subset of keys defined on oid
2817 ) override;
2818
7c673cae
FG
2819 ObjectMap::ObjectMapIterator get_omap_iterator(
2820 CollectionHandle &c, ///< [in] collection
2821 const ghobject_t &oid ///< [in] object
2822 ) override;
2823
2824 void set_fsid(uuid_d u) override {
2825 fsid = u;
2826 }
2827 uuid_d get_fsid() override {
2828 return fsid;
2829 }
2830
2831 uint64_t estimate_objects_overhead(uint64_t num_objects) override {
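    // e.g. 1M objects -> 1,000,000 * 300 bytes, roughly 300 MB of estimated overhead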
2832 return num_objects * 300; //assuming per-object overhead is 300 bytes
2833 }
2834
2835 struct BSPerfTracker {
11fdf7f2
TL
2836 PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
2837 PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
7c673cae
FG
2838
2839 objectstore_perf_stat_t get_cur_stats() const {
2840 objectstore_perf_stat_t ret;
11fdf7f2
TL
2841 ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
2842 ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
7c673cae
FG
2843 return ret;
2844 }
2845
2846 void update_from_perfcounters(PerfCounters &logger);
2847 } perf_tracker;
2848
2849 objectstore_perf_stat_t get_cur_stats() override {
2850 perf_tracker.update_from_perfcounters(*logger);
2851 return perf_tracker.get_cur_stats();
2852 }
2853 const PerfCounters* get_perf_counters() const override {
2854 return logger;
2855 }
9f95a23c
TL
2856 const PerfCounters* get_bluefs_perf_counters() const {
2857 return bluefs->get_perf_counters();
2858 }
7c673cae
FG
2859
2860 int queue_transactions(
11fdf7f2 2861 CollectionHandle& ch,
7c673cae
FG
2862 vector<Transaction>& tls,
2863 TrackedOpRef op = TrackedOpRef(),
2864 ThreadPool::TPHandle *handle = NULL) override;
2865
2866 // error injection
2867 void inject_data_error(const ghobject_t& o) override {
9f95a23c 2868 std::unique_lock l(debug_read_error_lock);
7c673cae
FG
2869 debug_data_error_objects.insert(o);
2870 }
2871 void inject_mdata_error(const ghobject_t& o) override {
9f95a23c 2872 std::unique_lock l(debug_read_error_lock);
7c673cae
FG
2873 debug_mdata_error_objects.insert(o);
2874 }
11fdf7f2
TL
2875
2876 /// methods to inject various errors fsck can repair
2877 void inject_broken_shared_blob_key(const string& key,
2878 const bufferlist& bl);
2879 void inject_leaked(uint64_t len);
2880 void inject_false_free(coll_t cid, ghobject_t oid);
2881 void inject_statfs(const string& key, const store_statfs_t& new_statfs);
eafe8130 2882 void inject_global_statfs(const store_statfs_t& new_statfs);
11fdf7f2
TL
2883 void inject_misreference(coll_t cid1, ghobject_t oid1,
2884 coll_t cid2, ghobject_t oid2,
2885 uint64_t offset);
9f95a23c
TL
2886 // resets global per_pool_omap in DB
2887 void inject_legacy_omap();
2888 // resets per_pool_omap | pgmeta_omap for onode
2889 void inject_legacy_omap(coll_t cid, ghobject_t oid);
11fdf7f2 2890
224ce89b 2891 void compact() override {
11fdf7f2 2892 ceph_assert(db);
224ce89b
WB
2893 db->compact();
2894 }
28e407b8
AA
2895 bool has_builtin_csum() const override {
2896 return true;
2897 }
2898
11fdf7f2
TL
2899 /*
2900 Allocate space for BlueFS from the slow device.
2901 Either automatically applies the allocated extents to the underlying BlueFS
2902 (extents == nullptr) or just returns them via the provided non-null 'extents'.
2903 */
2904 int allocate_bluefs_freespace(
2905 uint64_t min_size,
2906 uint64_t size,
2907 PExtentVector* extents);
2908
494da23a
TL
2909 inline void log_latency(const char* name,
2910 int idx,
2911 const ceph::timespan& lat,
2912 double lat_threshold,
2913 const char* info = "") const;
2914
2915 inline void log_latency_fn(const char* name,
2916 int idx,
2917 const ceph::timespan& lat,
2918 double lat_threshold,
2919 std::function<string (const ceph::timespan& lat)> fn) const;
11fdf7f2 2920
7c673cae
FG
2921private:
2922 bool _debug_data_eio(const ghobject_t& o) {
2923 if (!cct->_conf->bluestore_debug_inject_read_err) {
2924 return false;
2925 }
9f95a23c 2926 std::shared_lock l(debug_read_error_lock);
7c673cae
FG
2927 return debug_data_error_objects.count(o);
2928 }
2929 bool _debug_mdata_eio(const ghobject_t& o) {
2930 if (!cct->_conf->bluestore_debug_inject_read_err) {
2931 return false;
2932 }
9f95a23c 2933 std::shared_lock l(debug_read_error_lock);
7c673cae
FG
2934 return debug_mdata_error_objects.count(o);
2935 }
2936 void _debug_obj_on_delete(const ghobject_t& o) {
2937 if (cct->_conf->bluestore_debug_inject_read_err) {
9f95a23c 2938 std::unique_lock l(debug_read_error_lock);
7c673cae
FG
2939 debug_data_error_objects.erase(o);
2940 debug_mdata_error_objects.erase(o);
2941 }
2942 }
11fdf7f2
TL
2943private:
2944 ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
2945 string failed_cmode;
2946 set<string> failed_compressors;
2947 string spillover_alert;
81eedcae 2948 string legacy_statfs_alert;
9f95a23c 2949 string no_per_pool_omap_alert;
81eedcae 2950 string disk_size_mismatch_alert;
11fdf7f2
TL
2951
2952 void _log_alerts(osd_alert_list_t& alerts);
2953 bool _set_compression_alert(bool cmode, const char* s) {
2954 std::lock_guard l(qlock);
2955 if (cmode) {
2956 bool ret = failed_cmode.empty();
2957 failed_cmode = s;
2958 return ret;
2959 }
2960 return failed_compressors.emplace(s).second;
2961 }
2962 void _clear_compression_alert() {
2963 std::lock_guard l(qlock);
2964 failed_compressors.clear();
2965 failed_cmode.clear();
2966 }
2967
2968 void _set_spillover_alert(const string& s) {
2969 std::lock_guard l(qlock);
2970 spillover_alert = s;
2971 }
2972 void _clear_spillover_alert() {
2973 std::lock_guard l(qlock);
2974 spillover_alert.clear();
2975 }
7c673cae 2976
81eedcae 2977 void _check_legacy_statfs_alert();
9f95a23c 2978 void _check_no_per_pool_omap_alert();
81eedcae
TL
2979 void _set_disk_size_mismatch_alert(const string& s) {
2980 std::lock_guard l(qlock);
2981 disk_size_mismatch_alert = s;
2982 }
2983
7c673cae
FG
2984private:
2985
2986 // --------------------------------------------------------
2987 // read processing internal methods
2988 int _verify_csum(
2989 OnodeRef& o,
2990 const bluestore_blob_t* blob,
2991 uint64_t blob_xoffset,
2992 const bufferlist& bl,
2993 uint64_t logical_offset) const;
2994 int _decompress(bufferlist& source, bufferlist* result);
2995
2996
2997 // --------------------------------------------------------
2998 // write ops
2999
3000 struct WriteContext {
3001 bool buffered = false; ///< buffered write
3002 bool compress = false; ///< compressed write
3003 uint64_t target_blob_size = 0; ///< target (max) blob size
3004 unsigned csum_order = 0; ///< target checksum chunk order
3005
3006 old_extent_map_t old_extents; ///< must deref these blobs
eafe8130 3007 interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
7c673cae
FG
3008
3009 struct write_item {
3010 uint64_t logical_offset; ///< write logical offset
3011 BlobRef b;
3012 uint64_t blob_length;
3013 uint64_t b_off;
3014 bufferlist bl;
3015 uint64_t b_off0; ///< original offset in a blob prior to padding
3016 uint64_t length0; ///< original data length prior to padding
3017
3018 bool mark_unused;
3019 bool new_blob; ///< whether new blob was created
3020
3efd9988
FG
3021 bool compressed = false;
3022 bufferlist compressed_bl;
3023 size_t compressed_len = 0;
3024
7c673cae
FG
3025 write_item(
3026 uint64_t logical_offs,
3027 BlobRef b,
3028 uint64_t blob_len,
3029 uint64_t o,
3030 bufferlist& bl,
3031 uint64_t o0,
3032 uint64_t l0,
3033 bool _mark_unused,
3034 bool _new_blob)
3035 :
3036 logical_offset(logical_offs),
3037 b(b),
3038 blob_length(blob_len),
3039 b_off(o),
3040 bl(bl),
3041 b_off0(o0),
3042 length0(l0),
3043 mark_unused(_mark_unused),
3044 new_blob(_new_blob) {}
3045 };
3046 vector<write_item> writes; ///< blobs we're writing
3047
3048 /// partial clone of the context
3049 void fork(const WriteContext& other) {
3050 buffered = other.buffered;
3051 compress = other.compress;
3052 target_blob_size = other.target_blob_size;
3053 csum_order = other.csum_order;
3054 }
3055 void write(
3056 uint64_t loffs,
3057 BlobRef b,
3058 uint64_t blob_len,
3059 uint64_t o,
3060 bufferlist& bl,
3061 uint64_t o0,
3062 uint64_t len0,
3063 bool _mark_unused,
3064 bool _new_blob) {
3065 writes.emplace_back(loffs,
3066 b,
3067 blob_len,
3068 o,
3069 bl,
3070 o0,
3071 len0,
3072 _mark_unused,
3073 _new_blob);
3074 }
3075 /// Checks for writes to the same pextent within a blob
3076 bool has_conflict(
3077 BlobRef b,
3078 uint64_t loffs,
3079 uint64_t loffs_end,
3080 uint64_t min_alloc_size);
3081 };
3082
3083 void _do_write_small(
3084 TransContext *txc,
3085 CollectionRef &c,
3086 OnodeRef o,
3087 uint64_t offset, uint64_t length,
3088 bufferlist::iterator& blp,
3089 WriteContext *wctx);
3090 void _do_write_big(
3091 TransContext *txc,
3092 CollectionRef &c,
3093 OnodeRef o,
3094 uint64_t offset, uint64_t length,
3095 bufferlist::iterator& blp,
3096 WriteContext *wctx);
3097 int _do_alloc_write(
3098 TransContext *txc,
3099 CollectionRef c,
3100 OnodeRef o,
3101 WriteContext *wctx);
3102 void _wctx_finish(
3103 TransContext *txc,
3104 CollectionRef& c,
3105 OnodeRef o,
31f18b77
FG
3106 WriteContext *wctx,
3107 set<SharedBlob*> *maybe_unshared_blobs=0);
7c673cae 3108
7c673cae
FG
3109 int _write(TransContext *txc,
3110 CollectionRef& c,
3111 OnodeRef& o,
3112 uint64_t offset, size_t len,
3113 bufferlist& bl,
3114 uint32_t fadvise_flags);
3115 void _pad_zeros(bufferlist *bl, uint64_t *offset,
3116 uint64_t chunk_size);
3117
31f18b77
FG
3118 void _choose_write_options(CollectionRef& c,
3119 OnodeRef o,
3120 uint32_t fadvise_flags,
3121 WriteContext *wctx);
3122
3123 int _do_gc(TransContext *txc,
3124 CollectionRef& c,
3125 OnodeRef o,
31f18b77
FG
3126 const WriteContext& wctx,
3127 uint64_t *dirty_start,
3128 uint64_t *dirty_end);
3129
7c673cae
FG
3130 int _do_write(TransContext *txc,
3131 CollectionRef &c,
3132 OnodeRef o,
3133 uint64_t offset, uint64_t length,
3134 bufferlist& bl,
3135 uint32_t fadvise_flags);
3136 void _do_write_data(TransContext *txc,
3137 CollectionRef& c,
3138 OnodeRef o,
3139 uint64_t offset,
3140 uint64_t length,
3141 bufferlist& bl,
3142 WriteContext *wctx);
3143
3144 int _touch(TransContext *txc,
3145 CollectionRef& c,
3146 OnodeRef& o);
3147 int _do_zero(TransContext *txc,
3148 CollectionRef& c,
3149 OnodeRef& o,
3150 uint64_t offset, size_t len);
3151 int _zero(TransContext *txc,
3152 CollectionRef& c,
3153 OnodeRef& o,
3154 uint64_t offset, size_t len);
3155 void _do_truncate(TransContext *txc,
3156 CollectionRef& c,
3157 OnodeRef o,
31f18b77
FG
3158 uint64_t offset,
3159 set<SharedBlob*> *maybe_unshared_blobs=0);
35e4c445 3160 int _truncate(TransContext *txc,
7c673cae
FG
3161 CollectionRef& c,
3162 OnodeRef& o,
3163 uint64_t offset);
3164 int _remove(TransContext *txc,
3165 CollectionRef& c,
3166 OnodeRef& o);
3167 int _do_remove(TransContext *txc,
3168 CollectionRef& c,
3169 OnodeRef o);
3170 int _setattr(TransContext *txc,
3171 CollectionRef& c,
3172 OnodeRef& o,
3173 const string& name,
3174 bufferptr& val);
3175 int _setattrs(TransContext *txc,
3176 CollectionRef& c,
3177 OnodeRef& o,
3178 const map<string,bufferptr>& aset);
3179 int _rmattr(TransContext *txc,
3180 CollectionRef& c,
3181 OnodeRef& o,
3182 const string& name);
3183 int _rmattrs(TransContext *txc,
3184 CollectionRef& c,
3185 OnodeRef& o);
9f95a23c 3186 void _do_omap_clear(TransContext *txc, OnodeRef &o);
7c673cae
FG
3187 int _omap_clear(TransContext *txc,
3188 CollectionRef& c,
3189 OnodeRef& o);
3190 int _omap_setkeys(TransContext *txc,
3191 CollectionRef& c,
3192 OnodeRef& o,
3193 bufferlist& bl);
3194 int _omap_setheader(TransContext *txc,
3195 CollectionRef& c,
3196 OnodeRef& o,
3197 bufferlist& header);
3198 int _omap_rmkeys(TransContext *txc,
3199 CollectionRef& c,
3200 OnodeRef& o,
3201 bufferlist& bl);
3202 int _omap_rmkey_range(TransContext *txc,
3203 CollectionRef& c,
3204 OnodeRef& o,
3205 const string& first, const string& last);
3206 int _set_alloc_hint(
3207 TransContext *txc,
3208 CollectionRef& c,
3209 OnodeRef& o,
3210 uint64_t expected_object_size,
3211 uint64_t expected_write_size,
3212 uint32_t flags);
3213 int _do_clone_range(TransContext *txc,
3214 CollectionRef& c,
3215 OnodeRef& oldo,
3216 OnodeRef& newo,
3217 uint64_t srcoff, uint64_t length, uint64_t dstoff);
3218 int _clone(TransContext *txc,
3219 CollectionRef& c,
3220 OnodeRef& oldo,
3221 OnodeRef& newo);
3222 int _clone_range(TransContext *txc,
3223 CollectionRef& c,
3224 OnodeRef& oldo,
3225 OnodeRef& newo,
3226 uint64_t srcoff, uint64_t length, uint64_t dstoff);
3227 int _rename(TransContext *txc,
3228 CollectionRef& c,
3229 OnodeRef& oldo,
3230 OnodeRef& newo,
3231 const ghobject_t& new_oid);
3232 int _create_collection(TransContext *txc, const coll_t &cid,
3233 unsigned bits, CollectionRef *c);
3234 int _remove_collection(TransContext *txc, const coll_t &cid,
3235 CollectionRef *c);
11fdf7f2 3236 void _do_remove_collection(TransContext *txc, CollectionRef *c);
7c673cae
FG
3237 int _split_collection(TransContext *txc,
3238 CollectionRef& c,
3239 CollectionRef& d,
3240 unsigned bits, int rem);
11fdf7f2
TL
3241 int _merge_collection(TransContext *txc,
3242 CollectionRef *c,
3243 CollectionRef& d,
3244 unsigned bits);
3245
9f95a23c
TL
3246 void _collect_allocation_stats(uint64_t need, uint32_t alloc_size,
3247 size_t extents);
3248 void _record_allocation_stats();
11fdf7f2 3249private:
9f95a23c
TL
3250 uint64_t probe_count = 0;
3251 std::atomic<uint64_t> alloc_stats_count = {0};
3252 std::atomic<uint64_t> alloc_stats_fragments = { 0 };
3253 std::atomic<uint64_t> alloc_stats_size = { 0 };
3254 //
3255 std::array<std::tuple<uint64_t, uint64_t, uint64_t>, 5> alloc_stats_history =
3256 { std::make_tuple(0ul, 0ul, 0ul) };
3257
11fdf7f2
TL
3258 std::atomic<uint64_t> out_of_sync_fm = {0};
3259 // --------------------------------------------------------
3260 // BlueFSDeviceExpander implementation
3261 uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
3262 uint64_t bluefs_total) override {
3263 auto delta = _get_bluefs_size_delta(bluefs_free, bluefs_total);
3264 return delta > 0 ? delta : 0;
3265 }
3266 int allocate_freespace(
3267 uint64_t min_size,
3268 uint64_t size,
3269 PExtentVector& extents) override {
3270 return allocate_bluefs_freespace(min_size, size, &extents);
3271 };
9f95a23c
TL
3272 uint64_t available_freespace(uint64_t alloc_size) override;
3273 inline bool _use_rotational_settings();
eafe8130
TL
3274
3275public:
3276 struct sb_info_t {
3277 coll_t cid;
3278 int64_t pool_id = INT64_MIN;
3279 list<ghobject_t> oids;
3280 BlueStore::SharedBlobRef sb;
3281 bluestore_extent_ref_map_t ref_map;
3282 bool compressed = false;
3283 bool passed = false;
3284 bool updated = false;
3285 };
3286 typedef btree::btree_set<
3287 uint64_t, std::less<uint64_t>,
3288 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
3289
3290 typedef mempool::bluestore_fsck::map<uint64_t, sb_info_t> sb_info_map_t;
3291 struct FSCK_ObjectCtx {
3292 int64_t& errors;
3293 int64_t& warnings;
3294 uint64_t& num_objects;
3295 uint64_t& num_extents;
3296 uint64_t& num_blobs;
3297 uint64_t& num_sharded_objects;
3298 uint64_t& num_spanning_blobs;
3299
3300 mempool_dynamic_bitset* used_blocks;
3301 uint64_t_btree_t* used_omap_head;
eafe8130
TL
3302
3303 ceph::mutex* sb_info_lock;
3304 sb_info_map_t& sb_info;
3305
3306 store_statfs_t& expected_store_statfs;
3307 per_pool_statfs& expected_pool_statfs;
3308 BlueStoreRepairer* repairer;
3309
3310 FSCK_ObjectCtx(int64_t& e,
3311 int64_t& w,
3312 uint64_t& _num_objects,
3313 uint64_t& _num_extents,
3314 uint64_t& _num_blobs,
3315 uint64_t& _num_sharded_objects,
3316 uint64_t& _num_spanning_blobs,
3317 mempool_dynamic_bitset* _ub,
3318 uint64_t_btree_t* _used_omap_head,
eafe8130
TL
3319 ceph::mutex* _sb_info_lock,
3320 sb_info_map_t& _sb_info,
3321 store_statfs_t& _store_statfs,
3322 per_pool_statfs& _pool_statfs,
3323 BlueStoreRepairer* _repairer) :
3324 errors(e),
3325 warnings(w),
3326 num_objects(_num_objects),
3327 num_extents(_num_extents),
3328 num_blobs(_num_blobs),
3329 num_sharded_objects(_num_sharded_objects),
3330 num_spanning_blobs(_num_spanning_blobs),
3331 used_blocks(_ub),
3332 used_omap_head(_used_omap_head),
eafe8130
TL
3333 sb_info_lock(_sb_info_lock),
3334 sb_info(_sb_info),
3335 expected_store_statfs(_store_statfs),
3336 expected_pool_statfs(_pool_statfs),
3337 repairer(_repairer) {
3338 }
3339 };
3340
3341 OnodeRef fsck_check_objects_shallow(
3342 FSCKDepth depth,
3343 int64_t pool_id,
3344 CollectionRef c,
3345 const ghobject_t& oid,
3346 const string& key,
3347 const bufferlist& value,
9f95a23c 3348 mempool::bluestore_fsck::list<string>* expecting_shards,
eafe8130
TL
3349 map<BlobRef, bluestore_blob_t::unused_t>* referenced,
3350 const BlueStore::FSCK_ObjectCtx& ctx);
3351
3352private:
9f95a23c
TL
3353 void _fsck_check_object_omap(FSCKDepth depth,
3354 OnodeRef& o,
3355 const BlueStore::FSCK_ObjectCtx& ctx);
3356
eafe8130
TL
3357 void _fsck_check_objects(FSCKDepth depth,
3358 FSCK_ObjectCtx& ctx);
7c673cae
FG
3359};
3360
11fdf7f2
TL
3361inline ostream& operator<<(ostream& out, const BlueStore::volatile_statfs& s) {
3362 return out
3363 << " allocated:"
3364 << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
3365 << " stored:"
3366 << s.values[BlueStore::volatile_statfs::STATFS_STORED]
3367 << " compressed:"
3368 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
3369 << " compressed_orig:"
3370 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
3371 << " compressed_alloc:"
3372 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
7c673cae
FG
3373}
3374
3375static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
3376 o->get();
3377}
3378static inline void intrusive_ptr_release(BlueStore::Onode *o) {
3379 o->put();
3380}
3381
3382static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
3383 o->get();
3384}
3385static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
3386 o->put();
3387}
3388
11fdf7f2
TL
3389class BlueStoreRepairer
3390{
3391public:
3392 // to simplify future potential migration to mempools
3393 using fsck_interval = interval_set<uint64_t>;
3394
3395 // Structure to track which pextents are used by a specific cid/oid.
3396 // As with a Bloom filter, only positive and false-positive matches
3397 // are possible.
3398 // Maintains two lists of bloom filters, one for cids and one for oids,
3399 // where each list entry is a BF covering a specific disk pextent range.
3400 // The extent length covered by each filter is computed on init.
3401 // Allows filtering out 'uninteresting' pextents to speed up subsequent
3402 // 'is_used' access.
3403 struct StoreSpaceTracker {
3404 const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
3405 const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
3406 const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrary selected
3407 static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
3408
3409 typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
3410 bloom_vector collections_bfs;
3411 bloom_vector objects_bfs;
3412
3413 bool was_filtered_out = false;
3414 uint64_t granularity = 0; // extent length for a single filter
3415
3416 StoreSpaceTracker() {
3417 }
3418 StoreSpaceTracker(const StoreSpaceTracker& from) :
3419 collections_bfs(from.collections_bfs),
3420 objects_bfs(from.objects_bfs),
3421 granularity(from.granularity) {
3422 }
3423
3424 void init(uint64_t total,
3425 uint64_t min_alloc_size,
3426 uint64_t mem_cap = DEF_MEM_CAP) {
3427 ceph_assert(!granularity); // not initialized yet
3428 ceph_assert(min_alloc_size && isp2(min_alloc_size));
3429 ceph_assert(mem_cap);
3430
3431 total = round_up_to(total, min_alloc_size);
3432 granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
3433
3434 if (!granularity) {
3435 granularity = min_alloc_size;
3436 } else {
3437 granularity = round_up_to(granularity, min_alloc_size);
3438 }
3439
3440 uint64_t entries = round_up_to(total, granularity) / granularity;
3441 collections_bfs.resize(entries,
3442 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3443 BLOOM_FILTER_TABLE_SIZE,
3444 0,
3445 BLOOM_FILTER_EXPECTED_COUNT));
3446 objects_bfs.resize(entries,
3447 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3448 BLOOM_FILTER_TABLE_SIZE,
3449 0,
3450 BLOOM_FILTER_EXPECTED_COUNT));
3451 }
3452 inline uint32_t get_hash(const coll_t& cid) const {
3453 return cid.hash_to_shard(1);
3454 }
3455 inline void set_used(uint64_t offset, uint64_t len,
3456 const coll_t& cid, const ghobject_t& oid) {
3457 ceph_assert(granularity); // initialized
3458
3459 // can't call this func after filter_out has been applied
3460 ceph_assert(!was_filtered_out);
3461 if (!len) {
3462 return;
3463 }
3464 auto pos = offset / granularity;
3465 auto end_pos = (offset + len - 1) / granularity;
3466 while (pos <= end_pos) {
3467 collections_bfs[pos].insert(get_hash(cid));
3468 objects_bfs[pos].insert(oid.hobj.get_hash());
3469 ++pos;
3470 }
3471 }
3472 // Filter out entries unrelated to the specified (broken) extents.
3473 // Only 'is_used' calls are permitted after this has been applied.
3474 size_t filter_out(const fsck_interval& extents);
3475
3476 // determines whether the collection is present after filtering out
3477 inline bool is_used(const coll_t& cid) const {
3478 ceph_assert(was_filtered_out);
3479 for(auto& bf : collections_bfs) {
3480 if (bf.contains(get_hash(cid))) {
3481 return true;
3482 }
3483 }
3484 return false;
3485 }
3486 // determines whether the object is present after filtering out
3487 inline bool is_used(const ghobject_t& oid) const {
3488 ceph_assert(was_filtered_out);
3489 for(auto& bf : objects_bfs) {
3490 if (bf.contains(oid.hobj.get_hash())) {
3491 return true;
3492 }
3493 }
3494 return false;
3495 }
3496 // determines whether the collection is present before filtering out
3497 inline bool is_used(const coll_t& cid, uint64_t offs) const {
3498 ceph_assert(granularity); // initialized
3499 ceph_assert(!was_filtered_out);
3500 auto &bf = collections_bfs[offs / granularity];
3501 if (bf.contains(get_hash(cid))) {
3502 return true;
3503 }
3504 return false;
3505 }
3506 // determines whether the object is present before filtering out
3507 inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
3508 ceph_assert(granularity); // initialized
3509 ceph_assert(!was_filtered_out);
3510 auto &bf = objects_bfs[offs / granularity];
3511 if (bf.contains(oid.hobj.get_hash())) {
3512 return true;
3513 }
3514 return false;
3515 }
3516 };
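  // A minimal usage sketch for StoreSpaceTracker (hypothetical sizes and
  // placeholder cid/oid variables; not taken from the actual fsck code):
  //
  //   StoreSpaceTracker tracker;
  //   tracker.init(dev_total_bytes, min_alloc_size);   // e.g. 1 TiB, 4 KiB
  //   tracker.set_used(0x100000, 0x2000, cid, oid);    // record pextent owners
  //   ...
  //   fsck_interval broken;
  //   broken.insert(0x100000, 0x2000);                 // extents found broken
  //   tracker.filter_out(broken);                      // drop unrelated filters
  //   if (tracker.is_used(cid)) {
  //     // cid may reference a broken extent (false positives are possible)
  //   }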
3517public:
9f95a23c 3518 void fix_per_pool_omap(KeyValueDB *db);
11fdf7f2
TL
3519 bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
3520 bool fix_shared_blob(KeyValueDB *db,
3521 uint64_t sbid,
3522 const bufferlist* bl);
3523 bool fix_statfs(KeyValueDB *db, const string& key,
3524 const store_statfs_t& new_statfs);
3525
3526 bool fix_leaked(KeyValueDB *db,
3527 FreelistManager* fm,
3528 uint64_t offset, uint64_t len);
3529 bool fix_false_free(KeyValueDB *db,
3530 FreelistManager* fm,
3531 uint64_t offset, uint64_t len);
3532 bool fix_bluefs_extents(std::atomic<uint64_t>& out_of_sync_flag);
3533
3534 void init(uint64_t total_space, uint64_t lres_tracking_unit_size);
3535
3536 bool preprocess_misreference(KeyValueDB *db);
3537
3538 unsigned apply(KeyValueDB* db);
3539
3540 void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
3541 misreferenced_extents.union_insert(offs, len);
3542 if (inc_error) {
3543 ++to_repair_cnt;
3544 }
3545 }
9f95a23c 3546 // NB: this is the only BlueStoreRepairer method that is thread-safe!
eafe8130
TL
3547 void inc_repaired() {
3548 ++to_repair_cnt;
9f95a23c 3549 }
11fdf7f2
TL
3550
3551 StoreSpaceTracker& get_space_usage_tracker() {
3552 return space_usage_tracker;
3553 }
3554 const fsck_interval& get_misreferences() const {
3555 return misreferenced_extents;
3556 }
3557 KeyValueDB::Transaction get_fix_misreferences_txn() {
3558 return fix_misreferences_txn;
3559 }
3560
3561private:
9f95a23c
TL
3562 std::atomic<unsigned> to_repair_cnt = { 0 };
3563 KeyValueDB::Transaction fix_per_pool_omap_txn;
11fdf7f2
TL
3564 KeyValueDB::Transaction fix_fm_leaked_txn;
3565 KeyValueDB::Transaction fix_fm_false_free_txn;
3566 KeyValueDB::Transaction remove_key_txn;
3567 KeyValueDB::Transaction fix_statfs_txn;
3568 KeyValueDB::Transaction fix_shared_blob_txn;
3569
3570 KeyValueDB::Transaction fix_misreferences_txn;
3571
3572 StoreSpaceTracker space_usage_tracker;
3573
3574 // non-shared extents with multiple references
3575 fsck_interval misreferenced_extents;
3576
3577};
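// A rough sketch of how an fsck pass might drive BlueStoreRepairer (assumed
// flow inferred from the interface above, not copied from the fsck code):
//
//   BlueStoreRepairer repairer;
//   repairer.init(total_space, min_alloc_size);
//   ...
//   if (statfs_mismatch)
//     repairer.fix_statfs(db, statfs_key, expected_statfs); // queues a txn
//   if (leaked_extent)
//     repairer.fix_leaked(db, fm, offset, length);
//   ...
//   unsigned n = repairer.apply(db);   // submit all queued repair transactions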
9f95a23c
TL
3578
3579class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
3580{
3581 template <class T, size_t MaxX, size_t MaxY>
3582 class matrix_2d {
3583 T values[MaxX][MaxY];
3584 public:
3585 matrix_2d() {
3586 clear();
3587 }
3588 T& at(size_t x, size_t y) {
3589 ceph_assert(x < MaxX);
3590 ceph_assert(y < MaxY);
3591
3592 return values[x][y];
3593 }
3594 size_t get_max_x() const {
3595 return MaxX;
3596 }
3597 size_t get_max_y() const {
3598 return MaxY;
3599 }
3600 void clear() {
3601 memset(values, 0, sizeof(values));
3602 }
3603 };
3604
3605 enum {
3606 // use 0/nullptr as unset indication
3607 LEVEL_FIRST = 1,
f6b5b4d7
TL
3608 LEVEL_LOG = LEVEL_FIRST, // BlueFS log
3609 LEVEL_WAL,
9f95a23c
TL
3610 LEVEL_DB,
3611 LEVEL_SLOW,
3612 LEVEL_MAX
3613 };
3614 // one extra slot on the level axis keeps the corresponding per-device totals
3615 // one extra slot on the device axis keeps the per-level actual total (taken from file sizes)
3616 typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
3617
3618 per_level_per_dev_usage_t per_level_per_dev_usage;
f6b5b4d7
TL
3619 // file count per level; the extra (+1) slot keeps the total file count
3620 uint64_t per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };
9f95a23c
TL
3621
3622 // Note: the maximum per-device totals below might be smaller than the
3623 // corresponding perf counters by up to a single alloc unit (1M) due to the
3624 // superblock extent, which is not accounted for here.
3625 per_level_per_dev_usage_t per_level_per_dev_max;
3626
3627 uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
3628 uint64_t db_avail4slow = 0;
3629 enum {
3630 OLD_POLICY,
3631 USE_SOME_EXTRA
3632 };
3633
3634public:
3635 RocksDBBlueFSVolumeSelector(
3636 uint64_t _wal_total,
3637 uint64_t _db_total,
3638 uint64_t _slow_total,
3639 uint64_t _level0_size,
3640 uint64_t _level_base,
3641 uint64_t _level_multiplier,
3642 double reserved_factor,
3643 uint64_t reserved,
3644 bool new_pol)
3645 {
f6b5b4d7 3646 l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
9f95a23c
TL
3647 l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
3648 l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
3649 l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
3650
3651 if (!new_pol) {
3652 return;
3653 }
3654
3655 // Calculate how much extra space is available on the DB volume.
3656 // Depending on whether an explicit reserved size is specified, it is either
3657 // * DB volume size - reserved
3658 // or
3659 // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
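    // Worked example with assumed RocksDB sizing (illustration only):
    // _level0_size = 0.25 GB, _level_base = 0.25 GB, _level_multiplier = 10,
    // reserved_factor = 2, _db_total = 60 GB.
    //   pass 1: next = 2.5 GB, threshold = 0.25 + 0.25 + 2.5*2 = 5.5 GB (< 60, continue)
    //   pass 2: next = 25 GB,  threshold = 0.5  + 2.5  + 25*2  = 53 GB  (< 60, continue)
    //   pass 3: next = 250 GB, threshold = 3    + 25   + 250*2 = 528 GB (>= 60, stop)
    //   => db_avail4slow = 60 - 53 = 7 GB.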
3660 if (!reserved) {
3661 uint64_t prev_levels = _level0_size;
3662 uint64_t cur_level = _level_base;
3663 uint64_t cur_threshold = 0;
3664 do {
3665 uint64_t next_level = cur_level * _level_multiplier;
3666 uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
3667 if (_db_total <= next_threshold) {
3668 db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
3669 break;
3670 } else {
3671 prev_levels += cur_level;
3672 cur_level = next_level;
3673 cur_threshold = next_threshold;
3674 }
3675 } while (true);
3676 } else {
3677 db_avail4slow = _db_total - reserved;
3678 }
3679 }
3680
f6b5b4d7
TL
3681 void* get_hint_for_log() const override {
3682 return reinterpret_cast<void*>(LEVEL_LOG);
9f95a23c
TL
3683 }
3684 void* get_hint_by_dir(const string& dirname) const override;
3685
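  // The 'hint' arguments below carry a LEVEL_* enum value cast to void*
  // (nullptr means no hint); see get_hint_for_log()/get_hint_by_dir().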
3686 void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
3687 if (hint == nullptr)
3688 return;
3689 size_t pos = (size_t)hint - LEVEL_FIRST;
3690 for (auto& p : fnode.extents) {
3691 auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
3692 auto& max = per_level_per_dev_max.at(p.bdev, pos);
3693 cur += p.length;
3694 if (cur > max) {
3695 max = cur;
3696 }
3697 {
3698 //update per-device totals
3699 auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
3700 auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
3701 cur += p.length;
3702 if (cur > max) {
3703 max = cur;
3704 }
3705 }
3706 }
3707 {
3708 //update per-level actual totals
3709 auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
3710 auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
3711 cur += fnode.size;
3712 if (cur > max) {
3713 max = cur;
3714 }
3715 }
f6b5b4d7
TL
3716 ++per_level_files[pos];
3717 ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
9f95a23c
TL
3718 }
3719 void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
3720 if (hint == nullptr)
3721 return;
3722 size_t pos = (size_t)hint - LEVEL_FIRST;
3723 for (auto& p : fnode.extents) {
3724 auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
3725 ceph_assert(cur >= p.length);
3726 cur -= p.length;
3727
3728 //update per-device totals
3729 auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
3730 ceph_assert(cur2 >= p.length);
3731 cur2 -= p.length;
3732 }
3733 //update per-level actual totals
3734 auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
3735 ceph_assert(cur >= fnode.size);
3736 cur -= fnode.size;
f6b5b4d7
TL
3737 ceph_assert(per_level_files[pos] > 0);
3738 --per_level_files[pos];
3739 ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
3740 --per_level_files[LEVEL_MAX - LEVEL_FIRST];
9f95a23c
TL
3741 }
3742 void add_usage(void* hint, uint64_t fsize) override {
3743 if (hint == nullptr)
3744 return;
3745 size_t pos = (size_t)hint - LEVEL_FIRST;
3746 //update per-level actual totals
3747 auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
3748 auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
3749 cur += fsize;
3750 if (cur > max) {
3751 max = cur;
3752 }
3753 }
3754 void sub_usage(void* hint, uint64_t fsize) override {
3755 if (hint == nullptr)
3756 return;
3757 size_t pos = (size_t)hint - LEVEL_FIRST;
3758 //update per-level actual totals
3759 auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
3760 ceph_assert(cur >= fsize);
3761 per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos) -= fsize;
3762 }
3763
3764 uint8_t select_prefer_bdev(void* h) override;
3765 void get_paths(
3766 const std::string& base,
3767 BlueFSVolumeSelector::paths& res) const override;
3768
3769 void dump(ostream& sout) override;
3770};
3771
7c673cae 3772#endif