1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2014 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef CEPH_OSD_BLUESTORE_H
16#define CEPH_OSD_BLUESTORE_H
17
18#include "acconfig.h"
19
20#include <unistd.h>
21
22#include <atomic>
9f95a23c
TL
23#include <chrono>
24#include <ratio>
7c673cae
FG
25#include <mutex>
26#include <condition_variable>
27
28#include <boost/intrusive/list.hpp>
29#include <boost/intrusive/unordered_set.hpp>
30#include <boost/intrusive/set.hpp>
31#include <boost/functional/hash.hpp>
32#include <boost/dynamic_bitset.hpp>
9f95a23c 33#include <boost/circular_buffer.hpp>
7c673cae 34
eafe8130
TL
35#include "include/cpp-btree/btree_set.h"
36
11fdf7f2 37#include "include/ceph_assert.h"
f67539c2 38#include "include/interval_set.h"
7c673cae 39#include "include/unordered_map.h"
7c673cae 40#include "include/mempool.h"
9f95a23c 41#include "include/hash.h"
11fdf7f2 42#include "common/bloom_filter.hpp"
7c673cae 43#include "common/Finisher.h"
9f95a23c 44#include "common/ceph_mutex.h"
11fdf7f2 45#include "common/Throttle.h"
7c673cae 46#include "common/perf_counters.h"
91327a77 47#include "common/PriorityCache.h"
7c673cae
FG
48#include "compressor/Compressor.h"
49#include "os/ObjectStore.h"
50
51#include "bluestore_types.h"
11fdf7f2 52#include "BlueFS.h"
7c673cae
FG
53#include "common/EventTrace.h"
54
f67539c2
TL
55#ifdef WITH_BLKIN
56#include "common/zipkin_trace.h"
57#endif
58
7c673cae
FG
59class Allocator;
60class FreelistManager;
11fdf7f2 61class BlueStoreRepairer;
7c673cae
FG
62
63//#define DEBUG_CACHE
64//#define DEBUG_DEFERRED
65
31f18b77
FG
66
67
68// constants for Buffer::optimize()
69#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N
70
71
7c673cae
FG
72enum {
73 l_bluestore_first = 732430,
74 l_bluestore_kv_flush_lat,
75 l_bluestore_kv_commit_lat,
11fdf7f2
TL
76 l_bluestore_kv_sync_lat,
77 l_bluestore_kv_final_lat,
7c673cae
FG
78 l_bluestore_state_prepare_lat,
79 l_bluestore_state_aio_wait_lat,
80 l_bluestore_state_io_done_lat,
81 l_bluestore_state_kv_queued_lat,
82 l_bluestore_state_kv_committing_lat,
83 l_bluestore_state_kv_done_lat,
84 l_bluestore_state_deferred_queued_lat,
85 l_bluestore_state_deferred_aio_wait_lat,
86 l_bluestore_state_deferred_cleanup_lat,
87 l_bluestore_state_finishing_lat,
88 l_bluestore_state_done_lat,
89 l_bluestore_throttle_lat,
90 l_bluestore_submit_lat,
91 l_bluestore_commit_lat,
92 l_bluestore_read_lat,
93 l_bluestore_read_onode_meta_lat,
94 l_bluestore_read_wait_aio_lat,
95 l_bluestore_compress_lat,
96 l_bluestore_decompress_lat,
97 l_bluestore_csum_lat,
98 l_bluestore_compress_success_count,
99 l_bluestore_compress_rejected_count,
100 l_bluestore_write_pad_bytes,
101 l_bluestore_deferred_write_ops,
102 l_bluestore_deferred_write_bytes,
103 l_bluestore_write_penalty_read_ops,
104 l_bluestore_allocated,
105 l_bluestore_stored,
106 l_bluestore_compressed,
107 l_bluestore_compressed_allocated,
108 l_bluestore_compressed_original,
109 l_bluestore_onodes,
9f95a23c 110 l_bluestore_pinned_onodes,
7c673cae
FG
111 l_bluestore_onode_hits,
112 l_bluestore_onode_misses,
113 l_bluestore_onode_shard_hits,
114 l_bluestore_onode_shard_misses,
115 l_bluestore_extents,
116 l_bluestore_blobs,
117 l_bluestore_buffers,
118 l_bluestore_buffer_bytes,
119 l_bluestore_buffer_hit_bytes,
120 l_bluestore_buffer_miss_bytes,
121 l_bluestore_write_big,
122 l_bluestore_write_big_bytes,
123 l_bluestore_write_big_blobs,
f67539c2 124 l_bluestore_write_big_deferred,
7c673cae
FG
125 l_bluestore_write_small,
126 l_bluestore_write_small_bytes,
127 l_bluestore_write_small_unused,
f67539c2 128 l_bluestore_write_deferred,
7c673cae 129 l_bluestore_write_small_pre_read,
f67539c2 130 l_bluestore_write_new,
7c673cae
FG
131 l_bluestore_txc,
132 l_bluestore_onode_reshard,
133 l_bluestore_blob_split,
134 l_bluestore_extent_compress,
135 l_bluestore_gc_merged,
b32b8144 136 l_bluestore_read_eio,
f64942e4 137 l_bluestore_reads_with_retries,
a8e16298 138 l_bluestore_fragmentation,
11fdf7f2
TL
139 l_bluestore_omap_seek_to_first_lat,
140 l_bluestore_omap_upper_bound_lat,
141 l_bluestore_omap_lower_bound_lat,
142 l_bluestore_omap_next_lat,
adb31ebb
TL
143 l_bluestore_omap_get_keys_lat,
144 l_bluestore_omap_get_values_lat,
494da23a 145 l_bluestore_clist_lat,
adb31ebb 146 l_bluestore_remove_lat,
7c673cae
FG
147 l_bluestore_last
148};
149
11fdf7f2
TL
150#define META_POOL_ID ((uint64_t)-1ull)
151
7c673cae
FG
152class BlueStore : public ObjectStore,
153 public md_config_obs_t {
154 // -----------------------------------------------------
155 // types
156public:
157 // config observer
158 const char** get_tracked_conf_keys() const override;
11fdf7f2
TL
159 void handle_conf_change(const ConfigProxy& conf,
160 const std::set<std::string> &changed) override;
161
162 //handler for discard event
163 void handle_discard(interval_set<uint64_t>& to_release);
7c673cae
FG
164
165 void _set_csum();
166 void _set_compression();
167 void _set_throttle_params();
31f18b77 168 int _set_cache_sizes();
9f95a23c
TL
169 void _set_max_defer_interval() {
170 max_defer_interval =
171 cct->_conf.get_val<double>("bluestore_max_defer_interval");
172 }
7c673cae 173
f67539c2 174 struct TransContext;
7c673cae 175
f67539c2 176 typedef std::map<uint64_t, ceph::buffer::list> ready_regions_t;
7c673cae 177
eafe8130 178
7c673cae
FG
179 struct BufferSpace;
180 struct Collection;
181 typedef boost::intrusive_ptr<Collection> CollectionRef;
182
183 struct AioContext {
184 virtual void aio_finish(BlueStore *store) = 0;
185 virtual ~AioContext() {}
186 };
187
188 /// cached buffer
189 struct Buffer {
190 MEMPOOL_CLASS_HELPERS();
191
192 enum {
193 STATE_EMPTY, ///< empty buffer -- used for cache history
194 STATE_CLEAN, ///< clean data that is up to date
195 STATE_WRITING, ///< data that is being written (io not yet complete)
196 };
197 static const char *get_state_name(int s) {
198 switch (s) {
199 case STATE_EMPTY: return "empty";
200 case STATE_CLEAN: return "clean";
201 case STATE_WRITING: return "writing";
202 default: return "???";
203 }
204 }
205 enum {
206 FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN)
207 // NOTE: fix operator<< when you define a second flag
208 };
209 static const char *get_flag_name(int s) {
210 switch (s) {
211 case FLAG_NOCACHE: return "nocache";
212 default: return "???";
213 }
214 }
215
216 BufferSpace *space;
217 uint16_t state; ///< STATE_*
218 uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
219 uint32_t flags; ///< FLAG_*
220 uint64_t seq;
221 uint32_t offset, length;
f67539c2 222 ceph::buffer::list data;
7c673cae
FG
223
224 boost::intrusive::list_member_hook<> lru_item;
225 boost::intrusive::list_member_hook<> state_item;
226
227 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
228 unsigned f = 0)
229 : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
f67539c2 230 Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, ceph::buffer::list& b,
7c673cae
FG
231 unsigned f = 0)
232 : space(space), state(s), flags(f), seq(q), offset(o),
233 length(b.length()), data(b) {}
234
235 bool is_empty() const {
236 return state == STATE_EMPTY;
237 }
238 bool is_clean() const {
239 return state == STATE_CLEAN;
240 }
241 bool is_writing() const {
242 return state == STATE_WRITING;
243 }
244
245 uint32_t end() const {
246 return offset + length;
247 }
248
249 void truncate(uint32_t newlen) {
11fdf7f2 250 ceph_assert(newlen < length);
7c673cae 251 if (data.length()) {
f67539c2 252 ceph::buffer::list t;
7c673cae 253 t.substr_of(data, 0, newlen);
f67539c2 254 data = std::move(t);
7c673cae
FG
255 }
256 length = newlen;
257 }
31f18b77
FG
258 void maybe_rebuild() {
259 if (data.length() &&
260 (data.get_num_buffers() > 1 ||
261 data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
262 data.rebuild();
263 }
264 }
7c673cae 265
f67539c2 266 void dump(ceph::Formatter *f) const {
7c673cae
FG
267 f->dump_string("state", get_state_name(state));
268 f->dump_unsigned("seq", seq);
269 f->dump_unsigned("offset", offset);
270 f->dump_unsigned("length", length);
271 f->dump_unsigned("data_length", data.length());
272 }
273 };
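  // Illustrative sketch (not upstream code; assumes a BufferSpace `space` and
  // a ceph::buffer::list `bl`): the typical Buffer lifecycle implied by the
  // states and FLAG_NOCACHE above --
  //
  //   BufferSpace::write()    : new Buffer in STATE_WRITING, keyed by txc seq
  //   txc commit completes    : STATE_WRITING -> STATE_CLEAN, or the buffer is
  //                             trimmed instead if FLAG_NOCACHE was set
  //   BufferSpace::did_read() : new Buffer created directly in STATE_CLEAN
  //   cache trim              : STATE_CLEAN buffers are dropped (STATE_EMPTY
  //                             exists only as cache history)
  //
  //   Buffer b(&space, Buffer::STATE_WRITING, /*seq=*/1, /*offset=*/0, bl);
  //   ceph_assert(b.is_writing() && !b.is_clean());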
274
9f95a23c 275 struct BufferCacheShard;
7c673cae
FG
276
277 /// map logical extent range (object) onto buffers
278 struct BufferSpace {
91327a77
AA
279 enum {
280 BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache
281 };
282
7c673cae
FG
283 typedef boost::intrusive::list<
284 Buffer,
285 boost::intrusive::member_hook<
286 Buffer,
287 boost::intrusive::list_member_hook<>,
288 &Buffer::state_item> > state_list_t;
289
f91f0fd5 290 mempool::bluestore_cache_meta::map<uint32_t, std::unique_ptr<Buffer>>
7c673cae
FG
291 buffer_map;
292
293 // we use a bare intrusive list here instead of std::map because
294 // it uses less memory and we expect this to be very small (very
295 // few IOs in flight to the same Blob at the same time).
296 state_list_t writing; ///< writing buffers, sorted by seq, ascending
297
298 ~BufferSpace() {
11fdf7f2
TL
299 ceph_assert(buffer_map.empty());
300 ceph_assert(writing.empty());
7c673cae
FG
301 }
302
f67539c2 303 void _add_buffer(BufferCacheShard* cache, Buffer* b, int level, Buffer* near) {
7c673cae
FG
304 cache->_audit("_add_buffer start");
305 buffer_map[b->offset].reset(b);
306 if (b->is_writing()) {
f67539c2
TL
307 // we might get already cached data for which resetting mempool is inppropriate
308 // hence calling try_assign_to_mempool
309 b->data.try_assign_to_mempool(mempool::mempool_bluestore_writing);
224ce89b
WB
310 if (writing.empty() || writing.rbegin()->seq <= b->seq) {
311 writing.push_back(*b);
312 } else {
313 auto it = writing.begin();
314 while (it->seq < b->seq) {
315 ++it;
316 }
317
11fdf7f2 318 ceph_assert(it->seq >= b->seq);
224ce89b
WB
319 // note that this will insert b before it
320 // hence the order is maintained
321 writing.insert(it, *b);
322 }
7c673cae 323 } else {
f67539c2
TL
324 b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
325 cache->_add(b, level, near);
7c673cae
FG
326 }
327 cache->_audit("_add_buffer end");
328 }
9f95a23c 329 void _rm_buffer(BufferCacheShard* cache, Buffer *b) {
7c673cae
FG
330 _rm_buffer(cache, buffer_map.find(b->offset));
331 }
9f95a23c 332 void _rm_buffer(BufferCacheShard* cache,
f67539c2 333 std::map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
11fdf7f2 334 ceph_assert(p != buffer_map.end());
7c673cae
FG
335 cache->_audit("_rm_buffer start");
336 if (p->second->is_writing()) {
337 writing.erase(writing.iterator_to(*p->second));
338 } else {
9f95a23c 339 cache->_rm(p->second.get());
7c673cae
FG
340 }
341 buffer_map.erase(p);
342 cache->_audit("_rm_buffer end");
343 }
344
f67539c2 345 std::map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
7c673cae
FG
346 uint32_t offset) {
347 auto i = buffer_map.lower_bound(offset);
348 if (i != buffer_map.begin()) {
349 --i;
350 if (i->first + i->second->length <= offset)
351 ++i;
352 }
353 return i;
354 }
355
356 // must be called under protection of the Cache lock
9f95a23c 357 void _clear(BufferCacheShard* cache);
7c673cae
FG
358
359 // return value is the highest cache_private of a trimmed buffer, or 0.
9f95a23c 360 int discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) {
11fdf7f2 361 std::lock_guard l(cache->lock);
9f95a23c
TL
362 int ret = _discard(cache, offset, length);
363 cache->_trim();
364 return ret;
7c673cae 365 }
9f95a23c 366 int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length);
7c673cae 367
f67539c2 368 void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, ceph::buffer::list& bl,
7c673cae 369 unsigned flags) {
11fdf7f2 370 std::lock_guard l(cache->lock);
7c673cae
FG
371 Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
372 flags);
373 b->cache_private = _discard(cache, offset, bl.length());
374 _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
9f95a23c 375 cache->_trim();
7c673cae 376 }
9f95a23c 377 void _finish_write(BufferCacheShard* cache, uint64_t seq);
f67539c2 378 void did_read(BufferCacheShard* cache, uint32_t offset, ceph::buffer::list& bl) {
11fdf7f2 379 std::lock_guard l(cache->lock);
7c673cae
FG
380 Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
381 b->cache_private = _discard(cache, offset, bl.length());
382 _add_buffer(cache, b, 1, nullptr);
9f95a23c 383 cache->_trim();
7c673cae
FG
384 }
385
9f95a23c 386 void read(BufferCacheShard* cache, uint32_t offset, uint32_t length,
7c673cae 387 BlueStore::ready_regions_t& res,
91327a77
AA
388 interval_set<uint32_t>& res_intervals,
389 int flags = 0);
7c673cae 390
9f95a23c 391 void truncate(BufferCacheShard* cache, uint32_t offset) {
7c673cae
FG
392 discard(cache, offset, (uint32_t)-1 - offset);
393 }
394
9f95a23c 395 void split(BufferCacheShard* cache, size_t pos, BufferSpace &r);
7c673cae 396
f67539c2 397 void dump(BufferCacheShard* cache, ceph::Formatter *f) const {
11fdf7f2 398 std::lock_guard l(cache->lock);
7c673cae
FG
399 f->open_array_section("buffers");
400 for (auto& i : buffer_map) {
401 f->open_object_section("buffer");
11fdf7f2 402 ceph_assert(i.first == i.second->offset);
7c673cae
FG
403 i.second->dump(f);
404 f->close_section();
405 }
406 f->close_section();
407 }
408 };
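  // Illustrative usage sketch (not upstream code; assumes a concrete
  // BufferCacheShard* `shard`, a transaction sequence number `txc_seq`, and a
  // ceph::buffer::list `bl`).  A write is cached in STATE_WRITING and promoted
  // (or trimmed, for FLAG_NOCACHE) once the transaction commits; reads are
  // then served from the cached buffers:
  //
  //   BufferSpace bs;
  //   bs.write(shard, txc_seq, 0x1000, bl, 0);   // cached as STATE_WRITING
  //   bs._finish_write(shard, txc_seq);          // -> STATE_CLEAN
  //
  //   BlueStore::ready_regions_t hits;
  //   interval_set<uint32_t> hit_ranges;
  //   bs.read(shard, 0x1000, bl.length(), hits, hit_ranges);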
409
410 struct SharedBlobSet;
411
412 /// in-memory shared blob state (incl cached buffers)
413 struct SharedBlob {
414 MEMPOOL_CLASS_HELPERS();
415
416 std::atomic_int nref = {0}; ///< reference count
417 bool loaded = false;
418
419 CollectionRef coll;
420 union {
421 uint64_t sbid_unloaded; ///< sbid if persistent isn't loaded
422 bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any
423 };
424 BufferSpace bc; ///< buffer cache
425
426 SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
427 if (get_cache()) {
428 get_cache()->add_blob();
429 }
430 }
431 SharedBlob(uint64_t i, Collection *_coll);
432 ~SharedBlob();
433
434 uint64_t get_sbid() const {
435 return loaded ? persistent->sbid : sbid_unloaded;
436 }
437
438 friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
439 friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }
440
f67539c2
TL
441 void dump(ceph::Formatter* f) const;
442 friend std::ostream& operator<<(std::ostream& out, const SharedBlob& sb);
7c673cae
FG
443
444 void get() {
445 ++nref;
446 }
447 void put();
448
449 /// get logical references
450 void get_ref(uint64_t offset, uint32_t length);
451
452 /// put logical references, and get back any released extents
453 void put_ref(uint64_t offset, uint32_t length,
11fdf7f2 454 PExtentVector *r, bool *unshare);
7c673cae 455
f64942e4
AA
456 void finish_write(uint64_t seq);
457
7c673cae
FG
458 friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
459 return l.get_sbid() == r.get_sbid();
460 }
9f95a23c 461 inline BufferCacheShard* get_cache() {
7c673cae
FG
462 return coll ? coll->cache : nullptr;
463 }
464 inline SharedBlobSet* get_parent() {
465 return coll ? &(coll->shared_blob_set) : nullptr;
466 }
467 inline bool is_loaded() const {
468 return loaded;
469 }
470
471 };
472 typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
473
474 /// a lookup table of SharedBlobs
475 struct SharedBlobSet {
11fdf7f2
TL
476 /// protect lookup, insertion, removal
477 ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");
7c673cae
FG
478
479 // we use a bare pointer because we don't want to affect the ref
480 // count
f91f0fd5 481 mempool::bluestore_cache_meta::unordered_map<uint64_t,SharedBlob*> sb_map;
7c673cae
FG
482
483 SharedBlobRef lookup(uint64_t sbid) {
11fdf7f2 484 std::lock_guard l(lock);
7c673cae 485 auto p = sb_map.find(sbid);
28e407b8
AA
486 if (p == sb_map.end() ||
487 p->second->nref == 0) {
7c673cae
FG
488 return nullptr;
489 }
490 return p->second;
491 }
492
493 void add(Collection* coll, SharedBlob *sb) {
11fdf7f2 494 std::lock_guard l(lock);
7c673cae
FG
495 sb_map[sb->get_sbid()] = sb;
496 sb->coll = coll;
497 }
498
91327a77 499 bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
11fdf7f2
TL
500 std::lock_guard l(lock);
501 ceph_assert(sb->get_parent() == this);
91327a77
AA
502 if (verify_nref_is_zero && sb->nref != 0) {
503 return false;
504 }
28e407b8
AA
505 // only remove if it still points to us
506 auto p = sb_map.find(sb->get_sbid());
507 if (p != sb_map.end() &&
508 p->second == sb) {
509 sb_map.erase(p);
510 }
91327a77 511 return true;
3efd9988
FG
512 }
513
7c673cae 514 bool empty() {
11fdf7f2 515 std::lock_guard l(lock);
7c673cae
FG
516 return sb_map.empty();
517 }
3efd9988 518
11fdf7f2
TL
519 template <int LogLevelV>
520 void dump(CephContext *cct);
7c673cae
FG
521 };
522
523//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
524
525 /// in-memory blob metadata and associated cached buffers (if any)
526 struct Blob {
527 MEMPOOL_CLASS_HELPERS();
528
529 std::atomic_int nref = {0}; ///< reference count
530 int16_t id = -1; ///< id, for spanning blobs only, >= 0
531 int16_t last_encoded_id = -1; ///< (ephemeral) used during encoding only
532 SharedBlobRef shared_blob; ///< shared blob state (if any)
533
534 private:
535 mutable bluestore_blob_t blob; ///< decoded blob metadata
536#ifdef CACHE_BLOB_BL
f67539c2 537 mutable ceph::buffer::list blob_bl; ///< cached encoded blob, blob is dirty if empty
7c673cae
FG
538#endif
539 /// refs from this shard. ephemeral if id<0, persisted if spanning.
540 bluestore_blob_use_tracker_t used_in_blob;
541
542 public:
543
544 friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
545 friend void intrusive_ptr_release(Blob *b) { b->put(); }
546
f67539c2
TL
547 void dump(ceph::Formatter* f) const;
548 friend std::ostream& operator<<(std::ostream& out, const Blob &b);
7c673cae
FG
549
550 const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
551 return used_in_blob;
552 }
553 bool is_referenced() const {
554 return used_in_blob.is_not_empty();
555 }
556 uint32_t get_referenced_bytes() const {
557 return used_in_blob.get_referenced_bytes();
558 }
559
560 bool is_spanning() const {
561 return id >= 0;
562 }
563
564 bool can_split() const {
11fdf7f2 565 std::lock_guard l(shared_blob->get_cache()->lock);
7c673cae
FG
566 // splitting a BufferSpace writing list is too hard; don't try.
567 return shared_blob->bc.writing.empty() &&
568 used_in_blob.can_split() &&
569 get_blob().can_split();
570 }
571
572 bool can_split_at(uint32_t blob_offset) const {
573 return used_in_blob.can_split_at(blob_offset) &&
574 get_blob().can_split_at(blob_offset);
575 }
576
224ce89b 577 bool can_reuse_blob(uint32_t min_alloc_size,
7c673cae
FG
578 uint32_t target_blob_size,
579 uint32_t b_offset,
580 uint32_t *length0);
581
582 void dup(Blob& o) {
583 o.shared_blob = shared_blob;
584 o.blob = blob;
585#ifdef CACHE_BLOB_BL
586 o.blob_bl = blob_bl;
587#endif
588 }
589
224ce89b 590 inline const bluestore_blob_t& get_blob() const {
7c673cae
FG
591 return blob;
592 }
224ce89b 593 inline bluestore_blob_t& dirty_blob() {
7c673cae
FG
594#ifdef CACHE_BLOB_BL
595 blob_bl.clear();
596#endif
597 return blob;
598 }
599
600 /// discard buffers for unallocated regions
601 void discard_unallocated(Collection *coll);
602
603 /// get logical references
604 void get_ref(Collection *coll, uint32_t offset, uint32_t length);
605 /// put logical references, and get back any released extents
606 bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
607 PExtentVector *r);
608
609 /// split the blob
610 void split(Collection *coll, uint32_t blob_offset, Blob *o);
611
612 void get() {
613 ++nref;
614 }
615 void put() {
616 if (--nref == 0)
617 delete this;
618 }
619
620
621#ifdef CACHE_BLOB_BL
622 void _encode() const {
623 if (blob_bl.length() == 0 ) {
11fdf7f2 624 encode(blob, blob_bl);
7c673cae 625 } else {
11fdf7f2 626 ceph_assert(blob_bl.length());
7c673cae
FG
627 }
628 }
629 void bound_encode(
630 size_t& p,
631 bool include_ref_map) const {
632 _encode();
633 p += blob_bl.length();
634 if (include_ref_map) {
635 used_in_blob.bound_encode(p);
636 }
637 }
638 void encode(
f67539c2 639 ceph::buffer::list::contiguous_appender& p,
7c673cae
FG
640 bool include_ref_map) const {
641 _encode();
642 p.append(blob_bl);
643 if (include_ref_map) {
644 used_in_blob.encode(p);
645 }
646 }
647 void decode(
648 Collection */*coll*/,
f67539c2 649 ceph::buffer::ptr::const_iterator& p,
7c673cae
FG
650 bool include_ref_map) {
651 const char *start = p.get_pos();
652 denc(blob, p);
653 const char *end = p.get_pos();
654 blob_bl.clear();
655 blob_bl.append(start, end - start);
656 if (include_ref_map) {
657 used_in_blob.decode(p);
658 }
659 }
660#else
661 void bound_encode(
662 size_t& p,
663 uint64_t struct_v,
664 uint64_t sbid,
665 bool include_ref_map) const {
666 denc(blob, p, struct_v);
667 if (blob.is_shared()) {
668 denc(sbid, p);
669 }
670 if (include_ref_map) {
671 used_in_blob.bound_encode(p);
672 }
673 }
674 void encode(
f67539c2 675 ceph::buffer::list::contiguous_appender& p,
7c673cae
FG
676 uint64_t struct_v,
677 uint64_t sbid,
678 bool include_ref_map) const {
679 denc(blob, p, struct_v);
680 if (blob.is_shared()) {
681 denc(sbid, p);
682 }
683 if (include_ref_map) {
684 used_in_blob.encode(p);
685 }
686 }
687 void decode(
688 Collection *coll,
f67539c2 689 ceph::buffer::ptr::const_iterator& p,
7c673cae
FG
690 uint64_t struct_v,
691 uint64_t* sbid,
692 bool include_ref_map);
693#endif
694 };
695 typedef boost::intrusive_ptr<Blob> BlobRef;
f91f0fd5 696 typedef mempool::bluestore_cache_meta::map<int,BlobRef> blob_map_t;
7c673cae
FG
697
698 /// a logical extent, pointing to (some portion of) a blob
699 typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
700 struct Extent : public ExtentBase {
701 MEMPOOL_CLASS_HELPERS();
702
703 uint32_t logical_offset = 0; ///< logical offset
704 uint32_t blob_offset = 0; ///< blob offset
705 uint32_t length = 0; ///< length
706 BlobRef blob; ///< the blob with our data
707
708 /// ctor for lookup only
709 explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
710 /// ctor for delayed initialization (see decode_some())
711 explicit Extent() : ExtentBase() {
712 }
713 /// ctor for general usage
714 Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
715 : ExtentBase(),
716 logical_offset(lo), blob_offset(o), length(l) {
717 assign_blob(b);
718 }
719 ~Extent() {
720 if (blob) {
721 blob->shared_blob->get_cache()->rm_extent();
722 }
723 }
724
f67539c2 725 void dump(ceph::Formatter* f) const;
9f95a23c 726
7c673cae 727 void assign_blob(const BlobRef& b) {
11fdf7f2 728 ceph_assert(!blob);
7c673cae
FG
729 blob = b;
730 blob->shared_blob->get_cache()->add_extent();
731 }
732
733 // comparators for intrusive_set
734 friend bool operator<(const Extent &a, const Extent &b) {
735 return a.logical_offset < b.logical_offset;
736 }
737 friend bool operator>(const Extent &a, const Extent &b) {
738 return a.logical_offset > b.logical_offset;
739 }
740 friend bool operator==(const Extent &a, const Extent &b) {
741 return a.logical_offset == b.logical_offset;
742 }
743
744 uint32_t blob_start() const {
745 return logical_offset - blob_offset;
746 }
747
748 uint32_t blob_end() const {
749 return blob_start() + blob->get_blob().get_logical_length();
750 }
751
752 uint32_t logical_end() const {
753 return logical_offset + length;
754 }
755
756 // return true if any piece of the blob is out of
757 // the given range [o, o + l].
758 bool blob_escapes_range(uint32_t o, uint32_t l) const {
759 return blob_start() < o || blob_end() > o + l;
760 }
761 };
762 typedef boost::intrusive::set<Extent> extent_map_t;
763
764
f67539c2 765 friend std::ostream& operator<<(std::ostream& out, const Extent& e);
7c673cae
FG
766
767 struct OldExtent {
768 boost::intrusive::list_member_hook<> old_extent_item;
769 Extent e;
770 PExtentVector r;
771 bool blob_empty; // flag to track the last removed extent that makes blob
772 // empty - required to update compression stat properly
773 OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
774 : e(lo, o, l, b), blob_empty(false) {
775 }
776 static OldExtent* create(CollectionRef c,
777 uint32_t lo,
778 uint32_t o,
779 uint32_t l,
780 BlobRef& b);
781 };
782 typedef boost::intrusive::list<
783 OldExtent,
784 boost::intrusive::member_hook<
785 OldExtent,
786 boost::intrusive::list_member_hook<>,
787 &OldExtent::old_extent_item> > old_extent_map_t;
788
789 struct Onode;
790
791 /// a sharded extent map, mapping offsets to lextents to blobs
792 struct ExtentMap {
793 Onode *onode;
794 extent_map_t extent_map; ///< map of Extents to Blobs
795 blob_map_t spanning_blob_map; ///< blobs that span shards
11fdf7f2 796 typedef boost::intrusive_ptr<Onode> OnodeRef;
7c673cae
FG
797
798 struct Shard {
799 bluestore_onode_t::shard_info *shard_info = nullptr;
800 unsigned extents = 0; ///< count extents in this shard
801 bool loaded = false; ///< true if shard is loaded
802 bool dirty = false; ///< true if shard is dirty and needs reencoding
803 };
f91f0fd5 804 mempool::bluestore_cache_meta::vector<Shard> shards; ///< shards
7c673cae 805
f67539c2 806 ceph::buffer::list inline_bl; ///< cached encoded map, if unsharded; empty=>dirty
7c673cae
FG
807
808 uint32_t needs_reshard_begin = 0;
809 uint32_t needs_reshard_end = 0;
810
11fdf7f2
TL
811 void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
812 uint64_t&, uint64_t&, uint64_t&);
813
7c673cae
FG
814 bool needs_reshard() const {
815 return needs_reshard_end > needs_reshard_begin;
816 }
817 void clear_needs_reshard() {
818 needs_reshard_begin = needs_reshard_end = 0;
819 }
820 void request_reshard(uint32_t begin, uint32_t end) {
821 if (begin < needs_reshard_begin) {
822 needs_reshard_begin = begin;
823 }
824 if (end > needs_reshard_end) {
825 needs_reshard_end = end;
826 }
827 }
828
829 struct DeleteDisposer {
830 void operator()(Extent *e) { delete e; }
831 };
832
833 ExtentMap(Onode *o);
834 ~ExtentMap() {
835 extent_map.clear_and_dispose(DeleteDisposer());
836 }
837
838 void clear() {
839 extent_map.clear_and_dispose(DeleteDisposer());
840 shards.clear();
841 inline_bl.clear();
842 clear_needs_reshard();
843 }
844
f67539c2 845 void dump(ceph::Formatter* f) const;
9f95a23c 846
f67539c2 847 bool encode_some(uint32_t offset, uint32_t length, ceph::buffer::list& bl,
7c673cae 848 unsigned *pn);
f67539c2 849 unsigned decode_some(ceph::buffer::list& bl);
7c673cae
FG
850
851 void bound_encode_spanning_blobs(size_t& p);
f67539c2
TL
852 void encode_spanning_blobs(ceph::buffer::list::contiguous_appender& p);
853 void decode_spanning_blobs(ceph::buffer::ptr::const_iterator& p);
7c673cae
FG
854
855 BlobRef get_spanning_blob(int id) {
856 auto p = spanning_blob_map.find(id);
11fdf7f2 857 ceph_assert(p != spanning_blob_map.end());
7c673cae
FG
858 return p->second;
859 }
860
861 void update(KeyValueDB::Transaction t, bool force);
31f18b77 862 decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
7c673cae
FG
863 void reshard(
864 KeyValueDB *db,
865 KeyValueDB::Transaction t);
866
867 /// initialize Shards from the onode
868 void init_shards(bool loaded, bool dirty);
869
870 /// return index of shard containing offset
871 /// or -1 if not found
872 int seek_shard(uint32_t offset) {
873 size_t end = shards.size();
874 size_t mid, left = 0;
875 size_t right = end; // one passed the right end
876
877 while (left < right) {
878 mid = left + (right - left) / 2;
879 if (offset >= shards[mid].shard_info->offset) {
880 size_t next = mid + 1;
881 if (next >= end || offset < shards[next].shard_info->offset)
882 return mid;
883 //continue to search forwards
884 left = next;
885 } else {
886 //continue to search backwards
887 right = mid;
888 }
889 }
890
891 return -1; // not found
892 }
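    // Worked example (illustrative, not upstream code): with shards whose
    // shard_info->offset values are {0x0, 0x10000, 0x20000},
    //
    //   seek_shard(0x00fff) == 0   // falls inside the first shard
    //   seek_shard(0x12345) == 1   // falls inside the second shard
    //   seek_shard(0x25000) == 2   // at or beyond the last shard's start
    //
    // -1 is only returned when offset precedes the first shard's start, which
    // cannot happen when the first shard starts at offset 0.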
893
894 /// check if a range spans a shard
895 bool spans_shard(uint32_t offset, uint32_t length) {
896 if (shards.empty()) {
897 return false;
898 }
899 int s = seek_shard(offset);
11fdf7f2 900 ceph_assert(s >= 0);
7c673cae
FG
901 if (s == (int)shards.size() - 1) {
902 return false; // last shard
903 }
904 if (offset + length <= shards[s+1].shard_info->offset) {
905 return false;
906 }
907 return true;
908 }
909
910 /// ensure that a range of the map is loaded
911 void fault_range(KeyValueDB *db,
912 uint32_t offset, uint32_t length);
913
914 /// ensure a range of the map is marked dirty
31f18b77 915 void dirty_range(uint32_t offset, uint32_t length);
7c673cae 916
31f18b77 917 /// for seek_lextent test
7c673cae
FG
918 extent_map_t::iterator find(uint64_t offset);
919
7c673cae
FG
920 /// seek to the first lextent including or after offset
921 extent_map_t::iterator seek_lextent(uint64_t offset);
922 extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
923
924 /// add a new Extent
925 void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
926 extent_map.insert(*new Extent(lo, o, l, b));
927 }
928
929 /// remove (and delete) an Extent
930 void rm(extent_map_t::iterator p) {
931 extent_map.erase_and_dispose(p, DeleteDisposer());
932 }
933
934 bool has_any_lextents(uint64_t offset, uint64_t length);
935
936 /// consolidate adjacent lextents in extent_map
937 int compress_extent_map(uint64_t offset, uint64_t length);
938
939 /// punch a logical hole. add lextents to deref to target list.
940 void punch_hole(CollectionRef &c,
941 uint64_t offset, uint64_t length,
942 old_extent_map_t *old_extents);
943
944 /// put new lextent into lextent_map overwriting existing ones if
945 /// any and update references accordingly
946 Extent *set_lextent(CollectionRef &c,
947 uint64_t logical_offset,
948 uint64_t offset, uint64_t length,
949 BlobRef b,
950 old_extent_map_t *old_extents);
951
952 /// split a blob (and referring extents)
953 BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
954 };
955
956 /// Compressed Blob Garbage collector
957 /*
958 The primary idea of the collector is to estimate a difference between
959 allocation units(AU) currently present for compressed blobs and new AUs
960 required to store that data uncompressed.
961 Estimation is performed for protrusive extents within a logical range
962 determined by a concatenation of old_extents collection and specific(current)
963 write request.
964 The root cause for old_extents use is the need to handle blob ref counts
965 properly. Old extents still hold blob refs and hence we need to traverse
966 the collection to determine if blob to be released.
f67539c2 967 Protrusive extents are extents that fit into the blob std::set in action
7c673cae
FG
968 (ones that are below the logical range from above) but not removed totally
969 due to the current write.
970 E.g. for
971 extent1 <loffs = 100, boffs = 100, len = 100> ->
972 blob1<compressed, len_on_disk=4096, logical_len=8192>
973 extent2 <loffs = 200, boffs = 200, len = 100> ->
974 blob2<raw, len_on_disk=4096, llen=4096>
975 extent3 <loffs = 300, boffs = 300, len = 100> ->
976 blob1<compressed, len_on_disk=4096, llen=8192>
977 extent4 <loffs = 4096, boffs = 0, len = 100> ->
978 blob3<raw, len_on_disk=4096, llen=4096>
979 write(300~100)
980 protrusive extents are within the following ranges <0~300, 400~8192-400>
981 In this case existing AUs that might be removed due to GC (i.e. blob1)
982 use 2x4K bytes.
983 And new AUs expected after GC = 0 since extent1 to be merged into blob2.
984 Hence we should do a collect.
985 */
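  /*
    Illustrative driver sketch (not upstream code): the collector is meant to
    be consulted from the write path, roughly as follows, where `em` is the
    onode's ExtentMap and `old_extents` holds the lextents displaced by the
    current write:

      GarbageCollector gc(cct);
      int64_t expected_gain = gc.estimate(offset, length, em, old_extents,
                                          min_alloc_size);
      if (expected_gain > 0) {
        const interval_set<uint64_t>& to_collect = gc.get_extents_to_collect();
        // rewrite `to_collect` uncompressed so the compressed AUs can be freed
      }
  */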
986 class GarbageCollector
987 {
988 public:
989 /// return amount of allocation units that might be saved due to GC
990 int64_t estimate(
991 uint64_t offset,
992 uint64_t length,
993 const ExtentMap& extent_map,
994 const old_extent_map_t& old_extents,
995 uint64_t min_alloc_size);
996
997 /// return a collection of extents to perform GC on
eafe8130 998 const interval_set<uint64_t>& get_extents_to_collect() const {
7c673cae
FG
999 return extents_to_collect;
1000 }
1001 GarbageCollector(CephContext* _cct) : cct(_cct) {}
1002
1003 private:
1004 struct BlobInfo {
1005 uint64_t referenced_bytes = 0; ///< amount of bytes referenced in blob
1006 int64_t expected_allocations = 0; ///< new alloc units required
1007 ///< in case of gc fulfilled
1008 bool collect_candidate = false; ///< indicate if blob has any extents
1009 ///< eligible for GC.
1010 extent_map_t::const_iterator first_lextent; ///< points to the first
1011 ///< lextent referring to
1012 ///< the blob if any.
1013 ///< collect_candidate flag
1014 ///< determines the validity
1015 extent_map_t::const_iterator last_lextent; ///< points to the last
1016 ///< lextent referring to
1017 ///< the blob if any.
1018
1019 BlobInfo(uint64_t ref_bytes) :
1020 referenced_bytes(ref_bytes) {
1021 }
1022 };
1023 CephContext* cct;
f67539c2 1024 std::map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
7c673cae
FG
1025 ///< copies that are affected by the
1026 ///< specific write
1027
a8e16298 1028 ///< protrusive extents that should be collected if GC takes place
eafe8130 1029 interval_set<uint64_t> extents_to_collect;
7c673cae
FG
1030
1031 boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
1032 ///< unit when traversing
1033 ///< protrusive extents.
1034 ///< Other extents mapped to
1035 ///< this AU to be ignored
1036 ///< (except the case where
1037 ///< uncompressed extent follows
1038 ///< compressed one - see below).
f67539c2 1039 BlobInfo* blob_info_counted = nullptr; ///< std::set if previous allocation unit
7c673cae
FG
1040 ///< caused expected_allocations
1041 ///< counter increment at this blob.
1042 ///< if uncompressed extent follows
1043 ///< a decrement for the
1044 ///< expected_allocations counter
1045 ///< is needed
1046 int64_t expected_allocations = 0; ///< new alloc units required in case
1047 ///< of gc fulfilled
1048 int64_t expected_for_release = 0; ///< alloc units currently used by
1049 ///< compressed blobs that might
1050 ///< gone after GC
7c673cae
FG
1051
1052 protected:
1053 void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
1054 uint64_t start_offset,
1055 uint64_t end_offset,
1056 uint64_t start_touch_offset,
1057 uint64_t end_touch_offset,
1058 uint64_t min_alloc_size);
1059 };
1060
1061 struct OnodeSpace;
7c673cae
FG
1062 /// an in-memory object
1063 struct Onode {
1064 MEMPOOL_CLASS_HELPERS();
1065
1066 std::atomic_int nref; ///< reference count
1067 Collection *c;
7c673cae
FG
1068 ghobject_t oid;
1069
1070 /// key under PREFIX_OBJ where we are stored
f91f0fd5 1071 mempool::bluestore_cache_meta::string key;
7c673cae 1072
f6b5b4d7 1073 boost::intrusive::list_member_hook<> lru_item;
7c673cae
FG
1074
1075 bluestore_onode_t onode; ///< metadata stored as value in kv store
1076 bool exists; ///< true if object logically exists
f6b5b4d7
TL
1077 bool cached; ///< Onode is logically in the cache
1078 /// (it can be pinned and hence physically out
1079 /// of it at the moment though)
adb31ebb 1080 std::atomic_bool pinned; ///< Onode is pinned
f6b5b4d7 1081 /// (or should be pinned when cached)
7c673cae
FG
1082 ExtentMap extent_map;
1083
1084 // track txc's that have not been committed to kv store (and whose
1085 // effects cannot be read via the kvdb read methods)
1086 std::atomic<int> flushing_count = {0};
9f95a23c 1087 std::atomic<int> waiting_count = {0};
11fdf7f2
TL
1088 /// protect flush_txns
1089 ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
1090 ceph::condition_variable flush_cond; ///< wait here for uncommitted txns
7c673cae
FG
1091
1092 Onode(Collection *c, const ghobject_t& o,
f91f0fd5 1093 const mempool::bluestore_cache_meta::string& k)
f6b5b4d7 1094 : nref(0),
7c673cae
FG
1095 c(c),
1096 oid(o),
1097 key(k),
1098 exists(false),
f6b5b4d7
TL
1099 cached(false),
1100 pinned(false),
7c673cae
FG
1101 extent_map(this) {
1102 }
eafe8130 1103 Onode(Collection* c, const ghobject_t& o,
f6b5b4d7
TL
1104 const std::string& k)
1105 : nref(0),
eafe8130
TL
1106 c(c),
1107 oid(o),
1108 key(k),
1109 exists(false),
f6b5b4d7
TL
1110 cached(false),
1111 pinned(false),
eafe8130
TL
1112 extent_map(this) {
1113 }
1114 Onode(Collection* c, const ghobject_t& o,
1115 const char* k)
f6b5b4d7 1116 : nref(0),
eafe8130
TL
1117 c(c),
1118 oid(o),
1119 key(k),
1120 exists(false),
f6b5b4d7
TL
1121 cached(false),
1122 pinned(false),
eafe8130
TL
1123 extent_map(this) {
1124 }
1125
1126 static Onode* decode(
1127 CollectionRef c,
1128 const ghobject_t& oid,
f67539c2
TL
1129 const std::string& key,
1130 const ceph::buffer::list& v);
7c673cae 1131
f67539c2 1132 void dump(ceph::Formatter* f) const;
9f95a23c 1133
7c673cae 1134 void flush();
f6b5b4d7
TL
1135 void get();
1136 void put();
1137
1138 inline bool put_cache() {
1139 ceph_assert(!cached);
1140 cached = true;
1141 return !pinned;
7c673cae 1142 }
f6b5b4d7
TL
1143 inline bool pop_cache() {
1144 ceph_assert(cached);
1145 cached = false;
1146 return !pinned;
7c673cae 1147 }
9f95a23c 1148
f67539c2
TL
1149 const std::string& get_omap_prefix();
1150 void get_omap_header(std::string *out);
1151 void get_omap_key(const std::string& key, std::string *out);
1152 void rewrite_omap_key(const std::string& old, std::string *out);
1153 void get_omap_tail(std::string *out);
1154 void decode_omap_key(const std::string& key, std::string *user_key);
1155
1156 // Return the offset of an object on disk. This function is intended *only*
1157 // for use with zoned storage devices because in these devices, the objects
1158 // are laid out contiguously on disk, which is not the case in general.
1159 // Also, it should always be called after calling extent_map.fault_range(),
1160 // so that the extent map is loaded.
1161 int64_t zoned_get_ondisk_starting_offset() const {
1162 return extent_map.extent_map.begin()->blob->
1163 get_blob().calc_offset(0, nullptr);
1164 }
7c673cae
FG
1165 };
1166 typedef boost::intrusive_ptr<Onode> OnodeRef;
1167
9f95a23c
TL
1168 /// A generic Cache Shard
1169 struct CacheShard {
1170 CephContext *cct;
7c673cae 1171 PerfCounters *logger;
11fdf7f2
TL
1172
1173 /// protect lru and other structures
1174 ceph::recursive_mutex lock = {
9f95a23c 1175 ceph::make_recursive_mutex("BlueStore::CacheShard::lock") };
7c673cae 1176
9f95a23c
TL
1177 std::atomic<uint64_t> max = {0};
1178 std::atomic<uint64_t> num = {0};
7c673cae 1179
9f95a23c
TL
1180 CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {}
1181 virtual ~CacheShard() {}
7c673cae 1182
9f95a23c
TL
1183 void set_max(uint64_t max_) {
1184 max = max_;
7c673cae
FG
1185 }
1186
9f95a23c
TL
1187 uint64_t _get_num() {
1188 return num;
7c673cae 1189 }
7c673cae 1190
9f95a23c
TL
1191 virtual void _trim_to(uint64_t new_size) = 0;
1192 void _trim() {
1193 if (cct->_conf->objectstore_blackhole) {
1194 // do not trim if we are throwing away IOs a layer down
1195 return;
1196 }
1197 _trim_to(max);
1198 }
7c673cae 1199
9f95a23c 1200 void trim() {
11fdf7f2 1201 std::lock_guard l(lock);
9f95a23c
TL
1202 _trim();
1203 }
1204 void flush() {
1205 std::lock_guard l(lock);
1206 // we should not be shutting down after the blackhole is enabled
1207 assert(!cct->_conf->objectstore_blackhole);
1208 _trim_to(0);
31f18b77
FG
1209 }
1210
7c673cae
FG
1211#ifdef DEBUG_CACHE
1212 virtual void _audit(const char *s) = 0;
1213#else
1214 void _audit(const char *s) { /* no-op */ }
1215#endif
1216 };
1217
9f95a23c
TL
1218 /// A Generic onode Cache Shard
1219 struct OnodeCacheShard : public CacheShard {
1220 std::atomic<uint64_t> num_pinned = {0};
7c673cae 1221
f67539c2 1222 std::array<std::pair<ghobject_t, ceph::mono_clock::time_point>, 64> dumped_onodes;
f6b5b4d7
TL
1223
1224 virtual void _pin(Onode* o) = 0;
1225 virtual void _unpin(Onode* o) = 0;
1226
7c673cae 1227 public:
9f95a23c 1228 OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
f67539c2 1229 static OnodeCacheShard *create(CephContext* cct, std::string type,
9f95a23c 1230 PerfCounters *logger);
f6b5b4d7
TL
1231 virtual void _add(Onode* o, int level) = 0;
1232 virtual void _rm(Onode* o) = 0;
adb31ebb 1233 virtual void _unpin_and_rm(Onode* o) = 0;
7c673cae 1234
f6b5b4d7 1235 virtual void move_pinned(OnodeCacheShard *to, Onode *o) = 0;
9f95a23c
TL
1236 virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0;
1237 bool empty() {
1238 return _get_num() == 0;
1239 }
7c673cae
FG
1240 };
1241
9f95a23c
TL
1242 /// A Generic buffer Cache Shard
1243 struct BufferCacheShard : public CacheShard {
1244 std::atomic<uint64_t> num_extents = {0};
1245 std::atomic<uint64_t> num_blobs = {0};
1246 uint64_t buffer_bytes = 0;
7c673cae
FG
1247
1248 public:
9f95a23c 1249 BufferCacheShard(CephContext* cct) : CacheShard(cct) {}
f67539c2 1250 static BufferCacheShard *create(CephContext* cct, std::string type,
9f95a23c
TL
1251 PerfCounters *logger);
1252 virtual void _add(Buffer *b, int level, Buffer *near) = 0;
1253 virtual void _rm(Buffer *b) = 0;
1254 virtual void _move(BufferCacheShard *src, Buffer *b) = 0;
1255 virtual void _touch(Buffer *b) = 0;
1256 virtual void _adjust_size(Buffer *b, int64_t delta) = 0;
1257
1258 uint64_t _get_bytes() {
1259 return buffer_bytes;
7c673cae 1260 }
9f95a23c
TL
1261
1262 void add_extent() {
1263 ++num_extents;
7c673cae 1264 }
9f95a23c
TL
1265 void rm_extent() {
1266 --num_extents;
7c673cae 1267 }
7c673cae 1268
9f95a23c
TL
1269 void add_blob() {
1270 ++num_blobs;
7c673cae 1271 }
9f95a23c
TL
1272 void rm_blob() {
1273 --num_blobs;
7c673cae
FG
1274 }
1275
9f95a23c
TL
1276 virtual void add_stats(uint64_t *extents,
1277 uint64_t *blobs,
1278 uint64_t *buffers,
1279 uint64_t *bytes) = 0;
1280
1281 bool empty() {
1282 std::lock_guard l(lock);
1283 return _get_bytes() == 0;
1284 }
7c673cae
FG
1285 };
1286
1287 struct OnodeSpace {
9f95a23c 1288 OnodeCacheShard *cache;
7c673cae 1289
9f95a23c 1290 private:
7c673cae 1291 /// forward lookups
f91f0fd5 1292 mempool::bluestore_cache_meta::unordered_map<ghobject_t,OnodeRef> onode_map;
7c673cae 1293
adb31ebb
TL
1294 friend struct Collection; // for split_cache()
1295 friend struct Onode; // for put()
f6b5b4d7
TL
1296 friend struct LruOnodeCacheShard;
1297 void _remove(const ghobject_t& oid);
7c673cae 1298 public:
9f95a23c 1299 OnodeSpace(OnodeCacheShard *c) : cache(c) {}
7c673cae
FG
1300 ~OnodeSpace() {
1301 clear();
1302 }
1303
f6b5b4d7 1304 OnodeRef add(const ghobject_t& oid, OnodeRef& o);
7c673cae 1305 OnodeRef lookup(const ghobject_t& o);
7c673cae
FG
1306 void rename(OnodeRef& o, const ghobject_t& old_oid,
1307 const ghobject_t& new_oid,
f91f0fd5 1308 const mempool::bluestore_cache_meta::string& new_okey);
7c673cae
FG
1309 void clear();
1310 bool empty();
1311
11fdf7f2
TL
1312 template <int LogLevelV>
1313 void dump(CephContext *cct);
3efd9988 1314
7c673cae 1315 /// return true if f true for any item
adb31ebb 1316 bool map_any(std::function<bool(Onode*)> f);
7c673cae
FG
1317 };
1318
11fdf7f2 1319 class OpSequencer;
9f95a23c 1320 using OpSequencerRef = ceph::ref_t<OpSequencer>;
11fdf7f2 1321
7c673cae
FG
1322 struct Collection : public CollectionImpl {
1323 BlueStore *store;
11fdf7f2 1324 OpSequencerRef osr;
9f95a23c 1325 BufferCacheShard *cache; ///< our cache shard
7c673cae 1326 bluestore_cnode_t cnode;
9f95a23c
TL
1327 ceph::shared_mutex lock =
1328 ceph::make_shared_mutex("BlueStore::Collection::lock", true, false);
7c673cae
FG
1329
1330 bool exists;
1331
1332 SharedBlobSet shared_blob_set; ///< open SharedBlobs
1333
1334 // cache onodes on a per-collection basis to avoid lock
1335 // contention.
1336 OnodeSpace onode_map;
1337
1338 //pool options
1339 pool_opts_t pool_opts;
11fdf7f2 1340 ContextQueue *commit_queue;
7c673cae 1341
f6b5b4d7
TL
1342 OnodeCacheShard* get_onode_cache() const {
1343 return onode_map.cache;
1344 }
9f95a23c 1345 OnodeRef get_onode(const ghobject_t& oid, bool create, bool is_createop=false);
7c673cae
FG
1346
1347 // the terminology is confusing here, sorry!
1348 //
1349 // blob_t shared_blob_t
1350 // !shared unused -> open
1351 // shared !loaded -> open + shared
1352 // shared loaded -> open + shared + loaded
1353 //
1354 // i.e.,
1355 // open = SharedBlob is instantiated
f67539c2 1356 // shared = blob_t shared flag is std::set; SharedBlob is hashed.
7c673cae
FG
1357 // loaded = SharedBlob::shared_blob_t is loaded from kv store
1358 void open_shared_blob(uint64_t sbid, BlobRef b);
1359 void load_shared_blob(SharedBlobRef sb);
1360 void make_blob_shared(uint64_t sbid, BlobRef b);
31f18b77 1361 uint64_t make_blob_unshared(SharedBlob *sb);
7c673cae
FG
1362
1363 BlobRef new_blob() {
1364 BlobRef b = new Blob();
1365 b->shared_blob = new SharedBlob(this);
1366 return b;
1367 }
1368
7c673cae
FG
1369 bool contains(const ghobject_t& oid) {
1370 if (cid.is_meta())
1371 return oid.hobj.pool == -1;
1372 spg_t spgid;
1373 if (cid.is_pg(&spgid))
1374 return
1375 spgid.pgid.contains(cnode.bits, oid) &&
1376 oid.shard_id == spgid.shard;
1377 return false;
1378 }
1379
9f95a23c
TL
1380 int64_t pool() const {
1381 return cid.pool();
1382 }
1383
7c673cae 1384 void split_cache(Collection *dest);
7c673cae 1385
11fdf7f2
TL
1386 bool flush_commit(Context *c) override;
1387 void flush() override;
1388 void flush_all_but_last();
1389
9f95a23c 1390 Collection(BlueStore *ns, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t c);
7c673cae
FG
1391 };
1392
1393 class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
1394 CollectionRef c;
1395 OnodeRef o;
1396 KeyValueDB::Iterator it;
f67539c2 1397 std::string head, tail;
11fdf7f2 1398
f67539c2 1399 std::string _stringify() const;
11fdf7f2 1400
7c673cae
FG
1401 public:
1402 OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
1403 int seek_to_first() override;
f67539c2
TL
1404 int upper_bound(const std::string &after) override;
1405 int lower_bound(const std::string &to) override;
7c673cae 1406 bool valid() override;
11fdf7f2 1407 int next() override;
f67539c2
TL
1408 std::string key() override;
1409 ceph::buffer::list value() override;
1410 std::string tail_key() override {
9f95a23c
TL
1411 return tail;
1412 }
1413
7c673cae
FG
1414 int status() override {
1415 return 0;
1416 }
1417 };
1418
31f18b77
FG
1419 struct volatile_statfs{
1420 enum {
1421 STATFS_ALLOCATED = 0,
1422 STATFS_STORED,
1423 STATFS_COMPRESSED_ORIGINAL,
1424 STATFS_COMPRESSED,
1425 STATFS_COMPRESSED_ALLOCATED,
1426 STATFS_LAST
1427 };
1428 int64_t values[STATFS_LAST];
1429 volatile_statfs() {
1430 memset(this, 0, sizeof(volatile_statfs));
1431 }
1432 void reset() {
1433 *this = volatile_statfs();
1434 }
11fdf7f2
TL
1435 void publish(store_statfs_t* buf) const {
1436 buf->allocated = allocated();
1437 buf->data_stored = stored();
1438 buf->data_compressed = compressed();
1439 buf->data_compressed_original = compressed_original();
1440 buf->data_compressed_allocated = compressed_allocated();
1441 }
1442
31f18b77
FG
1443 volatile_statfs& operator+=(const volatile_statfs& other) {
1444 for (size_t i = 0; i < STATFS_LAST; ++i) {
1445 values[i] += other.values[i];
1446 }
1447 return *this;
1448 }
1449 int64_t& allocated() {
1450 return values[STATFS_ALLOCATED];
1451 }
1452 int64_t& stored() {
1453 return values[STATFS_STORED];
1454 }
1455 int64_t& compressed_original() {
1456 return values[STATFS_COMPRESSED_ORIGINAL];
1457 }
1458 int64_t& compressed() {
1459 return values[STATFS_COMPRESSED];
1460 }
1461 int64_t& compressed_allocated() {
1462 return values[STATFS_COMPRESSED_ALLOCATED];
1463 }
11fdf7f2
TL
1464 int64_t allocated() const {
1465 return values[STATFS_ALLOCATED];
1466 }
1467 int64_t stored() const {
1468 return values[STATFS_STORED];
1469 }
1470 int64_t compressed_original() const {
1471 return values[STATFS_COMPRESSED_ORIGINAL];
1472 }
1473 int64_t compressed() const {
1474 return values[STATFS_COMPRESSED];
1475 }
1476 int64_t compressed_allocated() const {
1477 return values[STATFS_COMPRESSED_ALLOCATED];
1478 }
1479 volatile_statfs& operator=(const store_statfs_t& st) {
1480 values[STATFS_ALLOCATED] = st.allocated;
1481 values[STATFS_STORED] = st.data_stored;
1482 values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
1483 values[STATFS_COMPRESSED] = st.data_compressed;
1484 values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
1485 return *this;
1486 }
31f18b77
FG
1487 bool is_empty() {
1488 return values[STATFS_ALLOCATED] == 0 &&
1489 values[STATFS_STORED] == 0 &&
1490 values[STATFS_COMPRESSED] == 0 &&
1491 values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
1492 values[STATFS_COMPRESSED_ALLOCATED] == 0;
1493 }
f67539c2 1494 void decode(ceph::buffer::list::const_iterator& it) {
11fdf7f2 1495 using ceph::decode;
31f18b77 1496 for (size_t i = 0; i < STATFS_LAST; i++) {
11fdf7f2 1497 decode(values[i], it);
31f18b77
FG
1498 }
1499 }
1500
f67539c2 1501 void encode(ceph::buffer::list& bl) {
11fdf7f2 1502 using ceph::encode;
31f18b77 1503 for (size_t i = 0; i < STATFS_LAST; i++) {
11fdf7f2 1504 encode(values[i], bl);
31f18b77
FG
1505 }
1506 }
1507 };
1508
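  // Illustrative sketch (not upstream code): a TransContext accumulates a
  // volatile_statfs delta which the store later folds into its totals and
  // exports via publish():
  //
  //   volatile_statfs delta;
  //   delta.allocated() += 0x10000;   // space newly allocated by this txc
  //   delta.stored()    += 4096;      // logical bytes stored
  //
  //   volatile_statfs total;
  //   total += delta;
  //
  //   store_statfs_t out;
  //   total.publish(&out);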
11fdf7f2 1509 struct TransContext final : public AioContext {
31f18b77
FG
1510 MEMPOOL_CLASS_HELPERS();
1511
7c673cae
FG
1512 typedef enum {
1513 STATE_PREPARE,
1514 STATE_AIO_WAIT,
1515 STATE_IO_DONE,
1516 STATE_KV_QUEUED, // queued for kv_sync_thread submission
1517 STATE_KV_SUBMITTED, // submitted to kv; not yet synced
1518 STATE_KV_DONE,
1519 STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
1520 STATE_DEFERRED_CLEANUP, // remove deferred kv record
1521 STATE_DEFERRED_DONE,
1522 STATE_FINISHING,
1523 STATE_DONE,
1524 } state_t;
1525
7c673cae
FG
1526 const char *get_state_name() {
1527 switch (state) {
1528 case STATE_PREPARE: return "prepare";
1529 case STATE_AIO_WAIT: return "aio_wait";
1530 case STATE_IO_DONE: return "io_done";
1531 case STATE_KV_QUEUED: return "kv_queued";
1532 case STATE_KV_SUBMITTED: return "kv_submitted";
1533 case STATE_KV_DONE: return "kv_done";
1534 case STATE_DEFERRED_QUEUED: return "deferred_queued";
1535 case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
1536 case STATE_DEFERRED_DONE: return "deferred_done";
1537 case STATE_FINISHING: return "finishing";
1538 case STATE_DONE: return "done";
1539 }
1540 return "???";
1541 }
1542
9f95a23c 1543#if defined(WITH_LTTNG)
7c673cae
FG
1544 const char *get_state_latency_name(int state) {
1545 switch (state) {
1546 case l_bluestore_state_prepare_lat: return "prepare";
1547 case l_bluestore_state_aio_wait_lat: return "aio_wait";
1548 case l_bluestore_state_io_done_lat: return "io_done";
1549 case l_bluestore_state_kv_queued_lat: return "kv_queued";
1550 case l_bluestore_state_kv_committing_lat: return "kv_committing";
1551 case l_bluestore_state_kv_done_lat: return "kv_done";
1552 case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
1553 case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
1554 case l_bluestore_state_finishing_lat: return "finishing";
1555 case l_bluestore_state_done_lat: return "done";
1556 }
1557 return "???";
1558 }
1559#endif
1560
f67539c2
TL
1561 inline void set_state(state_t s) {
1562 state = s;
1563#ifdef WITH_BLKIN
1564 if (trace) {
1565 trace.event(get_state_name());
1566 }
1567#endif
1568 }
1569 inline state_t get_state() {
1570 return state;
1571 }
1572
11fdf7f2
TL
1573 CollectionRef ch;
1574 OpSequencerRef osr; // this should be ch->osr
7c673cae
FG
1575 boost::intrusive::list_member_hook<> sequencer_item;
1576
9f95a23c 1577 uint64_t bytes = 0, ios = 0, cost = 0;
7c673cae 1578
f67539c2
TL
1579 std::set<OnodeRef> onodes; ///< these need to be updated/written
1580 std::set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
1581
1582 // A map from onode to a vector of object offset. For new objects created
1583 // in the transaction we append the new offset to the vector, for
1584 // overwritten objects we append the negative of the previous ondisk offset
1585 // followed by the new offset, and for truncated objects we append the
1586 // negative of the previous ondisk offset. We need to maintain a vector of
1587 // offsets because *within the same transaction* an object may be truncated
1588 // and then written again, or an object may be overwritten multiple times to
1589 // different zones. See update_cleaning_metadata function for how this map
1590 // is used.
1591 std::map<OnodeRef, std::vector<int64_t>> zoned_onode_to_offset_map;
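    // Worked example (illustrative, not upstream code), following the rules in
    // the comment above and the zoned_note_* helpers below: an object created
    // at ondisk offset 0x1000 and then overwritten at 0x3000 within the same
    // transaction ends up with the vector
    //
    //   { 0x1000, -0x1000, 0x3000 }
    //
    // and a subsequent truncate in the same transaction appends -0x3000.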
1592
1593 std::set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
1594 std::set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
7c673cae
FG
1595
1596 KeyValueDB::Transaction t; ///< then we will commit this
f67539c2
TL
1597 std::list<Context*> oncommits; ///< more commit completions
1598 std::list<CollectionRef> removed_collections; ///< colls we removed
7c673cae
FG
1599
1600 boost::intrusive::list_member_hook<> deferred_queue_item;
1601 bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
1602
1603 interval_set<uint64_t> allocated, released;
11fdf7f2
TL
1604 volatile_statfs statfs_delta; ///< overall store statistics delta
1605 uint64_t osd_pool_id = META_POOL_ID; ///< osd pool id we're operating on
f67539c2 1606
7c673cae
FG
1607 IOContext ioc;
1608 bool had_ios = false; ///< true if we submitted IOs before our kv txn
1609
7c673cae 1610 uint64_t seq = 0;
f67539c2
TL
1611 ceph::mono_clock::time_point start;
1612 ceph::mono_clock::time_point last_stamp;
7c673cae
FG
1613
1614 uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated
1615 uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated
1616
9f95a23c
TL
1617#if defined(WITH_LTTNG)
1618 bool tracing = false;
1619#endif
1620
f67539c2
TL
1621#ifdef WITH_BLKIN
1622 ZTracer::Trace trace;
1623#endif
1624
11fdf7f2 1625 explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
f67539c2 1626 std::list<Context*> *on_commits)
11fdf7f2
TL
1627 : ch(c),
1628 osr(o),
7c673cae 1629 ioc(cct, this),
f67539c2 1630 start(ceph::mono_clock::now()) {
7c673cae 1631 last_stamp = start;
11fdf7f2
TL
1632 if (on_commits) {
1633 oncommits.swap(*on_commits);
1634 }
7c673cae
FG
1635 }
1636 ~TransContext() {
f67539c2
TL
1637#ifdef WITH_BLKIN
1638 if (trace) {
1639 trace.event("txc destruct");
1640 }
1641#endif
7c673cae
FG
1642 delete deferred_txn;
1643 }
1644
1645 void write_onode(OnodeRef &o) {
1646 onodes.insert(o);
1647 }
1648 void write_shared_blob(SharedBlobRef &sb) {
1649 shared_blobs.insert(sb);
1650 }
31f18b77
FG
1651 void unshare_blob(SharedBlob *sb) {
1652 shared_blobs.erase(sb);
1653 }
1654
7c673cae
FG
1655 /// note we logically modified object (when onode itself is unmodified)
1656 void note_modified_object(OnodeRef &o) {
1657 // onode itself isn't written, though
1658 modified_objects.insert(o);
1659 }
a8e16298 1660 void note_removed_object(OnodeRef& o) {
a8e16298 1661 modified_objects.insert(o);
f67539c2
TL
1662 onodes.erase(o);
1663 }
1664
1665 void zoned_note_new_object(OnodeRef &o) {
1666 auto [_, ok] = zoned_onode_to_offset_map.emplace(
1667 std::pair<OnodeRef, std::vector<int64_t>>(o, {o->zoned_get_ondisk_starting_offset()}));
1668 ceph_assert(ok);
1669 }
1670
1671 void zoned_note_updated_object(OnodeRef &o, int64_t prev_offset) {
1672 int64_t new_offset = o->zoned_get_ondisk_starting_offset();
1673 auto [it, ok] = zoned_onode_to_offset_map.emplace(
1674 std::pair<OnodeRef, std::vector<int64_t>>(o, {-prev_offset, new_offset}));
1675 if (!ok) {
1676 it->second.push_back(-prev_offset);
1677 it->second.push_back(new_offset);
1678 }
1679 }
1680
1681 void zoned_note_truncated_object(OnodeRef &o, int64_t offset) {
1682 auto [it, ok] = zoned_onode_to_offset_map.emplace(
1683 std::pair<OnodeRef, std::vector<int64_t>>(o, {-offset}));
1684 if (!ok) {
1685 it->second.push_back(-offset);
1686 }
7c673cae
FG
1687 }
1688
1689 void aio_finish(BlueStore *store) override {
1690 store->txc_aio_finish(this);
1691 }
f67539c2
TL
1692 private:
1693 state_t state = STATE_PREPARE;
7c673cae
FG
1694 };
1695
9f95a23c
TL
1696 class BlueStoreThrottle {
1697#if defined(WITH_LTTNG)
f67539c2 1698 const std::chrono::time_point<ceph::mono_clock> time_base = ceph::mono_clock::now();
9f95a23c
TL
1699
1700 // Time of last chosen io (microseconds)
1701 std::atomic<uint64_t> previous_emitted_tp_time_mono_mcs = {0};
1702 std::atomic<uint64_t> ios_started_since_last_traced = {0};
1703 std::atomic<uint64_t> ios_completed_since_last_traced = {0};
1704
1705 std::atomic_uint pending_kv_ios = {0};
1706 std::atomic_uint pending_deferred_ios = {0};
1707
1708 // Min period between trace points (microseconds)
1709 std::atomic<uint64_t> trace_period_mcs = {0};
1710
1711 bool should_trace(
1712 uint64_t *started,
1713 uint64_t *completed) {
1714 uint64_t min_period_mcs = trace_period_mcs.load(
1715 std::memory_order_relaxed);
1716
1717 if (min_period_mcs == 0) {
1718 *started = 1;
1719 *completed = ios_completed_since_last_traced.exchange(0);
1720 return true;
1721 } else {
1722 ios_started_since_last_traced++;
1723 auto now_mcs = ceph::to_microseconds<uint64_t>(
f67539c2 1724 ceph::mono_clock::now() - time_base);
9f95a23c
TL
1725 uint64_t previous_mcs = previous_emitted_tp_time_mono_mcs;
1726 uint64_t period_mcs = now_mcs - previous_mcs;
1727 if (period_mcs > min_period_mcs) {
1728 if (previous_emitted_tp_time_mono_mcs.compare_exchange_strong(
1729 previous_mcs, now_mcs)) {
1730 // This would be racy at a sufficiently extreme trace rate, but isn't
1731 // worth the overhead of doing it more carefully.
1732 *started = ios_started_since_last_traced.exchange(0);
1733 *completed = ios_completed_since_last_traced.exchange(0);
1734 return true;
1735 }
1736 }
1737 return false;
1738 }
1739 }
1740#endif
1741
1742#if defined(WITH_LTTNG)
1743 void emit_initial_tracepoint(
1744 KeyValueDB &db,
1745 TransContext &txc,
f67539c2 1746 ceph::mono_clock::time_point);
9f95a23c
TL
1747#else
1748 void emit_initial_tracepoint(
1749 KeyValueDB &db,
1750 TransContext &txc,
f67539c2 1751 ceph::mono_clock::time_point) {}
9f95a23c
TL
1752#endif
1753
1754 Throttle throttle_bytes; ///< submit to commit
1755 Throttle throttle_deferred_bytes; ///< submit to deferred complete
1756
1757 public:
1758 BlueStoreThrottle(CephContext *cct) :
1759 throttle_bytes(cct, "bluestore_throttle_bytes", 0),
1760 throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0)
1761 {
1762 reset_throttle(cct->_conf);
1763 }
1764
1765#if defined(WITH_LTTNG)
1766 void complete_kv(TransContext &txc);
1767 void complete(TransContext &txc);
1768#else
1769 void complete_kv(TransContext &txc) {}
1770 void complete(TransContext &txc) {}
1771#endif
1772
f67539c2 1773 ceph::mono_clock::duration log_state_latency(
9f95a23c
TL
1774 TransContext &txc, PerfCounters *logger, int state);
1775 bool try_start_transaction(
1776 KeyValueDB &db,
1777 TransContext &txc,
f67539c2 1778 ceph::mono_clock::time_point);
9f95a23c
TL
1779 void finish_start_transaction(
1780 KeyValueDB &db,
1781 TransContext &txc,
f67539c2 1782 ceph::mono_clock::time_point);
9f95a23c
TL
1783 void release_kv_throttle(uint64_t cost) {
1784 throttle_bytes.put(cost);
1785 }
1786 void release_deferred_throttle(uint64_t cost) {
1787 throttle_deferred_bytes.put(cost);
1788 }
1789 bool should_submit_deferred() {
1790 return throttle_deferred_bytes.past_midpoint();
1791 }
1792 void reset_throttle(const ConfigProxy &conf) {
1793 throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
1794 throttle_deferred_bytes.reset_max(
1795 conf->bluestore_throttle_bytes +
1796 conf->bluestore_throttle_deferred_bytes);
1797#if defined(WITH_LTTNG)
1798 double rate = conf.get_val<double>("bluestore_throttle_trace_rate");
1799 trace_period_mcs = rate > 0 ? floor((1/rate) * 1000000.0) : 0;
1800#endif
1801 }
1802 } throttle;
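 // Illustrative arithmetic for the LTTNG trace throttling above (a sketch,
 // not part of the build): reset_throttle() converts the configured
 // bluestore_throttle_trace_rate into the minimal period checked by
 // should_trace(). For a rate of 50 traces/sec:
 //
 //   trace_period_mcs = floor((1.0 / 50) * 1000000.0) = 20000
 //
 // so at most one transaction tracepoint is emitted every 20 ms, with the
 // started/completed io counters accumulated and flushed at that point.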
1803
7c673cae
FG
1804 typedef boost::intrusive::list<
1805 TransContext,
1806 boost::intrusive::member_hook<
1807 TransContext,
1808 boost::intrusive::list_member_hook<>,
1809 &TransContext::deferred_queue_item> > deferred_queue_t;
1810
11fdf7f2 1811 struct DeferredBatch final : public AioContext {
7c673cae
FG
1812 OpSequencer *osr;
1813 struct deferred_io {
f67539c2 1814 ceph::buffer::list bl; ///< data
7c673cae
FG
1815 uint64_t seq; ///< deferred transaction seq
1816 };
f67539c2 1817 std::map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
7c673cae
FG
1818 deferred_queue_t txcs; ///< txcs in this batch
1819 IOContext ioc; ///< our aios
1820 /// bytes of pending io for each deferred seq (may be 0)
f67539c2 1821 std::map<uint64_t,int> seq_bytes;
7c673cae
FG
1822
1823 void _discard(CephContext *cct, uint64_t offset, uint64_t length);
1824 void _audit(CephContext *cct);
1825
1826 DeferredBatch(CephContext *cct, OpSequencer *osr)
1827 : osr(osr), ioc(cct, this) {}
1828
1829 /// prepare a write
1830 void prepare_write(CephContext *cct,
1831 uint64_t seq, uint64_t offset, uint64_t length,
f67539c2 1832 ceph::buffer::list::const_iterator& p);
7c673cae
FG
1833
1834 void aio_finish(BlueStore *store) override {
1835 store->_deferred_aio_finish(osr);
1836 }
1837 };
1838
11fdf7f2 1839 class OpSequencer : public RefCountedObject {
7c673cae 1840 public:
11fdf7f2
TL
1841 ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
1842 ceph::condition_variable qcond;
7c673cae
FG
1843 typedef boost::intrusive::list<
1844 TransContext,
1845 boost::intrusive::member_hook<
1846 TransContext,
1847 boost::intrusive::list_member_hook<>,
1848 &TransContext::sequencer_item> > q_list_t;
1849 q_list_t q; ///< transactions
1850
1851 boost::intrusive::list_member_hook<> deferred_osr_queue_item;
1852
1853 DeferredBatch *deferred_running = nullptr;
1854 DeferredBatch *deferred_pending = nullptr;
1855
f67539c2
TL
1856 ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::OpSequencer::deferred_lock");
1857
7c673cae 1858 BlueStore *store;
11fdf7f2 1859 coll_t cid;
7c673cae
FG
1860
1861 uint64_t last_seq = 0;
1862
1863 std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
1864
1865 std::atomic_int kv_committing_serially = {0};
1866
1867 std::atomic_int kv_submitted_waiters = {0};
1868
f67539c2 1869 std::atomic_bool zombie = {false}; ///< in zombie_osr_set (collection going away)
7c673cae 1870
9f95a23c
TL
1871 const uint32_t sequencer_id;
1872
1873 uint32_t get_sequencer_id() const {
1874 return sequencer_id;
7c673cae
FG
1875 }
1876
1877 void queue_new(TransContext *txc) {
11fdf7f2 1878 std::lock_guard l(qlock);
7c673cae
FG
1879 txc->seq = ++last_seq;
1880 q.push_back(*txc);
1881 }
1882
1883 void drain() {
11fdf7f2 1884 std::unique_lock l(qlock);
7c673cae
FG
1885 while (!q.empty())
1886 qcond.wait(l);
1887 }
1888
1889 void drain_preceding(TransContext *txc) {
11fdf7f2 1890 std::unique_lock l(qlock);
9f95a23c 1891 while (&q.front() != txc)
7c673cae
FG
1892 qcond.wait(l);
1893 }
1894
1895 bool _is_all_kv_submitted() {
11fdf7f2
TL
 1896 // caller must hold qlock and q must not be empty
1897 ceph_assert(!q.empty());
7c673cae 1898 TransContext *txc = &q.back();
f67539c2 1899 if (txc->get_state() >= TransContext::STATE_KV_SUBMITTED) {
7c673cae
FG
1900 return true;
1901 }
1902 return false;
1903 }
1904
11fdf7f2
TL
1905 void flush() {
1906 std::unique_lock l(qlock);
1907 while (true) {
f67539c2 1908 // set flag before the check because the condition
11fdf7f2
TL
1909 // may become true outside qlock, and we need to make
1910 // sure those threads see waiters and signal qcond.
1911 ++kv_submitted_waiters;
1912 if (q.empty() || _is_all_kv_submitted()) {
1913 --kv_submitted_waiters;
1914 return;
1915 }
1916 qcond.wait(l);
1917 --kv_submitted_waiters;
1918 }
1919 }
1920
1921 void flush_all_but_last() {
1922 std::unique_lock l(qlock);
 1923 assert(q.size() >= 1);
7c673cae 1924 while (true) {
f67539c2 1925 // set flag before the check because the condition
7c673cae
FG
1926 // may become true outside qlock, and we need to make
1927 // sure those threads see waiters and signal qcond.
1928 ++kv_submitted_waiters;
11fdf7f2
TL
1929 if (q.size() <= 1) {
1930 --kv_submitted_waiters;
7c673cae 1931 return;
11fdf7f2
TL
1932 } else {
1933 auto it = q.rbegin();
1934 it++;
f67539c2 1935 if (it->get_state() >= TransContext::STATE_KV_SUBMITTED) {
eafe8130 1936 --kv_submitted_waiters;
11fdf7f2
TL
1937 return;
1938 }
7c673cae
FG
1939 }
1940 qcond.wait(l);
1941 --kv_submitted_waiters;
1942 }
f67539c2 1943 }
7c673cae 1944
11fdf7f2
TL
1945 bool flush_commit(Context *c) {
1946 std::lock_guard l(qlock);
7c673cae
FG
1947 if (q.empty()) {
1948 return true;
1949 }
1950 TransContext *txc = &q.back();
f67539c2 1951 if (txc->get_state() >= TransContext::STATE_KV_DONE) {
7c673cae
FG
1952 return true;
1953 }
1954 txc->oncommits.push_back(c);
1955 return false;
1956 }
9f95a23c
TL
1957 private:
1958 FRIEND_MAKE_REF(OpSequencer);
1959 OpSequencer(BlueStore *store, uint32_t sequencer_id, const coll_t& c)
1960 : RefCountedObject(store->cct),
1961 store(store), cid(c), sequencer_id(sequencer_id) {
1962 }
1963 ~OpSequencer() {
1964 ceph_assert(q.empty());
1965 }
7c673cae
FG
1966 };
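 // A minimal usage sketch for the flush/flush_commit pair above (hypothetical
 // caller, not in this file): flush() blocks until every queued TransContext
 // has reached at least STATE_KV_SUBMITTED, while flush_commit() either
 // reports that the newest txc is already past STATE_KV_DONE or parks the
 // supplied Context on that txc's oncommits list to fire later, e.g.
 //
 //   if (!osr->flush_commit(on_commit_ctx)) {
 //     // on_commit_ctx will be completed asynchronously once the last
 //     // transaction in this sequencer reaches STATE_KV_DONE.
 //   }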
1967
1968 typedef boost::intrusive::list<
1969 OpSequencer,
1970 boost::intrusive::member_hook<
1971 OpSequencer,
1972 boost::intrusive::list_member_hook<>,
1973 &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
1974
1975 struct KVSyncThread : public Thread {
1976 BlueStore *store;
1977 explicit KVSyncThread(BlueStore *s) : store(s) {}
1978 void *entry() override {
1979 store->_kv_sync_thread();
1980 return NULL;
1981 }
1982 };
31f18b77
FG
1983 struct KVFinalizeThread : public Thread {
1984 BlueStore *store;
1985 explicit KVFinalizeThread(BlueStore *s) : store(s) {}
f67539c2 1986 void *entry() override {
31f18b77
FG
1987 store->_kv_finalize_thread();
1988 return NULL;
1989 }
1990 };
f67539c2
TL
1991 struct ZonedCleanerThread : public Thread {
1992 BlueStore *store;
1993 explicit ZonedCleanerThread(BlueStore *s) : store(s) {}
1994 void *entry() override {
1995 store->_zoned_cleaner_thread();
1996 return nullptr;
1997 }
1998 };
7c673cae
FG
1999
2000 struct DBHistogram {
2001 struct value_dist {
2002 uint64_t count;
2003 uint32_t max_len;
2004 };
2005
2006 struct key_dist {
2007 uint64_t count;
2008 uint32_t max_len;
f67539c2 2009 std::map<int, struct value_dist> val_map; ///< value size slab id to count and max value length
7c673cae
FG
2010 };
2011
f67539c2
TL
2012 std::map<std::string, std::map<int, struct key_dist> > key_hist;
2013 std::map<int, uint64_t> value_hist;
7c673cae 2014 int get_key_slab(size_t sz);
f67539c2 2015 std::string get_key_slab_to_range(int slab);
7c673cae 2016 int get_value_slab(size_t sz);
f67539c2
TL
2017 std::string get_value_slab_to_range(int slab);
2018 void update_hist_entry(std::map<std::string, std::map<int, struct key_dist> > &key_hist,
2019 const std::string &prefix, size_t key_size, size_t value_size);
2020 void dump(ceph::Formatter *f);
2021 };
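 // Illustrative call shape for the histogram above (values are hypothetical;
 // the slab boundaries themselves live in BlueStore.cc): for every KV pair a
 // caller invokes
 //
 //   update_hist_entry(key_hist, prefix, key.size(), value.size());
 //
 // so key_hist[prefix][get_key_slab(key.size())] accumulates the number of
 // keys whose size falls into that slab together with the largest value seen
 // for it, while value_hist[get_value_slab(value.size())] counts values per slab.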
2022
2023 struct BigDeferredWriteContext {
2024 uint64_t off = 0; // original logical offset
2025 uint32_t b_off = 0; // blob relative offset
2026 uint32_t used = 0;
2027 uint64_t head_read = 0;
2028 uint64_t tail_read = 0;
2029 BlobRef blob_ref;
2030 uint64_t blob_start = 0;
2031 PExtentVector res_extents;
2032
2033 inline uint64_t blob_aligned_len() const {
2034 return used + head_read + tail_read;
2035 }
2036
2037 bool can_defer(BlueStore::extent_map_t::iterator ep,
2038 uint64_t prefer_deferred_size,
2039 uint64_t block_size,
2040 uint64_t offset,
2041 uint64_t l);
2042 bool apply_defer();
7c673cae
FG
2043 };
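 // Worked example for blob_aligned_len() above (numbers are illustrative):
 // with a 4 KiB block size, deferring a 6 KiB overwrite that starts 1 KiB into
 // an aligned region gives used = 0x1800, head_read = 0x400 and
 // tail_read = 0x400, so blob_aligned_len() = 0x2000, i.e. the two full blocks
 // the deferred write must cover once the head/tail padding has been read back in.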
2044
2045 // --------------------------------------------------------
2046 // members
2047private:
2048 BlueFS *bluefs = nullptr;
9f95a23c 2049 bluefs_layout_t bluefs_layout;
11fdf7f2 2050 utime_t next_dump_on_bluefs_alloc_failure;
7c673cae
FG
2051
2052 KeyValueDB *db = nullptr;
2053 BlockDevice *bdev = nullptr;
2054 std::string freelist_type;
2055 FreelistManager *fm = nullptr;
f67539c2
TL
2056
2057 bluefs_shared_alloc_context_t shared_alloc;
2058
7c673cae
FG
2059 uuid_d fsid;
2060 int path_fd = -1; ///< open handle to $path
2061 int fsid_fd = -1; ///< open handle (locked) to $path/fsid
2062 bool mounted = false;
2063
9f95a23c 2064 ceph::shared_mutex coll_lock = ceph::make_shared_mutex("BlueStore::coll_lock"); ///< rwlock to protect coll_map
31f18b77 2065 mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
eafe8130 2066 bool collections_had_errors = false;
f67539c2 2067 std::map<coll_t,CollectionRef> new_coll_map;
7c673cae 2068
f67539c2
TL
2069 std::vector<OnodeCacheShard*> onode_cache_shards;
2070 std::vector<BufferCacheShard*> buffer_cache_shards;
7c673cae 2071
11fdf7f2
TL
2072 /// protect zombie_osr_set
2073 ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
9f95a23c 2074 uint32_t next_sequencer_id = 0;
f67539c2 2075 std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections
7c673cae
FG
2076
2077 std::atomic<uint64_t> nid_last = {0};
2078 std::atomic<uint64_t> nid_max = {0};
2079 std::atomic<uint64_t> blobid_last = {0};
2080 std::atomic<uint64_t> blobid_max = {0};
2081
11fdf7f2 2082 ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
f67539c2
TL
2083 ceph::mutex atomic_alloc_and_submit_lock =
2084 ceph::make_mutex("BlueStore::atomic_alloc_and_submit_lock");
7c673cae
FG
2085 std::atomic<uint64_t> deferred_seq = {0};
2086 deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
f67539c2
TL
2087 std::atomic_int deferred_queue_size = {0}; ///< num txc's queued across all osrs
2088 std::atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
9f95a23c
TL
2089 Finisher finisher;
2090 utime_t deferred_last_submitted = utime_t();
7c673cae
FG
2091
2092 KVSyncThread kv_sync_thread;
11fdf7f2
TL
2093 ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
2094 ceph::condition_variable kv_cond;
3efd9988 2095 bool _kv_only = false;
31f18b77 2096 bool kv_sync_started = false;
7c673cae 2097 bool kv_stop = false;
31f18b77
FG
2098 bool kv_finalize_started = false;
2099 bool kv_finalize_stop = false;
f67539c2
TL
2100 std::deque<TransContext*> kv_queue; ///< ready, already submitted
2101 std::deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
2102 std::deque<TransContext*> kv_committing; ///< currently syncing
2103 std::deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
9f95a23c 2104 bool kv_sync_in_progress = false;
7c673cae 2105
31f18b77 2106 KVFinalizeThread kv_finalize_thread;
11fdf7f2
TL
2107 ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
2108 ceph::condition_variable kv_finalize_cond;
f67539c2
TL
2109 std::deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
2110 std::deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
9f95a23c 2111 bool kv_finalize_in_progress = false;
31f18b77 2112
f67539c2
TL
2113 ZonedCleanerThread zoned_cleaner_thread;
2114 ceph::mutex zoned_cleaner_lock = ceph::make_mutex("BlueStore::zoned_cleaner_lock");
2115 ceph::condition_variable zoned_cleaner_cond;
2116 bool zoned_cleaner_started = false;
2117 bool zoned_cleaner_stop = false;
2118 std::deque<uint64_t> zoned_cleaner_queue;
2119
7c673cae
FG
2120 PerfCounters *logger = nullptr;
2121
f67539c2 2122 std::list<CollectionRef> removed_collections;
7c673cae 2123
9f95a23c
TL
2124 ceph::shared_mutex debug_read_error_lock =
2125 ceph::make_shared_mutex("BlueStore::debug_read_error_lock");
f67539c2
TL
2126 std::set<ghobject_t> debug_data_error_objects;
2127 std::set<ghobject_t> debug_mdata_error_objects;
7c673cae
FG
2128
2129 std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
2130
2131 uint64_t block_size = 0; ///< block size of block device (power of 2)
2132 uint64_t block_mask = 0; ///< mask to get just the block offset
2133 size_t block_size_order = 0; ///< bits to shift to get block size
2134
9f95a23c 2135 uint64_t min_alloc_size; ///< minimum allocation unit (power of 2)
7c673cae 2136 ///< bits for min_alloc_size
224ce89b 2137 uint8_t min_alloc_size_order = 0;
7c673cae
FG
2138 static_assert(std::numeric_limits<uint8_t>::max() >
2139 std::numeric_limits<decltype(min_alloc_size)>::digits,
2140 "not enough bits for min_alloc_size");
2141
f67539c2
TL
2142 enum {
2143 // Please preserve the order since it's DB persistent
2144 OMAP_BULK = 0,
2145 OMAP_PER_POOL = 1,
2146 OMAP_PER_PG = 2,
2147 } per_pool_omap = OMAP_BULK;
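 // Rough meaning of the modes above (a summary; the omap key encoding in
 // BlueStore.cc is authoritative): OMAP_BULK keeps the legacy scheme where
 // omap keys are prefixed only by the onode's omap id, while OMAP_PER_POOL
 // and OMAP_PER_PG additionally embed the pool id (and the pg hash,
 // respectively) so per-pool/per-PG omap usage can be accounted for and
 // removed more efficiently.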
9f95a23c 2148
7c673cae
FG
2149 ///< maximum allocation unit (power of 2)
2150 std::atomic<uint64_t> max_alloc_size = {0};
2151
224ce89b
WB
2152 ///< number threshold for forced deferred writes
2153 std::atomic<int> deferred_batch_ops = {0};
2154
2155 ///< size threshold for forced deferred writes
2156 std::atomic<uint64_t> prefer_deferred_size = {0};
2157
7c673cae
FG
2158 ///< approx cost per io, in bytes
2159 std::atomic<uint64_t> throttle_cost_per_io = {0};
2160
224ce89b
WB
2161 std::atomic<Compressor::CompressionMode> comp_mode =
2162 {Compressor::COMP_NONE}; ///< compression mode
7c673cae
FG
2163 CompressorRef compressor;
2164 std::atomic<uint64_t> comp_min_blob_size = {0};
2165 std::atomic<uint64_t> comp_max_blob_size = {0};
2166
2167 std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
2168
31f18b77
FG
2169 uint64_t kv_ios = 0;
2170 uint64_t kv_throttle_costs = 0;
2171
7c673cae 2172 // cache trim control
91327a77
AA
2173 uint64_t cache_size = 0; ///< total cache size
2174 double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
2175 double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
f67539c2 2176 double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF)
91327a77
AA
2177 double cache_data_ratio = 0; ///< cache ratio dedicated to object data
2178 bool cache_autotune = false; ///< cache autotune setting
91327a77
AA
2179 double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
2180 uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
2181 uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
2182 double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
11fdf7f2 2183 uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
91327a77 2184 double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
9f95a23c 2185 double max_defer_interval = 0; ///< Max time since the last deferred submit before forcing one
92f5a8d4 2186 std::atomic<uint32_t> config_changed = {0}; ///< Counter to determine if there is a configuration change.
11fdf7f2 2187
f67539c2 2188 typedef std::map<uint64_t, volatile_statfs> osd_pools_map;
11fdf7f2
TL
2189
2190 ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
31f18b77 2191 volatile_statfs vstatfs;
11fdf7f2
TL
2192 osd_pools_map osd_pools; // protected by vstatfs_lock as well
2193
2194 bool per_pool_stat_collection = true;
7c673cae
FG
2195
2196 struct MempoolThread : public Thread {
91327a77 2197 public:
7c673cae 2198 BlueStore *store;
91327a77 2199
11fdf7f2
TL
2200 ceph::condition_variable cond;
2201 ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
7c673cae 2202 bool stop = false;
11fdf7f2 2203 std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
f67539c2 2204 std::shared_ptr<PriorityCache::PriCache> binned_kv_onode_cache = nullptr;
eafe8130 2205 std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
91327a77
AA
2206
2207 struct MempoolCache : public PriorityCache::PriCache {
2208 BlueStore *store;
11fdf7f2
TL
2209 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
2210 int64_t committed_bytes = 0;
91327a77
AA
2211 double cache_ratio = 0;
2212
2213 MempoolCache(BlueStore *s) : store(s) {};
2214
2215 virtual uint64_t _get_used_bytes() const = 0;
2216
2217 virtual int64_t request_cache_bytes(
11fdf7f2 2218 PriorityCache::Priority pri, uint64_t total_cache) const {
91327a77
AA
2219 int64_t assigned = get_cache_bytes(pri);
2220
2221 switch (pri) {
eafe8130
TL
2222 // All cache items are currently shoved into the PRI1 priority
2223 case PriorityCache::Priority::PRI1:
91327a77 2224 {
11fdf7f2 2225 int64_t request = _get_used_bytes();
91327a77
AA
 2226 return (request > assigned) ? request - assigned : 0;
2227 }
2228 default:
2229 break;
2230 }
2231 return -EOPNOTSUPP;
2232 }
2233
2234 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
2235 return cache_bytes[pri];
2236 }
2237 virtual int64_t get_cache_bytes() const {
2238 int64_t total = 0;
2239
2240 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
2241 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
2242 total += get_cache_bytes(pri);
2243 }
2244 return total;
2245 }
2246 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2247 cache_bytes[pri] = bytes;
2248 }
2249 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
2250 cache_bytes[pri] += bytes;
2251 }
11fdf7f2
TL
2252 virtual int64_t commit_cache_size(uint64_t total_cache) {
2253 committed_bytes = PriorityCache::get_chunk(
2254 get_cache_bytes(), total_cache);
2255 return committed_bytes;
2256 }
2257 virtual int64_t get_committed_size() const {
2258 return committed_bytes;
91327a77
AA
2259 }
2260 virtual double get_cache_ratio() const {
2261 return cache_ratio;
2262 }
2263 virtual void set_cache_ratio(double ratio) {
2264 cache_ratio = ratio;
2265 }
f67539c2 2266 virtual std::string get_cache_name() const = 0;
91327a77
AA
2267 };
2268
2269 struct MetaCache : public MempoolCache {
2270 MetaCache(BlueStore *s) : MempoolCache(s) {};
2271
2272 virtual uint64_t _get_used_bytes() const {
f91f0fd5
TL
2273 return mempool::bluestore_Buffer::allocated_bytes() +
2274 mempool::bluestore_Blob::allocated_bytes() +
2275 mempool::bluestore_Extent::allocated_bytes() +
2276 mempool::bluestore_cache_meta::allocated_bytes() +
2277 mempool::bluestore_cache_other::allocated_bytes() +
2278 mempool::bluestore_cache_onode::allocated_bytes() +
2279 mempool::bluestore_SharedBlob::allocated_bytes() +
2280 mempool::bluestore_inline_bl::allocated_bytes();
91327a77
AA
2281 }
2282
f67539c2 2283 virtual std::string get_cache_name() const {
91327a77
AA
2284 return "BlueStore Meta Cache";
2285 }
2286
2287 uint64_t _get_num_onodes() const {
2288 uint64_t onode_num =
2289 mempool::bluestore_cache_onode::allocated_items();
2290 return (2 > onode_num) ? 2 : onode_num;
2291 }
2292
2293 double get_bytes_per_onode() const {
2294 return (double)_get_used_bytes() / (double)_get_num_onodes();
2295 }
11fdf7f2
TL
2296 };
2297 std::shared_ptr<MetaCache> meta_cache;
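 // Illustrative accounting for MetaCache above (numbers are hypothetical):
 // with ~100 MiB of bluestore metadata mempool usage and 50k cached onodes,
 // get_bytes_per_onode() reports roughly 2 KiB per onode, which the tuner can
 // use to translate a byte budget into an onode count when resizing shards.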
91327a77
AA
2298
2299 struct DataCache : public MempoolCache {
2300 DataCache(BlueStore *s) : MempoolCache(s) {};
2301
2302 virtual uint64_t _get_used_bytes() const {
2303 uint64_t bytes = 0;
9f95a23c
TL
2304 for (auto i : store->buffer_cache_shards) {
2305 bytes += i->_get_bytes();
91327a77
AA
2306 }
2307 return bytes;
2308 }
f67539c2 2309 virtual std::string get_cache_name() const {
91327a77
AA
2310 return "BlueStore Data Cache";
2311 }
11fdf7f2
TL
2312 };
2313 std::shared_ptr<DataCache> data_cache;
91327a77 2314
7c673cae
FG
2315 public:
2316 explicit MempoolThread(BlueStore *s)
2317 : store(s),
11fdf7f2
TL
2318 meta_cache(new MetaCache(s)),
2319 data_cache(new DataCache(s)) {}
91327a77 2320
7c673cae
FG
2321 void *entry() override;
2322 void init() {
11fdf7f2 2323 ceph_assert(stop == false);
7c673cae
FG
2324 create("bstore_mempool");
2325 }
2326 void shutdown() {
11fdf7f2 2327 lock.lock();
7c673cae 2328 stop = true;
11fdf7f2
TL
2329 cond.notify_all();
2330 lock.unlock();
7c673cae
FG
2331 join();
2332 }
91327a77
AA
2333
2334 private:
2335 void _adjust_cache_settings();
92f5a8d4 2336 void _update_cache_settings();
9f95a23c 2337 void _resize_shards(bool interval_stats);
7c673cae
FG
2338 } mempool_thread;
2339
f67539c2
TL
2340#ifdef WITH_BLKIN
2341 ZTracer::Endpoint trace_endpoint {"0.0.0.0", 0, "BlueStore"};
2342#endif
2343
7c673cae
FG
2344 // --------------------------------------------------------
2345 // private methods
2346
2347 void _init_logger();
2348 void _shutdown_logger();
2349 int _reload_logger();
2350
2351 int _open_path();
2352 void _close_path();
2353 int _open_fsid(bool create);
2354 int _lock_fsid();
2355 int _read_fsid(uuid_d *f);
2356 int _write_fsid();
2357 void _close_fsid();
2358 void _set_alloc_sizes();
2359 void _set_blob_size();
1adf2230 2360 void _set_finisher_num();
9f95a23c 2361 void _set_per_pool_omap();
92f5a8d4 2362 void _update_osd_memory_options();
7c673cae
FG
2363
2364 int _open_bdev(bool create);
11fdf7f2
TL
 2365 // Verifies that disk space is sufficient for the reserved area plus the
 2366 // minimal BlueFS allocation, and alters the latter if needed.
 2367 // Depends on min_alloc_size, hence it should be called after
 2368 // min_alloc_size is initialized (and outside of _open_bdev).
2369 void _validate_bdev();
7c673cae 2370 void _close_bdev();
11fdf7f2
TL
2371
2372 int _minimal_open_bluefs(bool create);
2373 void _minimal_close_bluefs();
f67539c2 2374 int _open_bluefs(bool create, bool read_only);
1911f103 2375 void _close_bluefs(bool cold_close);
11fdf7f2 2376
11fdf7f2
TL
2377 int _is_bluefs(bool create, bool* ret);
2378 /*
 2379 * opens both the DB and the dependent super_meta, FreelistManager and allocator
2380 * in the proper order
2381 */
f67539c2 2382 int _open_db_and_around(bool read_only, bool to_repair = false);
1911f103 2383 void _close_db_and_around(bool read_only);
11fdf7f2 2384
f67539c2
TL
2385 int _prepare_db_environment(bool create, bool read_only,
2386 std::string* kv_dir, std::string* kv_backend);
11fdf7f2
TL
2387
2388 /*
 2389 * @warning to_repair_db means that we open this db to repair it and will not
 2390 * hold rocksdb's file lock.
2391 */
2392 int _open_db(bool create,
2393 bool to_repair_db=false,
2394 bool read_only = false);
1911f103
TL
2395 void _close_db(bool read_only);
2396 int _open_fm(KeyValueDB::Transaction t, bool read_only);
7c673cae 2397 void _close_fm();
f67539c2
TL
2398 int _write_out_fm_meta(uint64_t target_size);
2399 int _create_alloc();
2400 int _init_alloc();
7c673cae 2401 void _close_alloc();
eafe8130
TL
2402 int _open_collections();
2403 void _fsck_collections(int64_t* errors);
7c673cae
FG
2404 void _close_collections();
2405
f67539c2 2406 int _setup_block_symlink_or_file(std::string name, std::string path, uint64_t size,
7c673cae
FG
2407 bool create);
2408
f67539c2
TL
2409 // Functions related to zoned storage.
2410 uint64_t _zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size);
2411 int _zoned_check_config_settings();
2412 void _zoned_update_cleaning_metadata(TransContext *txc);
2413 std::string _zoned_get_prefix(uint64_t offset);
2414
7c673cae 2415public:
9f95a23c
TL
2416 utime_t get_deferred_last_submitted() {
2417 std::lock_guard l(deferred_lock);
2418 return deferred_last_submitted;
2419 }
2420
3efd9988 2421 static int _write_bdev_label(CephContext* cct,
f67539c2
TL
2422 std::string path, bluestore_bdev_label_t label);
2423 static int _read_bdev_label(CephContext* cct, std::string path,
7c673cae
FG
2424 bluestore_bdev_label_t *label);
2425private:
f67539c2 2426 int _check_or_set_bdev_label(std::string path, uint64_t size, std::string desc,
7c673cae 2427 bool create);
f67539c2 2428 int _set_bdev_label_size(const std::string& path, uint64_t size);
7c673cae
FG
2429
2430 int _open_super_meta();
2431
224ce89b 2432 void _open_statfs();
11fdf7f2 2433 void _get_statfs_overall(struct store_statfs_t *buf);
31f18b77 2434
11fdf7f2
TL
2435 void _dump_alloc_on_failure();
2436
7c673cae
FG
2437 CollectionRef _get_collection(const coll_t& cid);
2438 void _queue_reap_collection(CollectionRef& c);
2439 void _reap_collections();
2440 void _update_cache_logger();
2441
2442 void _assign_nid(TransContext *txc, OnodeRef o);
2443 uint64_t _assign_blobid(TransContext *txc);
2444
81eedcae
TL
2445 template <int LogLevelV>
2446 friend void _dump_onode(CephContext *cct, const Onode& o);
2447 template <int LogLevelV>
2448 friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
2449 template <int LogLevelV>
2450 friend void _dump_transaction(CephContext *cct, Transaction *t);
7c673cae 2451
11fdf7f2 2452 TransContext *_txc_create(Collection *c, OpSequencer *osr,
f67539c2
TL
2453 std::list<Context*> *on_commits,
2454 TrackedOpRef osd_op=TrackedOpRef());
7c673cae
FG
2455 void _txc_update_store_statfs(TransContext *txc);
2456 void _txc_add_transaction(TransContext *txc, Transaction *t);
2457 void _txc_calc_cost(TransContext *txc);
2458 void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
2459 void _txc_state_proc(TransContext *txc);
2460 void _txc_aio_submit(TransContext *txc);
2461public:
2462 void txc_aio_finish(void *p) {
2463 _txc_state_proc(static_cast<TransContext*>(p));
2464 }
2465private:
2466 void _txc_finish_io(TransContext *txc);
2467 void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
9f95a23c 2468 void _txc_apply_kv(TransContext *txc, bool sync_submit_transaction);
7c673cae
FG
2469 void _txc_committed_kv(TransContext *txc);
2470 void _txc_finish(TransContext *txc);
2471 void _txc_release_alloc(TransContext *txc);
2472
11fdf7f2
TL
2473 void _osr_attach(Collection *c);
2474 void _osr_register_zombie(OpSequencer *osr);
2475 void _osr_drain(OpSequencer *osr);
7c673cae
FG
2476 void _osr_drain_preceding(TransContext *txc);
2477 void _osr_drain_all();
7c673cae 2478
31f18b77
FG
2479 void _kv_start();
2480 void _kv_stop();
7c673cae 2481 void _kv_sync_thread();
31f18b77 2482 void _kv_finalize_thread();
7c673cae 2483
f67539c2
TL
2484 void _zoned_cleaner_start();
2485 void _zoned_cleaner_stop();
2486 void _zoned_cleaner_thread();
2487 void _zoned_clean_zone(uint64_t zone_num);
2488
9f95a23c 2489 bluestore_deferred_op_t *_get_deferred_op(TransContext *txc);
7c673cae 2490 void _deferred_queue(TransContext *txc);
3efd9988 2491public:
224ce89b 2492 void deferred_try_submit();
3efd9988 2493private:
224ce89b 2494 void _deferred_submit_unlock(OpSequencer *osr);
7c673cae
FG
2495 void _deferred_aio_finish(OpSequencer *osr);
2496 int _deferred_replay();
2497
2498public:
2499 using mempool_dynamic_bitset =
2500 boost::dynamic_bitset<uint64_t,
2501 mempool::bluestore_fsck::pool_allocator<uint64_t>>;
eafe8130
TL
2502 using per_pool_statfs =
2503 mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;
2504
2505 enum FSCKDepth {
2506 FSCK_REGULAR,
2507 FSCK_DEEP,
2508 FSCK_SHALLOW
2509 };
9f95a23c
TL
2510 enum {
2511 MAX_FSCK_ERROR_LINES = 100,
2512 };
7c673cae
FG
2513
2514private:
2515 int _fsck_check_extents(
11fdf7f2 2516 const coll_t& cid,
7c673cae
FG
2517 const ghobject_t& oid,
2518 const PExtentVector& extents,
2519 bool compressed,
2520 mempool_dynamic_bitset &used_blocks,
b32b8144 2521 uint64_t granularity,
11fdf7f2 2522 BlueStoreRepairer* repairer,
eafe8130
TL
2523 store_statfs_t& expected_statfs,
2524 FSCKDepth depth);
7c673cae 2525
11fdf7f2
TL
2526 void _fsck_check_pool_statfs(
2527 per_pool_statfs& expected_pool_statfs,
eafe8130
TL
2528 int64_t& errors,
2529 int64_t &warnings,
11fdf7f2
TL
2530 BlueStoreRepairer* repairer);
2531
eafe8130
TL
2532 int _fsck(FSCKDepth depth, bool repair);
2533 int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
2534
7c673cae
FG
2535 void _buffer_cache_write(
2536 TransContext *txc,
2537 BlobRef b,
2538 uint64_t offset,
f67539c2 2539 ceph::buffer::list& bl,
7c673cae
FG
2540 unsigned flags) {
2541 b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
2542 flags);
2543 txc->shared_blobs_written.insert(b->shared_blob);
2544 }
2545
2546 int _collection_list(
2547 Collection *c, const ghobject_t& start, const ghobject_t& end,
f67539c2 2548 int max, bool legacy, std::vector<ghobject_t> *ls, ghobject_t *next);
7c673cae
FG
2549
2550 template <typename T, typename F>
2551 T select_option(const std::string& opt_name, T val1, F f) {
2552 //NB: opt_name reserved for future use
2553 boost::optional<T> val2 = f();
2554 if (val2) {
2555 return *val2;
2556 }
2557 return val1;
2558 }
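 // Sketch of how select_option is typically used (identifiers below are
 // illustrative): prefer a per-pool override when the functor yields one,
 // otherwise fall back to the store-wide value, e.g.
 //
 //   auto csum = select_option("csum_type", store_default_csum, [&]() {
 //     boost::optional<int> val;
 //     if (pool_has_override)
 //       val = pool_override;
 //     return val;
 //   });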
2559
2560 void _apply_padding(uint64_t head_pad,
2561 uint64_t tail_pad,
f67539c2 2562 ceph::buffer::list& padded);
7c673cae 2563
11fdf7f2
TL
2564 void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
2565
7c673cae
FG
2566 // -- ondisk version ---
2567public:
1911f103 2568 const int32_t latest_ondisk_format = 4; ///< our version
7c673cae 2569 const int32_t min_readable_ondisk_format = 1; ///< what we can read
9f95a23c 2570 const int32_t min_compat_ondisk_format = 3; ///< who can read us
7c673cae
FG
2571
2572private:
2573 int32_t ondisk_format = 0; ///< value detected on mount
2574
2575 int _upgrade_super(); ///< upgrade (called during open_super)
11fdf7f2 2576 uint64_t _get_ondisk_reserved() const;
7c673cae
FG
2577 void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
2578
2579 // --- public interface ---
2580public:
f67539c2
TL
2581 BlueStore(CephContext *cct, const std::string& path);
2582 BlueStore(CephContext *cct, const std::string& path, uint64_t min_alloc_size); // Ctor for UT only
7c673cae
FG
2583 ~BlueStore() override;
2584
f67539c2 2585 std::string get_type() override {
7c673cae
FG
2586 return "bluestore";
2587 }
2588
2589 bool needs_journal() override { return false; };
2590 bool wants_journal() override { return false; };
2591 bool allows_journal() override { return false; };
2592
9f95a23c
TL
2593 uint64_t get_min_alloc_size() const override {
2594 return min_alloc_size;
2595 }
2596
f67539c2 2597 int get_devices(std::set<std::string> *ls) override;
11fdf7f2 2598
31f18b77 2599 bool is_rotational() override;
d2e6a577 2600 bool is_journal_rotational() override;
31f18b77 2601
f67539c2
TL
2602 std::string get_default_device_class() override {
2603 std::string device_class;
2604 std::map<std::string, std::string> metadata;
224ce89b
WB
2605 collect_metadata(&metadata);
2606 auto it = metadata.find("bluestore_bdev_type");
2607 if (it != metadata.end()) {
2608 device_class = it->second;
2609 }
2610 return device_class;
2611 }
2612
11fdf7f2
TL
2613 int get_numa_node(
2614 int *numa_node,
f67539c2
TL
2615 std::set<int> *nodes,
2616 std::set<std::string> *failed) override;
11fdf7f2 2617
f67539c2 2618 static int get_block_device_fsid(CephContext* cct, const std::string& path,
7c673cae
FG
2619 uuid_d *fsid);
2620
2621 bool test_mount_in_use() override;
2622
2623private:
f67539c2 2624 int _mount();
7c673cae
FG
2625public:
2626 int mount() override {
f67539c2 2627 return _mount();
7c673cae
FG
2628 }
2629 int umount() override;
2630
f67539c2
TL
2631 int open_db_environment(KeyValueDB **pdb, bool to_repair);
2632 int close_db_environment();
7c673cae 2633
3efd9988
FG
2634 int write_meta(const std::string& key, const std::string& value) override;
2635 int read_meta(const std::string& key, std::string *value) override;
2636
f67539c2 2637 // open in read-only and limited mode
eafe8130
TL
2638 int cold_open();
2639 int cold_close();
3efd9988
FG
2640
2641 int fsck(bool deep) override {
eafe8130 2642 return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
3efd9988
FG
2643 }
2644 int repair(bool deep) override {
eafe8130
TL
2645 return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
2646 }
2647 int quick_fix() override {
2648 return _fsck(FSCK_SHALLOW, true);
3efd9988 2649 }
7c673cae
FG
2650
2651 void set_cache_shards(unsigned num) override;
f67539c2 2652 void dump_cache_stats(ceph::Formatter *f) override {
11fdf7f2 2653 int onode_count = 0, buffers_bytes = 0;
9f95a23c
TL
2654 for (auto i: onode_cache_shards) {
2655 onode_count += i->_get_num();
2656 }
2657 for (auto i: buffer_cache_shards) {
2658 buffers_bytes += i->_get_bytes();
11fdf7f2
TL
2659 }
2660 f->dump_int("bluestore_onode", onode_count);
2661 f->dump_int("bluestore_buffers", buffers_bytes);
2662 }
f67539c2 2663 void dump_cache_stats(std::ostream& ss) override {
11fdf7f2 2664 int onode_count = 0, buffers_bytes = 0;
9f95a23c
TL
2665 for (auto i: onode_cache_shards) {
2666 onode_count += i->_get_num();
2667 }
2668 for (auto i: buffer_cache_shards) {
2669 buffers_bytes += i->_get_bytes();
11fdf7f2
TL
2670 }
2671 ss << "bluestore_onode: " << onode_count;
2672 ss << "bluestore_buffers: " << buffers_bytes;
2673 }
7c673cae
FG
2674
2675 int validate_hobject_key(const hobject_t &obj) const override {
2676 return 0;
2677 }
2678 unsigned get_max_attr_name_length() override {
2679 return 256; // arbitrary; there is no real limit internally
2680 }
2681
2682 int mkfs() override;
2683 int mkjournal() override {
2684 return 0;
2685 }
2686
f67539c2
TL
2687 void get_db_statistics(ceph::Formatter *f) override;
2688 void generate_db_histogram(ceph::Formatter *f) override;
f6b5b4d7 2689 void _shutdown_cache();
f67539c2
TL
2690 int flush_cache(std::ostream *os = NULL) override;
2691 void dump_perf_counters(ceph::Formatter *f) override {
7c673cae
FG
2692 f->open_object_section("perf_counters");
2693 logger->dump_formatted(f, false);
2694 f->close_section();
2695 }
2696
f67539c2
TL
2697 int add_new_bluefs_device(int id, const std::string& path);
2698 int migrate_to_existing_bluefs_device(const std::set<int>& devs_source,
11fdf7f2 2699 int id);
f67539c2 2700 int migrate_to_new_bluefs_device(const std::set<int>& devs_source,
11fdf7f2 2701 int id,
f67539c2
TL
2702 const std::string& path);
2703 int expand_devices(std::ostream& out);
2704 std::string get_device_path(unsigned id);
7c673cae 2705
1911f103
TL
 2706 int dump_bluefs_sizes(std::ostream& out);
2707
7c673cae 2708public:
11fdf7f2
TL
2709 int statfs(struct store_statfs_t *buf,
2710 osd_alert_list_t* alerts = nullptr) override;
9f95a23c
TL
2711 int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
2712 bool *per_pool_omap) override;
7c673cae 2713
f67539c2 2714 void collect_metadata(std::map<std::string,std::string> *pm) override;
7c673cae 2715
7c673cae
FG
2716 bool exists(CollectionHandle &c, const ghobject_t& oid) override;
2717 int set_collection_opts(
11fdf7f2 2718 CollectionHandle& c,
7c673cae 2719 const pool_opts_t& opts) override;
7c673cae
FG
2720 int stat(
2721 CollectionHandle &c,
2722 const ghobject_t& oid,
2723 struct stat *st,
2724 bool allow_eio = false) override;
7c673cae
FG
2725 int read(
2726 CollectionHandle &c,
2727 const ghobject_t& oid,
2728 uint64_t offset,
2729 size_t len,
f67539c2 2730 ceph::buffer::list& bl,
224ce89b 2731 uint32_t op_flags = 0) override;
9f95a23c
TL
2732
2733private:
2734
2735 // --------------------------------------------------------
2736 // intermediate data structures used while reading
2737 struct region_t {
2738 uint64_t logical_offset;
2739 uint64_t blob_xoffset; //region offset within the blob
2740 uint64_t length;
2741
2742 // used later in read process
2743 uint64_t front = 0;
2744
2745 region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
2746 : logical_offset(offset),
2747 blob_xoffset(b_offs),
2748 length(len),
2749 front(front){}
2750 region_t(const region_t& from)
2751 : logical_offset(from.logical_offset),
2752 blob_xoffset(from.blob_xoffset),
2753 length(from.length),
2754 front(from.front){}
2755
f67539c2 2756 friend std::ostream& operator<<(std::ostream& out, const region_t& r) {
9f95a23c
TL
2757 return out << "0x" << std::hex << r.logical_offset << ":"
2758 << r.blob_xoffset << "~" << r.length << std::dec;
2759 }
2760 };
2761
2762 // merged blob read request
2763 struct read_req_t {
2764 uint64_t r_off = 0;
2765 uint64_t r_len = 0;
f67539c2 2766 ceph::buffer::list bl;
9f95a23c
TL
2767 std::list<region_t> regs; // original read regions
2768
2769 read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
2770
f67539c2 2771 friend std::ostream& operator<<(std::ostream& out, const read_req_t& r) {
9f95a23c
TL
2772 out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
2773 for (const auto& reg : r.regs)
2774 out << reg;
2775 return out << "]}" << std::dec;
2776 }
2777 };
2778
f67539c2
TL
2779 typedef std::list<read_req_t> regions2read_t;
2780 typedef std::map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
9f95a23c
TL
2781
2782 void _read_cache(
2783 OnodeRef o,
2784 uint64_t offset,
2785 size_t length,
2786 int read_cache_policy,
2787 ready_regions_t& ready_regions,
2788 blobs2read_t& blobs2read);
2789
2790
2791 int _prepare_read_ioc(
2792 blobs2read_t& blobs2read,
f67539c2 2793 std::vector<ceph::buffer::list>* compressed_blob_bls,
9f95a23c
TL
2794 IOContext* ioc);
2795
2796 int _generate_read_result_bl(
2797 OnodeRef o,
2798 uint64_t offset,
2799 size_t length,
2800 ready_regions_t& ready_regions,
f67539c2 2801 std::vector<ceph::buffer::list>& compressed_blob_bls,
9f95a23c
TL
2802 blobs2read_t& blobs2read,
2803 bool buffered,
2804 bool* csum_error,
f67539c2 2805 ceph::buffer::list& bl);
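 // The three helpers above make up the sketched read pipeline driven by
 // _do_read()/_do_readv(): _read_cache() satisfies what it can from the buffer
 // cache and records the remaining extents in blobs2read, _prepare_read_ioc()
 // turns those into aios on the IOContext (collecting compressed blobs into
 // compressed_blob_bls), and _generate_read_result_bl() verifies checksums,
 // decompresses where needed and stitches the final buffer::list together,
 // reporting any checksum failure via *csum_error.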
9f95a23c 2806
7c673cae
FG
2807 int _do_read(
2808 Collection *c,
2809 OnodeRef o,
2810 uint64_t offset,
2811 size_t len,
f67539c2 2812 ceph::buffer::list& bl,
f64942e4
AA
2813 uint32_t op_flags = 0,
2814 uint64_t retry_count = 0);
7c673cae 2815
9f95a23c
TL
2816 int _do_readv(
2817 Collection *c,
2818 OnodeRef o,
2819 const interval_set<uint64_t>& m,
f67539c2 2820 ceph::buffer::list& bl,
9f95a23c
TL
2821 uint32_t op_flags = 0,
2822 uint64_t retry_count = 0);
2823
7c673cae 2824 int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
f67539c2 2825 uint64_t offset, size_t len, interval_set<uint64_t>& destset);
7c673cae 2826public:
7c673cae 2827 int fiemap(CollectionHandle &c, const ghobject_t& oid,
f67539c2 2828 uint64_t offset, size_t len, ceph::buffer::list& bl) override;
7c673cae 2829 int fiemap(CollectionHandle &c, const ghobject_t& oid,
f67539c2 2830 uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;
7c673cae 2831
9f95a23c
TL
2832 int readv(
2833 CollectionHandle &c_,
2834 const ghobject_t& oid,
2835 interval_set<uint64_t>& m,
f67539c2 2836 ceph::buffer::list& bl,
9f95a23c
TL
2837 uint32_t op_flags) override;
2838
2839 int dump_onode(CollectionHandle &c, const ghobject_t& oid,
f67539c2 2840 const std::string& section_name, ceph::Formatter *f) override;
7c673cae 2841
7c673cae 2842 int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
f67539c2 2843 ceph::buffer::ptr& value) override;
7c673cae 2844
7c673cae 2845 int getattrs(CollectionHandle &c, const ghobject_t& oid,
f67539c2 2846 std::map<std::string,ceph::buffer::ptr>& aset) override;
7c673cae 2847
f67539c2 2848 int list_collections(std::vector<coll_t>& ls) override;
7c673cae
FG
2849
2850 CollectionHandle open_collection(const coll_t &c) override;
11fdf7f2
TL
2851 CollectionHandle create_new_collection(const coll_t& cid) override;
2852 void set_collection_commit_queue(const coll_t& cid,
2853 ContextQueue *commit_queue) override;
7c673cae
FG
2854
2855 bool collection_exists(const coll_t& c) override;
11fdf7f2
TL
2856 int collection_empty(CollectionHandle& c, bool *empty) override;
2857 int collection_bits(CollectionHandle& c) override;
7c673cae 2858
7c673cae
FG
2859 int collection_list(CollectionHandle &c,
2860 const ghobject_t& start,
2861 const ghobject_t& end,
2862 int max,
f67539c2 2863 std::vector<ghobject_t> *ls, ghobject_t *next) override;
7c673cae 2864
f91f0fd5
TL
2865 int collection_list_legacy(CollectionHandle &c,
2866 const ghobject_t& start,
2867 const ghobject_t& end,
2868 int max,
f67539c2 2869 std::vector<ghobject_t> *ls,
f91f0fd5
TL
2870 ghobject_t *next) override;
2871
7c673cae
FG
2872 int omap_get(
2873 CollectionHandle &c, ///< [in] Collection containing oid
2874 const ghobject_t &oid, ///< [in] Object containing omap
f67539c2
TL
2875 ceph::buffer::list *header, ///< [out] omap header
 2876 std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
7c673cae 2877 ) override;
9f95a23c
TL
2878 int _omap_get(
2879 Collection *c, ///< [in] Collection containing oid
2880 const ghobject_t &oid, ///< [in] Object containing omap
f67539c2
TL
2881 ceph::buffer::list *header, ///< [out] omap header
 2882 std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
9f95a23c
TL
2883 );
2884 int _onode_omap_get(
2885 const OnodeRef &o, ///< [in] Object containing omap
f67539c2
TL
2886 ceph::buffer::list *header, ///< [out] omap header
 2887 std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
9f95a23c
TL
2888 );
2889
7c673cae
FG
2890
2891 /// Get omap header
7c673cae
FG
2892 int omap_get_header(
2893 CollectionHandle &c, ///< [in] Collection containing oid
2894 const ghobject_t &oid, ///< [in] Object containing omap
f67539c2 2895 ceph::buffer::list *header, ///< [out] omap header
7c673cae
FG
2896 bool allow_eio = false ///< [in] don't assert on eio
2897 ) override;
2898
2899 /// Get keys defined on oid
7c673cae
FG
2900 int omap_get_keys(
2901 CollectionHandle &c, ///< [in] Collection containing oid
2902 const ghobject_t &oid, ///< [in] Object containing omap
f67539c2 2903 std::set<std::string> *keys ///< [out] Keys defined on oid
7c673cae
FG
2904 ) override;
2905
2906 /// Get key values
7c673cae
FG
2907 int omap_get_values(
2908 CollectionHandle &c, ///< [in] Collection containing oid
2909 const ghobject_t &oid, ///< [in] Object containing omap
f67539c2
TL
2910 const std::set<std::string> &keys, ///< [in] Keys to get
2911 std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
7c673cae
FG
2912 ) override;
2913
9f95a23c
TL
2914#ifdef WITH_SEASTAR
2915 int omap_get_values(
2916 CollectionHandle &c, ///< [in] Collection containing oid
2917 const ghobject_t &oid, ///< [in] Object containing omap
f67539c2
TL
 2918 const std::optional<std::string> &start_after, ///< [in] Key after which to start iterating
2919 std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
9f95a23c
TL
2920 ) override;
2921#endif
2922
7c673cae 2923 /// Filters keys into out which are defined on oid
7c673cae
FG
2924 int omap_check_keys(
2925 CollectionHandle &c, ///< [in] Collection containing oid
2926 const ghobject_t &oid, ///< [in] Object containing omap
f67539c2
TL
2927 const std::set<std::string> &keys, ///< [in] Keys to check
2928 std::set<std::string> *out ///< [out] Subset of keys defined on oid
7c673cae
FG
2929 ) override;
2930
7c673cae
FG
2931 ObjectMap::ObjectMapIterator get_omap_iterator(
2932 CollectionHandle &c, ///< [in] collection
2933 const ghobject_t &oid ///< [in] object
2934 ) override;
2935
2936 void set_fsid(uuid_d u) override {
2937 fsid = u;
2938 }
2939 uuid_d get_fsid() override {
2940 return fsid;
2941 }
2942
2943 uint64_t estimate_objects_overhead(uint64_t num_objects) override {
2944 return num_objects * 300; //assuming per-object overhead is 300 bytes
2945 }
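 // e.g. an estimate for one million objects comes out to roughly
 // 1,000,000 * 300 bytes = ~286 MiB of overhead under this heuristic.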
2946
2947 struct BSPerfTracker {
11fdf7f2
TL
2948 PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
2949 PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
7c673cae
FG
2950
2951 objectstore_perf_stat_t get_cur_stats() const {
2952 objectstore_perf_stat_t ret;
11fdf7f2
TL
2953 ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
2954 ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
7c673cae
FG
2955 return ret;
2956 }
2957
2958 void update_from_perfcounters(PerfCounters &logger);
2959 } perf_tracker;
2960
2961 objectstore_perf_stat_t get_cur_stats() override {
2962 perf_tracker.update_from_perfcounters(*logger);
2963 return perf_tracker.get_cur_stats();
2964 }
2965 const PerfCounters* get_perf_counters() const override {
2966 return logger;
2967 }
9f95a23c
TL
2968 const PerfCounters* get_bluefs_perf_counters() const {
2969 return bluefs->get_perf_counters();
2970 }
f67539c2
TL
2971 KeyValueDB* get_kv() {
2972 return db;
2973 }
7c673cae
FG
2974
2975 int queue_transactions(
11fdf7f2 2976 CollectionHandle& ch,
f67539c2 2977 std::vector<Transaction>& tls,
7c673cae
FG
2978 TrackedOpRef op = TrackedOpRef(),
2979 ThreadPool::TPHandle *handle = NULL) override;
2980
2981 // error injection
2982 void inject_data_error(const ghobject_t& o) override {
9f95a23c 2983 std::unique_lock l(debug_read_error_lock);
7c673cae
FG
2984 debug_data_error_objects.insert(o);
2985 }
2986 void inject_mdata_error(const ghobject_t& o) override {
9f95a23c 2987 std::unique_lock l(debug_read_error_lock);
7c673cae
FG
2988 debug_mdata_error_objects.insert(o);
2989 }
11fdf7f2
TL
2990
2991 /// methods to inject various errors fsck can repair
f67539c2
TL
2992 void inject_broken_shared_blob_key(const std::string& key,
2993 const ceph::buffer::list& bl);
11fdf7f2
TL
2994 void inject_leaked(uint64_t len);
2995 void inject_false_free(coll_t cid, ghobject_t oid);
f67539c2 2996 void inject_statfs(const std::string& key, const store_statfs_t& new_statfs);
eafe8130 2997 void inject_global_statfs(const store_statfs_t& new_statfs);
11fdf7f2
TL
2998 void inject_misreference(coll_t cid1, ghobject_t oid1,
2999 coll_t cid2, ghobject_t oid2,
3000 uint64_t offset);
adb31ebb 3001 void inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, int16_t blob_id);
9f95a23c
TL
3002 // resets global per_pool_omap in DB
3003 void inject_legacy_omap();
3004 // resets per_pool_omap | pgmeta_omap for onode
3005 void inject_legacy_omap(coll_t cid, ghobject_t oid);
11fdf7f2 3006
224ce89b 3007 void compact() override {
11fdf7f2 3008 ceph_assert(db);
224ce89b
WB
3009 db->compact();
3010 }
28e407b8
AA
3011 bool has_builtin_csum() const override {
3012 return true;
3013 }
3014
494da23a
TL
3015 inline void log_latency(const char* name,
3016 int idx,
3017 const ceph::timespan& lat,
3018 double lat_threshold,
3019 const char* info = "") const;
3020
3021 inline void log_latency_fn(const char* name,
3022 int idx,
3023 const ceph::timespan& lat,
3024 double lat_threshold,
f67539c2 3025 std::function<std::string (const ceph::timespan& lat)> fn) const;
11fdf7f2 3026
7c673cae
FG
3027private:
3028 bool _debug_data_eio(const ghobject_t& o) {
3029 if (!cct->_conf->bluestore_debug_inject_read_err) {
3030 return false;
3031 }
9f95a23c 3032 std::shared_lock l(debug_read_error_lock);
7c673cae
FG
3033 return debug_data_error_objects.count(o);
3034 }
3035 bool _debug_mdata_eio(const ghobject_t& o) {
3036 if (!cct->_conf->bluestore_debug_inject_read_err) {
3037 return false;
3038 }
9f95a23c 3039 std::shared_lock l(debug_read_error_lock);
7c673cae
FG
3040 return debug_mdata_error_objects.count(o);
3041 }
3042 void _debug_obj_on_delete(const ghobject_t& o) {
3043 if (cct->_conf->bluestore_debug_inject_read_err) {
9f95a23c 3044 std::unique_lock l(debug_read_error_lock);
7c673cae
FG
3045 debug_data_error_objects.erase(o);
3046 debug_mdata_error_objects.erase(o);
3047 }
3048 }
11fdf7f2
TL
3049private:
3050 ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
f67539c2
TL
3051 std::string failed_cmode;
3052 std::set<std::string> failed_compressors;
3053 std::string spillover_alert;
3054 std::string legacy_statfs_alert;
3055 std::string no_per_pool_omap_alert;
3056 std::string no_per_pg_omap_alert;
3057 std::string disk_size_mismatch_alert;
3058 std::string spurious_read_errors_alert;
11fdf7f2
TL
3059
3060 void _log_alerts(osd_alert_list_t& alerts);
3061 bool _set_compression_alert(bool cmode, const char* s) {
3062 std::lock_guard l(qlock);
3063 if (cmode) {
3064 bool ret = failed_cmode.empty();
3065 failed_cmode = s;
3066 return ret;
3067 }
3068 return failed_compressors.emplace(s).second;
3069 }
3070 void _clear_compression_alert() {
3071 std::lock_guard l(qlock);
3072 failed_compressors.clear();
3073 failed_cmode.clear();
3074 }
3075
f67539c2 3076 void _set_spillover_alert(const std::string& s) {
11fdf7f2
TL
3077 std::lock_guard l(qlock);
3078 spillover_alert = s;
3079 }
3080 void _clear_spillover_alert() {
3081 std::lock_guard l(qlock);
3082 spillover_alert.clear();
3083 }
7c673cae 3084
81eedcae 3085 void _check_legacy_statfs_alert();
f67539c2
TL
3086 void _check_no_per_pg_or_pool_omap_alert();
3087 void _set_disk_size_mismatch_alert(const std::string& s) {
81eedcae
TL
3088 std::lock_guard l(qlock);
3089 disk_size_mismatch_alert = s;
3090 }
f67539c2
TL
 3091 void _set_spurious_read_errors_alert(const std::string& s) {
3092 std::lock_guard l(qlock);
3093 spurious_read_errors_alert = s;
3094 }
81eedcae 3095
7c673cae
FG
3096private:
3097
3098 // --------------------------------------------------------
3099 // read processing internal methods
3100 int _verify_csum(
3101 OnodeRef& o,
3102 const bluestore_blob_t* blob,
3103 uint64_t blob_xoffset,
f67539c2 3104 const ceph::buffer::list& bl,
7c673cae 3105 uint64_t logical_offset) const;
f67539c2 3106 int _decompress(ceph::buffer::list& source, ceph::buffer::list* result);
7c673cae
FG
3107
3108
3109 // --------------------------------------------------------
3110 // write ops
3111
3112 struct WriteContext {
3113 bool buffered = false; ///< buffered write
3114 bool compress = false; ///< compressed write
3115 uint64_t target_blob_size = 0; ///< target (max) blob size
3116 unsigned csum_order = 0; ///< target checksum chunk order
3117
3118 old_extent_map_t old_extents; ///< must deref these blobs
eafe8130 3119 interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
7c673cae
FG
3120
3121 struct write_item {
3122 uint64_t logical_offset; ///< write logical offset
3123 BlobRef b;
3124 uint64_t blob_length;
3125 uint64_t b_off;
f67539c2 3126 ceph::buffer::list bl;
7c673cae
FG
3127 uint64_t b_off0; ///< original offset in a blob prior to padding
3128 uint64_t length0; ///< original data length prior to padding
3129
3130 bool mark_unused;
3131 bool new_blob; ///< whether new blob was created
3132
3efd9988 3133 bool compressed = false;
f67539c2 3134 ceph::buffer::list compressed_bl;
3efd9988
FG
3135 size_t compressed_len = 0;
3136
7c673cae
FG
3137 write_item(
3138 uint64_t logical_offs,
3139 BlobRef b,
3140 uint64_t blob_len,
3141 uint64_t o,
f67539c2 3142 ceph::buffer::list& bl,
7c673cae
FG
3143 uint64_t o0,
3144 uint64_t l0,
3145 bool _mark_unused,
3146 bool _new_blob)
3147 :
3148 logical_offset(logical_offs),
3149 b(b),
3150 blob_length(blob_len),
3151 b_off(o),
3152 bl(bl),
3153 b_off0(o0),
3154 length0(l0),
3155 mark_unused(_mark_unused),
3156 new_blob(_new_blob) {}
3157 };
f67539c2 3158 std::vector<write_item> writes; ///< blobs we're writing
7c673cae
FG
3159
3160 /// partial clone of the context
3161 void fork(const WriteContext& other) {
3162 buffered = other.buffered;
3163 compress = other.compress;
3164 target_blob_size = other.target_blob_size;
3165 csum_order = other.csum_order;
3166 }
3167 void write(
3168 uint64_t loffs,
3169 BlobRef b,
3170 uint64_t blob_len,
3171 uint64_t o,
f67539c2 3172 ceph::buffer::list& bl,
7c673cae
FG
3173 uint64_t o0,
3174 uint64_t len0,
3175 bool _mark_unused,
3176 bool _new_blob) {
3177 writes.emplace_back(loffs,
3178 b,
3179 blob_len,
3180 o,
3181 bl,
3182 o0,
3183 len0,
3184 _mark_unused,
3185 _new_blob);
3186 }
3187 /// Checks for writes to the same pextent within a blob
3188 bool has_conflict(
3189 BlobRef b,
3190 uint64_t loffs,
3191 uint64_t loffs_end,
3192 uint64_t min_alloc_size);
3193 };
3194
3195 void _do_write_small(
3196 TransContext *txc,
3197 CollectionRef &c,
3198 OnodeRef o,
3199 uint64_t offset, uint64_t length,
f67539c2 3200 ceph::buffer::list::iterator& blp,
7c673cae 3201 WriteContext *wctx);
f67539c2
TL
3202 void _do_write_big_apply_deferred(
3203 TransContext* txc,
3204 CollectionRef& c,
3205 OnodeRef o,
3206 BigDeferredWriteContext& dctx,
3207 bufferlist::iterator& blp,
3208 WriteContext* wctx);
7c673cae
FG
3209 void _do_write_big(
3210 TransContext *txc,
3211 CollectionRef &c,
3212 OnodeRef o,
3213 uint64_t offset, uint64_t length,
f67539c2 3214 ceph::buffer::list::iterator& blp,
7c673cae
FG
3215 WriteContext *wctx);
3216 int _do_alloc_write(
3217 TransContext *txc,
3218 CollectionRef c,
3219 OnodeRef o,
3220 WriteContext *wctx);
3221 void _wctx_finish(
3222 TransContext *txc,
3223 CollectionRef& c,
3224 OnodeRef o,
31f18b77 3225 WriteContext *wctx,
f67539c2 3226 std::set<SharedBlob*> *maybe_unshared_blobs=0);
7c673cae 3227
7c673cae
FG
3228 int _write(TransContext *txc,
3229 CollectionRef& c,
3230 OnodeRef& o,
3231 uint64_t offset, size_t len,
f67539c2 3232 ceph::buffer::list& bl,
7c673cae 3233 uint32_t fadvise_flags);
f67539c2 3234 void _pad_zeros(ceph::buffer::list *bl, uint64_t *offset,
7c673cae
FG
3235 uint64_t chunk_size);
3236
31f18b77
FG
3237 void _choose_write_options(CollectionRef& c,
3238 OnodeRef o,
3239 uint32_t fadvise_flags,
3240 WriteContext *wctx);
3241
3242 int _do_gc(TransContext *txc,
3243 CollectionRef& c,
3244 OnodeRef o,
31f18b77
FG
3245 const WriteContext& wctx,
3246 uint64_t *dirty_start,
3247 uint64_t *dirty_end);
3248
7c673cae
FG
3249 int _do_write(TransContext *txc,
3250 CollectionRef &c,
3251 OnodeRef o,
3252 uint64_t offset, uint64_t length,
f67539c2 3253 ceph::buffer::list& bl,
7c673cae
FG
3254 uint32_t fadvise_flags);
3255 void _do_write_data(TransContext *txc,
3256 CollectionRef& c,
3257 OnodeRef o,
3258 uint64_t offset,
3259 uint64_t length,
f67539c2 3260 ceph::buffer::list& bl,
7c673cae
FG
3261 WriteContext *wctx);
3262
3263 int _touch(TransContext *txc,
3264 CollectionRef& c,
3265 OnodeRef& o);
3266 int _do_zero(TransContext *txc,
3267 CollectionRef& c,
3268 OnodeRef& o,
3269 uint64_t offset, size_t len);
3270 int _zero(TransContext *txc,
3271 CollectionRef& c,
3272 OnodeRef& o,
3273 uint64_t offset, size_t len);
3274 void _do_truncate(TransContext *txc,
3275 CollectionRef& c,
3276 OnodeRef o,
31f18b77 3277 uint64_t offset,
f67539c2 3278 std::set<SharedBlob*> *maybe_unshared_blobs=0);
35e4c445 3279 int _truncate(TransContext *txc,
7c673cae
FG
3280 CollectionRef& c,
3281 OnodeRef& o,
3282 uint64_t offset);
3283 int _remove(TransContext *txc,
3284 CollectionRef& c,
3285 OnodeRef& o);
3286 int _do_remove(TransContext *txc,
3287 CollectionRef& c,
3288 OnodeRef o);
3289 int _setattr(TransContext *txc,
3290 CollectionRef& c,
3291 OnodeRef& o,
f67539c2
TL
3292 const std::string& name,
3293 ceph::buffer::ptr& val);
7c673cae
FG
3294 int _setattrs(TransContext *txc,
3295 CollectionRef& c,
3296 OnodeRef& o,
f67539c2 3297 const std::map<std::string,ceph::buffer::ptr>& aset);
7c673cae
FG
3298 int _rmattr(TransContext *txc,
3299 CollectionRef& c,
3300 OnodeRef& o,
f67539c2 3301 const std::string& name);
7c673cae
FG
3302 int _rmattrs(TransContext *txc,
3303 CollectionRef& c,
3304 OnodeRef& o);
9f95a23c 3305 void _do_omap_clear(TransContext *txc, OnodeRef &o);
7c673cae
FG
3306 int _omap_clear(TransContext *txc,
3307 CollectionRef& c,
3308 OnodeRef& o);
3309 int _omap_setkeys(TransContext *txc,
3310 CollectionRef& c,
3311 OnodeRef& o,
f67539c2 3312 ceph::buffer::list& bl);
7c673cae
FG
3313 int _omap_setheader(TransContext *txc,
3314 CollectionRef& c,
3315 OnodeRef& o,
f67539c2 3316 ceph::buffer::list& header);
7c673cae
FG
3317 int _omap_rmkeys(TransContext *txc,
3318 CollectionRef& c,
3319 OnodeRef& o,
f67539c2 3320 ceph::buffer::list& bl);
7c673cae
FG
3321 int _omap_rmkey_range(TransContext *txc,
3322 CollectionRef& c,
3323 OnodeRef& o,
f67539c2 3324 const std::string& first, const std::string& last);
7c673cae
FG
3325 int _set_alloc_hint(
3326 TransContext *txc,
3327 CollectionRef& c,
3328 OnodeRef& o,
3329 uint64_t expected_object_size,
3330 uint64_t expected_write_size,
3331 uint32_t flags);
3332 int _do_clone_range(TransContext *txc,
3333 CollectionRef& c,
3334 OnodeRef& oldo,
3335 OnodeRef& newo,
3336 uint64_t srcoff, uint64_t length, uint64_t dstoff);
3337 int _clone(TransContext *txc,
3338 CollectionRef& c,
3339 OnodeRef& oldo,
3340 OnodeRef& newo);
3341 int _clone_range(TransContext *txc,
3342 CollectionRef& c,
3343 OnodeRef& oldo,
3344 OnodeRef& newo,
3345 uint64_t srcoff, uint64_t length, uint64_t dstoff);
3346 int _rename(TransContext *txc,
3347 CollectionRef& c,
3348 OnodeRef& oldo,
3349 OnodeRef& newo,
3350 const ghobject_t& new_oid);
3351 int _create_collection(TransContext *txc, const coll_t &cid,
3352 unsigned bits, CollectionRef *c);
3353 int _remove_collection(TransContext *txc, const coll_t &cid,
3354 CollectionRef *c);
11fdf7f2 3355 void _do_remove_collection(TransContext *txc, CollectionRef *c);
7c673cae
FG
3356 int _split_collection(TransContext *txc,
3357 CollectionRef& c,
3358 CollectionRef& d,
3359 unsigned bits, int rem);
11fdf7f2
TL
3360 int _merge_collection(TransContext *txc,
3361 CollectionRef *c,
3362 CollectionRef& d,
3363 unsigned bits);
3364
9f95a23c
TL
3365 void _collect_allocation_stats(uint64_t need, uint32_t alloc_size,
3366 size_t extents);
3367 void _record_allocation_stats();
11fdf7f2 3368private:
9f95a23c
TL
3369 uint64_t probe_count = 0;
3370 std::atomic<uint64_t> alloc_stats_count = {0};
3371 std::atomic<uint64_t> alloc_stats_fragments = { 0 };
3372 std::atomic<uint64_t> alloc_stats_size = { 0 };
3373 //
3374 std::array<std::tuple<uint64_t, uint64_t, uint64_t>, 5> alloc_stats_history =
3375 { std::make_tuple(0ul, 0ul, 0ul) };
3376
9f95a23c 3377 inline bool _use_rotational_settings();
eafe8130
TL
3378
3379public:
3380 struct sb_info_t {
3381 coll_t cid;
3382 int64_t pool_id = INT64_MIN;
f67539c2 3383 std::list<ghobject_t> oids;
eafe8130
TL
3384 BlueStore::SharedBlobRef sb;
3385 bluestore_extent_ref_map_t ref_map;
3386 bool compressed = false;
3387 bool passed = false;
3388 bool updated = false;
3389 };
3390 typedef btree::btree_set<
3391 uint64_t, std::less<uint64_t>,
3392 mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
3393
3394 typedef mempool::bluestore_fsck::map<uint64_t, sb_info_t> sb_info_map_t;
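 // Bundles references to the shared fsck counters and state so that per-object
 // checks (which may run on multiple worker threads) update a single set of
 // totals; when non-null, sb_info_lock is expected to guard sb_info access.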
3395 struct FSCK_ObjectCtx {
3396 int64_t& errors;
3397 int64_t& warnings;
3398 uint64_t& num_objects;
3399 uint64_t& num_extents;
3400 uint64_t& num_blobs;
3401 uint64_t& num_sharded_objects;
3402 uint64_t& num_spanning_blobs;
3403
3404 mempool_dynamic_bitset* used_blocks;
3405 uint64_t_btree_t* used_omap_head;
eafe8130
TL
3406
3407 ceph::mutex* sb_info_lock;
3408 sb_info_map_t& sb_info;
3409
3410 store_statfs_t& expected_store_statfs;
3411 per_pool_statfs& expected_pool_statfs;
3412 BlueStoreRepairer* repairer;
3413
3414 FSCK_ObjectCtx(int64_t& e,
3415 int64_t& w,
3416 uint64_t& _num_objects,
3417 uint64_t& _num_extents,
3418 uint64_t& _num_blobs,
3419 uint64_t& _num_sharded_objects,
3420 uint64_t& _num_spanning_blobs,
3421 mempool_dynamic_bitset* _ub,
3422 uint64_t_btree_t* _used_omap_head,
eafe8130
TL
3423 ceph::mutex* _sb_info_lock,
3424 sb_info_map_t& _sb_info,
3425 store_statfs_t& _store_statfs,
3426 per_pool_statfs& _pool_statfs,
3427 BlueStoreRepairer* _repairer) :
3428 errors(e),
3429 warnings(w),
3430 num_objects(_num_objects),
3431 num_extents(_num_extents),
3432 num_blobs(_num_blobs),
3433 num_sharded_objects(_num_sharded_objects),
3434 num_spanning_blobs(_num_spanning_blobs),
3435 used_blocks(_ub),
3436 used_omap_head(_used_omap_head),
eafe8130
TL
3437 sb_info_lock(_sb_info_lock),
3438 sb_info(_sb_info),
3439 expected_store_statfs(_store_statfs),
3440 expected_pool_statfs(_pool_statfs),
3441 repairer(_repairer) {
3442 }
3443 };
3444
3445 OnodeRef fsck_check_objects_shallow(
3446 FSCKDepth depth,
3447 int64_t pool_id,
3448 CollectionRef c,
3449 const ghobject_t& oid,
f67539c2
TL
3450 const std::string& key,
3451 const ceph::buffer::list& value,
3452 mempool::bluestore_fsck::list<std::string>* expecting_shards,
3453 std::map<BlobRef, bluestore_blob_t::unused_t>* referenced,
eafe8130
TL
3454 const BlueStore::FSCK_ObjectCtx& ctx);
3455
3456private:
9f95a23c
TL
3457 void _fsck_check_object_omap(FSCKDepth depth,
3458 OnodeRef& o,
3459 const BlueStore::FSCK_ObjectCtx& ctx);
3460
eafe8130
TL
3461 void _fsck_check_objects(FSCKDepth depth,
3462 FSCK_ObjectCtx& ctx);
7c673cae
FG
3463};
3464
f67539c2 3465inline std::ostream& operator<<(std::ostream& out, const BlueStore::volatile_statfs& s) {
11fdf7f2
TL
3466 return out
3467 << " allocated:"
3468 << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
3469 << " stored:"
3470 << s.values[BlueStore::volatile_statfs::STATFS_STORED]
3471 << " compressed:"
3472 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
3473 << " compressed_orig:"
3474 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
3475 << " compressed_alloc:"
3476 << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
7c673cae
FG
3477}
3478
3479static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
3480 o->get();
3481}
3482static inline void intrusive_ptr_release(BlueStore::Onode *o) {
3483 o->put();
3484}
3485
3486static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
3487 o->get();
3488}
3489static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
3490 o->put();
3491}
3492
11fdf7f2
TL
3493class BlueStoreRepairer
3494{
3495public:
3496 // to simplify future potential migration to mempools
3497 using fsck_interval = interval_set<uint64_t>;
3498
3499 // Structure to track which pextents are used by a specific cid/oid.
3500 // As with a Bloom filter, only positive and false-positive matches
3501 // are possible.
3502 // Maintains two lists of bloom filters, one for cids and one for oids,
3503 // where each list entry is a BF covering a specific disk pextent range.
3504 // The extent length covered per filter is determined in init().
3505 // Allows filtering out 'uninteresting' pextents to speed up subsequent
3506 // 'is_used' lookups.
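 // Typical fsck usage, as a rough sketch (the calls below are illustrative,
 // not actual call sites):
 //
 //   StoreSpaceTracker& t = repairer.get_space_usage_tracker();
 //   t.init(bdev_size, min_alloc_size);
 //   t.set_used(pextent_offset, pextent_length, cid, oid); // while scanning
 //   t.filter_out(misreferenced_extents);                  // after the scan
 //   bool suspect = t.is_used(cid) || t.is_used(oid);      // attribution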
3507 struct StoreSpaceTracker {
3508 const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
3509 const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
3510 const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrarily selected
3511 static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
3512
3513 typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
3514 bloom_vector collections_bfs;
3515 bloom_vector objects_bfs;
3516
3517 bool was_filtered_out = false;
3518 uint64_t granularity = 0; // extent length for a single filter
3519
3520 StoreSpaceTracker() {
3521 }
3522 StoreSpaceTracker(const StoreSpaceTracker& from) :
3523 collections_bfs(from.collections_bfs),
3524 objects_bfs(from.objects_bfs),
3525 granularity(from.granularity) {
3526 }
3527
3528 void init(uint64_t total,
3529 uint64_t min_alloc_size,
3530 uint64_t mem_cap = DEF_MEM_CAP) {
3531 ceph_assert(!granularity); // not initialized yet
3532 ceph_assert(min_alloc_size && isp2(min_alloc_size));
3533 ceph_assert(mem_cap);
3534
3535 total = round_up_to(total, min_alloc_size);
3536 granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
3537
3538 if (!granularity) {
3539 granularity = min_alloc_size;
3540 } else {
3541 granularity = round_up_to(granularity, min_alloc_size);
3542 }
3543
3544 uint64_t entries = round_up_to(total, granularity) / granularity;
3545 collections_bfs.resize(entries,
3546 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3547 BLOOM_FILTER_TABLE_SIZE,
3548 0,
3549 BLOOM_FILTER_EXPECTED_COUNT));
3550 objects_bfs.resize(entries,
3551 bloom_filter(BLOOM_FILTER_SALT_COUNT,
3552 BLOOM_FILTER_TABLE_SIZE,
3553 0,
3554 BLOOM_FILTER_EXPECTED_COUNT));
3555 }
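    // Sizing illustration (not normative): for a 1 TiB device with the default
    // 128 MiB mem_cap, granularity = 2^40 * 32 * 2 / 2^27 = 512 KiB, giving
    // ~2M filters per vector; two vectors of 2M filter tables at 32 bytes each
    // is roughly the 128 MiB budget (ignoring per-filter object overhead).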
3556 inline uint32_t get_hash(const coll_t& cid) const {
3557 return cid.hash_to_shard(1);
3558 }
3559 inline void set_used(uint64_t offset, uint64_t len,
3560 const coll_t& cid, const ghobject_t& oid) {
3561 ceph_assert(granularity); // initialized
3562
3563 // can't call this func after filter_out has been applied
3564 ceph_assert(!was_filtered_out);
3565 if (!len) {
3566 return;
3567 }
3568 auto pos = offset / granularity;
3569 auto end_pos = (offset + len - 1) / granularity;
3570 while (pos <= end_pos) {
3571 collections_bfs[pos].insert(get_hash(cid));
3572 objects_bfs[pos].insert(oid.hobj.get_hash());
3573 ++pos;
3574 }
3575 }
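    // e.g. with granularity = 512 KiB, set_used(1 MiB, 1 MiB, ...) touches
    // filters 2 and 3: pos = 1 MiB / 512 KiB = 2, end_pos = (2 MiB - 1) / 512 KiB = 3.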
3576 // Filter out entries unrelated to the specified (broken) extents.
3577 // Only 'is_used' calls are permitted after this has been applied.
3578 size_t filter_out(const fsck_interval& extents);
3579
3580 // determines whether the collection is present after filtering out
3581 inline bool is_used(const coll_t& cid) const {
3582 ceph_assert(was_filtered_out);
3583 for(auto& bf : collections_bfs) {
3584 if (bf.contains(get_hash(cid))) {
3585 return true;
3586 }
3587 }
3588 return false;
3589 }
3590 // determines whether the object is present after filtering out
3591 inline bool is_used(const ghobject_t& oid) const {
3592 ceph_assert(was_filtered_out);
3593 for(auto& bf : objects_bfs) {
3594 if (bf.contains(oid.hobj.get_hash())) {
3595 return true;
3596 }
3597 }
3598 return false;
3599 }
3600 // determines whether the collection is present before filtering out
3601 inline bool is_used(const coll_t& cid, uint64_t offs) const {
3602 ceph_assert(granularity); // initialized
3603 ceph_assert(!was_filtered_out);
3604 auto &bf = collections_bfs[offs / granularity];
3605 if (bf.contains(get_hash(cid))) {
3606 return true;
3607 }
3608 return false;
3609 }
3610 // determines whether the object is present before filtering out
3611 inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
3612 ceph_assert(granularity); // initialized
3613 ceph_assert(!was_filtered_out);
3614 auto &bf = objects_bfs[offs / granularity];
3615 if (bf.contains(oid.hobj.get_hash())) {
3616 return true;
3617 }
3618 return false;
3619 }
3620 };
3621public:
f67539c2
TL
3622 void fix_per_pool_omap(KeyValueDB *db, int);
3623 bool remove_key(KeyValueDB *db, const std::string& prefix, const std::string& key);
11fdf7f2
TL
3624 bool fix_shared_blob(KeyValueDB *db,
3625 uint64_t sbid,
f67539c2
TL
3626 const ceph::buffer::list* bl);
3627 bool fix_statfs(KeyValueDB *db, const std::string& key,
11fdf7f2
TL
3628 const store_statfs_t& new_statfs);
3629
3630 bool fix_leaked(KeyValueDB *db,
3631 FreelistManager* fm,
3632 uint64_t offset, uint64_t len);
3633 bool fix_false_free(KeyValueDB *db,
3634 FreelistManager* fm,
3635 uint64_t offset, uint64_t len);
adb31ebb 3636 KeyValueDB::Transaction fix_spanning_blobs(KeyValueDB* db);
11fdf7f2
TL
3637
3638 void init(uint64_t total_space, uint64_t lres_tracking_unit_size);
3639
3640 bool preprocess_misreference(KeyValueDB *db);
3641
3642 unsigned apply(KeyValueDB* db);
3643
3644 void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
3645 misreferenced_extents.union_insert(offs, len);
3646 if (inc_error) {
3647 ++to_repair_cnt;
3648 }
3649 }
9f95a23c 3650 // In fact this is the repairer's only thread-safe method!!
eafe8130
TL
3651 void inc_repaired() {
3652 ++to_repair_cnt;
9f95a23c 3653 }
11fdf7f2
TL
3654
3655 StoreSpaceTracker& get_space_usage_tracker() {
3656 return space_usage_tracker;
3657 }
3658 const fsck_interval& get_misreferences() const {
3659 return misreferenced_extents;
3660 }
3661 KeyValueDB::Transaction get_fix_misreferences_txn() {
3662 return fix_misreferences_txn;
3663 }
3664
3665private:
9f95a23c
TL
3666 std::atomic<unsigned> to_repair_cnt = { 0 };
3667 KeyValueDB::Transaction fix_per_pool_omap_txn;
11fdf7f2
TL
3668 KeyValueDB::Transaction fix_fm_leaked_txn;
3669 KeyValueDB::Transaction fix_fm_false_free_txn;
3670 KeyValueDB::Transaction remove_key_txn;
3671 KeyValueDB::Transaction fix_statfs_txn;
3672 KeyValueDB::Transaction fix_shared_blob_txn;
3673
3674 KeyValueDB::Transaction fix_misreferences_txn;
adb31ebb 3675 KeyValueDB::Transaction fix_onode_txn;
11fdf7f2
TL
3676
3677 StoreSpaceTracker space_usage_tracker;
3678
3679 // non-shared extents with multiple references
3680 fsck_interval misreferenced_extents;
3681
3682};
9f95a23c
TL
3683
3684class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
3685{
3686 template <class T, size_t MaxX, size_t MaxY>
3687 class matrix_2d {
3688 T values[MaxX][MaxY];
3689 public:
3690 matrix_2d() {
3691 clear();
3692 }
3693 T& at(size_t x, size_t y) {
3694 ceph_assert(x < MaxX);
3695 ceph_assert(y < MaxY);
3696
3697 return values[x][y];
3698 }
3699 size_t get_max_x() const {
3700 return MaxX;
3701 }
3702 size_t get_max_y() const {
3703 return MaxY;
3704 }
3705 void clear() {
3706 memset(values, 0, sizeof(values));
3707 }
3708 };
3709
3710 enum {
3711 // use 0/nullptr as unset indication
3712 LEVEL_FIRST = 1,
f6b5b4d7
TL
3713 LEVEL_LOG = LEVEL_FIRST, // BlueFS log
3714 LEVEL_WAL,
9f95a23c
TL
3715 LEVEL_DB,
3716 LEVEL_SLOW,
3717 LEVEL_MAX
3718 };
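  // The void* hints passed to add_usage()/sub_usage() simply carry one of the
  // LEVEL_* ordinals above cast to a pointer (hence 0/nullptr == unset); this
  // selector never dereferences them.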
3719 // one extra slot in the level dimension (index LEVEL_MAX - LEVEL_FIRST) keeps per-device totals;
3720 // one extra slot in the device dimension (index BlueFS::MAX_BDEV) keeps per-level actual totals (taken from file sizes)
3721 typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
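  // e.g. at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST) is the number of bytes
  // that DB-level files currently occupy on the DB device (illustrative; any
  // level may in fact spill onto any device, hence the full matrix).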
3722
3723 per_level_per_dev_usage_t per_level_per_dev_usage;
f6b5b4d7
TL
3724 // file count per level; one extra slot keeps the total file count
3725 uint64_t per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };
9f95a23c
TL
3726
3727 // Note: the maximum per-device totals below might be smaller than the
3728 // corresponding perf counters by up to a single alloc unit (1M) due to the
3729 // superblock extent. The latter is not accounted for here.
3730 per_level_per_dev_usage_t per_level_per_dev_max;
3731
3732 uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
3733 uint64_t db_avail4slow = 0;
3734 enum {
3735 OLD_POLICY,
3736 USE_SOME_EXTRA
3737 };
3738
3739public:
3740 RocksDBBlueFSVolumeSelector(
3741 uint64_t _wal_total,
3742 uint64_t _db_total,
3743 uint64_t _slow_total,
3744 uint64_t _level0_size,
3745 uint64_t _level_base,
3746 uint64_t _level_multiplier,
3747 double reserved_factor,
3748 uint64_t reserved,
3749 bool new_pol)
3750 {
f6b5b4d7 3751 l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
9f95a23c
TL
3752 l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
3753 l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
3754 l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
3755
3756 if (!new_pol) {
3757 return;
3758 }
3759
3760 // Calculate how much extra space is available on the DB volume.
3761 // Depending on whether an explicit reserved size is specified, this is either
3762 // * DB volume size - reserved
3763 // or
3764 // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
3765 if (!reserved) {
3766 uint64_t prev_levels = _level0_size;
3767 uint64_t cur_level = _level_base;
3768 uint64_t cur_threshold = 0;
3769 do {
3770 uint64_t next_level = cur_level * _level_multiplier;
3771 uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
3772 if (_db_total <= next_threshold) {
3773 db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
3774 break;
3775 } else {
3776 prev_levels += cur_level;
3777 cur_level = next_level;
3778 cur_threshold = next_threshold;
3779 }
3780 } while (true);
3781 } else {
3782 db_avail4slow = _db_total - reserved;
3783 }
3784 }
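  // Worked example of the loop above (illustrative numbers only): with
  // _level0_size = _level_base = 1 GB, _level_multiplier = 10,
  // reserved_factor = 1 and _db_total = 60 GB:
  //   pass 1: threshold = 1 + 1 + 10   = 12 GB  (60 > 12, continue)
  //   pass 2: threshold = 2 + 10 + 100 = 112 GB (60 <= 112, stop)
  // so db_avail4slow = 60 GB - 12 GB = 48 GB of DB space may be used for data
  // that would otherwise spill to the slow device.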
3785
f6b5b4d7
TL
3786 void* get_hint_for_log() const override {
3787 return reinterpret_cast<void*>(LEVEL_LOG);
9f95a23c 3788 }
f67539c2 3789 void* get_hint_by_dir(const std::string& dirname) const override;
9f95a23c
TL
3790
3791 void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
3792 if (hint == nullptr)
3793 return;
3794 size_t pos = (size_t)hint - LEVEL_FIRST;
3795 for (auto& p : fnode.extents) {
3796 auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
3797 auto& max = per_level_per_dev_max.at(p.bdev, pos);
3798 cur += p.length;
3799 if (cur > max) {
3800 max = cur;
3801 }
3802 {
3803 //update per-device totals
3804 auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
3805 auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
3806 cur += p.length;
3807 if (cur > max) {
3808 max = cur;
3809 }
3810 }
3811 }
3812 {
3813 //update per-level actual totals
3814 auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
3815 auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
3816 cur += fnode.size;
3817 if (cur > max) {
3818 max = cur;
3819 }
3820 }
f6b5b4d7
TL
3821 ++per_level_files[pos];
3822 ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
9f95a23c
TL
3823 }
3824 void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
3825 if (hint == nullptr)
3826 return;
3827 size_t pos = (size_t)hint - LEVEL_FIRST;
3828 for (auto& p : fnode.extents) {
3829 auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
3830 ceph_assert(cur >= p.length);
3831 cur -= p.length;
3832
3833 //update per-device totals
3834 auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
3835 ceph_assert(cur2 >= p.length);
3836 cur2 -= p.length;
3837 }
3838 //update per-level actual totals
3839 auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
3840 ceph_assert(cur >= fnode.size);
3841 cur -= fnode.size;
f6b5b4d7
TL
3842 ceph_assert(per_level_files[pos] > 0);
3843 --per_level_files[pos];
3844 ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
3845 --per_level_files[LEVEL_MAX - LEVEL_FIRST];
9f95a23c
TL
3846 }
3847 void add_usage(void* hint, uint64_t fsize) override {
3848 if (hint == nullptr)
3849 return;
3850 size_t pos = (size_t)hint - LEVEL_FIRST;
3851 //update per-level actual totals
3852 auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
3853 auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
3854 cur += fsize;
3855 if (cur > max) {
3856 max = cur;
3857 }
3858 }
3859 void sub_usage(void* hint, uint64_t fsize) override {
3860 if (hint == nullptr)
3861 return;
3862 size_t pos = (size_t)hint - LEVEL_FIRST;
3863 //update per-level actual totals
3864 auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
3865 ceph_assert(cur >= fsize);
3866 per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos) -= fsize;
3867 }
3868
3869 uint8_t select_prefer_bdev(void* h) override;
3870 void get_paths(
3871 const std::string& base,
3872 BlueFSVolumeSelector::paths& res) const override;
3873
f67539c2 3874 void dump(std::ostream& sout) override;
9f95a23c
TL
3875};
3876
7c673cae 3877#endif