]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.h
bump version to 18.2.4-pve3
[ceph.git] / ceph / src / os / bluestore / BlueFS.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4#define CEPH_OS_BLUESTORE_BLUEFS_H
5
6#include <atomic>
7#include <mutex>
cd265ab1 8#include <limits>
7c673cae
FG
9
10#include "bluefs_types.h"
f67539c2 11#include "blk/BlockDevice.h"
7c673cae 12
9f95a23c
TL
13#include "common/RefCountedObj.h"
14#include "common/ceph_context.h"
15#include "global/global_context.h"
16#include "include/common_fwd.h"
7c673cae 17
9f95a23c
TL
18#include "boost/intrusive/list.hpp"
19#include "boost/dynamic_bitset.hpp"
7c673cae
FG
20
21class Allocator;
22
// Perf-counter indices for BlueFS.  The enumerators are consecutive:
// l_bluefs_first is the explicit base value and l_bluefs_last is the
// final enumerator.  Order matters because every value after the first
// is implicitly numbered.
enum {
  l_bluefs_first = 732600,

  // *_total_bytes / *_used_bytes for db, wal, slow
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,

  // file / log counters
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_log_write_count,
  l_bluefs_logged_bytes,

  // *_written_* / write_count_* counters
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_write_count_wal,
  l_bluefs_write_count_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,

  // max_bytes_* (referenced by BlueFS::max_bytes_pcounters)
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,

  // *_alloc_unit
  l_bluefs_main_alloc_unit,
  l_bluefs_db_alloc_unit,
  l_bluefs_wal_alloc_unit,

  // read_random_*
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_disk_bytes_wal,
  l_bluefs_read_random_disk_bytes_db,
  l_bluefs_read_random_disk_bytes_slow,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,

  // read_*
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_disk_count,
  l_bluefs_read_disk_bytes,
  l_bluefs_read_disk_bytes_wal,
  l_bluefs_read_disk_bytes_db,
  l_bluefs_read_disk_bytes_slow,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,

  // write_*
  l_bluefs_write_count,
  l_bluefs_write_disk_count,
  l_bluefs_write_bytes,

  // compaction latency / allocation fallbacks / read-zeros checks
  l_bluefs_compaction_lat,
  l_bluefs_compaction_lock_lat,
  l_bluefs_alloc_shared_dev_fallbacks,
  l_bluefs_alloc_shared_size_fallbacks,
  l_bluefs_read_zeros_candidate,
  l_bluefs_read_zeros_errors,

  l_bluefs_last,
};
78
9f95a23c
TL
79class BlueFSVolumeSelector {
80public:
81 typedef std::vector<std::pair<std::string, uint64_t>> paths;
82
83 virtual ~BlueFSVolumeSelector() {
84 }
f6b5b4d7 85 virtual void* get_hint_for_log() const = 0;
b3b6e05e 86 virtual void* get_hint_by_dir(std::string_view dirname) const = 0;
9f95a23c
TL
87
88 virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
89 virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
90 virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
91 virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
92 virtual uint8_t select_prefer_bdev(void* hint) = 0;
93 virtual void get_paths(const std::string& base, paths& res) const = 0;
f67539c2 94 virtual void dump(std::ostream& sout) = 0;
20effc67
TL
95
96 /* used for sanity checking of vselector */
97 virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; }
98 virtual bool compare(BlueFSVolumeSelector* other) { return true; };
f67539c2
TL
99};
100
101struct bluefs_shared_alloc_context_t {
102 bool need_init = false;
103 Allocator* a = nullptr;
39ae355f 104 uint64_t alloc_unit = 0;
f67539c2
TL
105
106 std::atomic<uint64_t> bluefs_used = 0;
107
39ae355f 108 void set(Allocator* _a, uint64_t _au) {
f67539c2 109 a = _a;
39ae355f 110 alloc_unit = _au;
f67539c2
TL
111 need_init = true;
112 bluefs_used = 0;
113 }
114 void reset() {
115 a = nullptr;
39ae355f 116 alloc_unit = 0;
f67539c2 117 }
11fdf7f2
TL
118};
119
7c673cae
FG
120class BlueFS {
121public:
122 CephContext* cct;
11fdf7f2 123 static constexpr unsigned MAX_BDEV = 5;
7c673cae
FG
124 static constexpr unsigned BDEV_WAL = 0;
125 static constexpr unsigned BDEV_DB = 1;
126 static constexpr unsigned BDEV_SLOW = 2;
11fdf7f2
TL
127 static constexpr unsigned BDEV_NEWWAL = 3;
128 static constexpr unsigned BDEV_NEWDB = 4;
7c673cae
FG
129
130 enum {
131 WRITER_UNKNOWN,
132 WRITER_WAL,
133 WRITER_SST,
134 };
135
136 struct File : public RefCountedObject {
137 MEMPOOL_CLASS_HELPERS();
138
139 bluefs_fnode_t fnode;
140 int refs;
141 uint64_t dirty_seq;
142 bool locked;
143 bool deleted;
522d829b 144 bool is_dirty;
7c673cae
FG
145 boost::intrusive::list_member_hook<> dirty_item;
146
147 std::atomic_int num_readers, num_writers;
148 std::atomic_int num_reading;
149
9f95a23c 150 void* vselector_hint = nullptr;
20effc67
TL
151 /* lock protects fnode and other the parts that can be modified during read & write operations.
152 Does not protect values that are fixed
153 Does not need to be taken when doing one-time operations:
154 _replay, device_migrate_to_existing, device_migrate_to_new */
155 ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock");
9f95a23c
TL
156
157 private:
158 FRIEND_MAKE_REF(File);
7c673cae 159 File()
9f95a23c 160 :
7c673cae
FG
161 refs(0),
162 dirty_seq(0),
163 locked(false),
164 deleted(false),
522d829b 165 is_dirty(false),
7c673cae
FG
166 num_readers(0),
167 num_writers(0),
9f95a23c
TL
168 num_reading(0),
169 vselector_hint(nullptr)
7c673cae
FG
170 {}
171 ~File() override {
11fdf7f2
TL
172 ceph_assert(num_readers.load() == 0);
173 ceph_assert(num_writers.load() == 0);
174 ceph_assert(num_reading.load() == 0);
175 ceph_assert(!locked);
7c673cae 176 }
7c673cae 177 };
9f95a23c 178 using FileRef = ceph::ref_t<File>;
7c673cae
FG
179
180 typedef boost::intrusive::list<
181 File,
182 boost::intrusive::member_hook<
183 File,
184 boost::intrusive::list_member_hook<>,
185 &File::dirty_item> > dirty_file_list_t;
186
187 struct Dir : public RefCountedObject {
188 MEMPOOL_CLASS_HELPERS();
189
b3b6e05e 190 mempool::bluefs::map<std::string, FileRef, std::less<>> file_map;
7c673cae 191
9f95a23c
TL
192 private:
193 FRIEND_MAKE_REF(Dir);
194 Dir() = default;
7c673cae 195 };
9f95a23c 196 using DirRef = ceph::ref_t<Dir>;
7c673cae
FG
197
198 struct FileWriter {
199 MEMPOOL_CLASS_HELPERS();
200
201 FileRef file;
9f95a23c 202 uint64_t pos = 0; ///< start offset for buffer
f67539c2
TL
203 private:
204 ceph::buffer::list buffer; ///< new data to write (at end of file)
205 ceph::buffer::list tail_block; ///< existing partial block at end of file, if any
206 public:
207 unsigned get_buffer_length() const {
208 return buffer.length();
209 }
210 ceph::bufferlist flush_buffer(
211 CephContext* cct,
212 const bool partial,
213 const unsigned length,
214 const bluefs_super_t& super);
215 ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only
216 public:
7c673cae 217 int writer_type = 0; ///< WRITER_*
11fdf7f2 218 int write_hint = WRITE_LIFE_NOT_SET;
7c673cae 219
11fdf7f2 220 ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
7c673cae 221 std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
11fdf7f2 222 std::array<bool, MAX_BDEV> dirty_devs;
7c673cae
FG
223
224 FileWriter(FileRef f)
9f95a23c 225 : file(std::move(f)),
f67539c2
TL
226 buffer_appender(buffer.get_page_aligned_appender(
227 g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
7c673cae
FG
228 ++file->num_writers;
229 iocv.fill(nullptr);
11fdf7f2 230 dirty_devs.fill(false);
9f95a23c 231 if (file->fnode.ino == 1) {
11fdf7f2
TL
232 write_hint = WRITE_LIFE_MEDIUM;
233 }
7c673cae
FG
234 }
235 // NOTE: caller must call BlueFS::close_writer()
236 ~FileWriter() {
237 --file->num_writers;
238 }
239
240 // note: BlueRocksEnv uses this append exclusively, so it's safe
20effc67 241 // to use buffer_appender exclusively here (e.g., its notion of
7c673cae
FG
242 // offset will remain accurate).
243 void append(const char *buf, size_t len) {
f67539c2 244 uint64_t l0 = get_buffer_length();
cd265ab1 245 ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
7c673cae
FG
246 buffer_appender.append(buf, len);
247 }
248
20effc67
TL
249 void append(const std::byte *buf, size_t len) {
250 // allow callers to use byte type instead of char* as we simply pass byte array
251 append((const char*)buf, len);
252 }
253
7c673cae 254 // note: used internally only, for ino 1 or 0.
cd265ab1 255 void append(ceph::buffer::list& bl) {
f67539c2 256 uint64_t l0 = get_buffer_length();
cd265ab1 257 ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max());
7c673cae
FG
258 buffer.claim_append(bl);
259 }
260
f67539c2
TL
261 void append_zero(size_t len) {
262 uint64_t l0 = get_buffer_length();
263 ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
264 buffer_appender.append_zero(len);
265 }
266
7c673cae 267 uint64_t get_effective_write_pos() {
7c673cae
FG
268 return pos + buffer.length();
269 }
270 };
271
272 struct FileReaderBuffer {
273 MEMPOOL_CLASS_HELPERS();
274
9f95a23c 275 uint64_t bl_off = 0; ///< prefetch buffer logical offset
f67539c2 276 ceph::buffer::list bl; ///< prefetch buffer
9f95a23c 277 uint64_t pos = 0; ///< current logical offset
7c673cae
FG
278 uint64_t max_prefetch; ///< max allowed prefetch
279
280 explicit FileReaderBuffer(uint64_t mpf)
9f95a23c 281 : max_prefetch(mpf) {}
7c673cae 282
9f95a23c 283 uint64_t get_buf_end() const {
7c673cae
FG
284 return bl_off + bl.length();
285 }
9f95a23c 286 uint64_t get_buf_remaining(uint64_t p) const {
7c673cae
FG
287 if (p >= bl_off && p < bl_off + bl.length())
288 return bl_off + bl.length() - p;
289 return 0;
290 }
291
292 void skip(size_t n) {
293 pos += n;
294 }
f67539c2
TL
295
296 // For the sake of simplicity, we invalidate completed rather than
297 // for the provided extent
298 void invalidate_cache(uint64_t offset, uint64_t length) {
299 if (offset >= bl_off && offset < get_buf_end()) {
300 bl.clear();
301 bl_off = 0;
302 }
7c673cae
FG
303 }
304 };
305
306 struct FileReader {
307 MEMPOOL_CLASS_HELPERS();
308
309 FileRef file;
310 FileReaderBuffer buf;
311 bool random;
312 bool ignore_eof; ///< used when reading our log file
313
494da23a
TL
314 ceph::shared_mutex lock {
315 ceph::make_shared_mutex(std::string(), false, false, false)
316 };
317
318
7c673cae
FG
319 FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
320 : file(f),
321 buf(mpf),
322 random(rand),
323 ignore_eof(ie) {
324 ++file->num_readers;
325 }
326 ~FileReader() {
327 --file->num_readers;
328 }
329 };
330
331 struct FileLock {
332 MEMPOOL_CLASS_HELPERS();
333
334 FileRef file;
9f95a23c 335 explicit FileLock(FileRef f) : file(std::move(f)) {}
7c673cae
FG
336 };
337
338private:
7c673cae
FG
339 PerfCounters *logger = nullptr;
340
11fdf7f2
TL
341 uint64_t max_bytes[MAX_BDEV] = {0};
342 uint64_t max_bytes_pcounters[MAX_BDEV] = {
343 l_bluefs_max_bytes_wal,
344 l_bluefs_max_bytes_db,
345 l_bluefs_max_bytes_slow,
39ae355f
TL
346 l_bluefs_max_bytes_wal,
347 l_bluefs_max_bytes_db,
11fdf7f2
TL
348 };
349
7c673cae 350 // cache
20effc67
TL
351 struct {
352 ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock");
353 mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map; ///< dirname -> Dir
354 mempool::bluefs::unordered_map<uint64_t, FileRef> file_map; ///< ino -> File
355 } nodes;
7c673cae
FG
356
357 bluefs_super_t super; ///< latest superblock (as last written)
358 uint64_t ino_last = 0; ///< last assigned ino (this one is in use)
7c673cae 359
20effc67
TL
360 struct {
361 ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock");
362 uint64_t seq_live = 1; //seq that log is currently writing to; mirrors dirty.seq_live
363 FileWriter *writer = 0;
364 bluefs_transaction_t t;
365 } log;
366
367 struct {
368 ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock");
369 uint64_t seq_stable = 0; //seq that is now stable on disk
370 uint64_t seq_live = 1; //seq that is ongoing and dirty files will be written to
371 // map of dirty files, files of same dirty_seq are grouped into list.
372 std::map<uint64_t, dirty_file_list_t> files;
373 std::vector<interval_set<uint64_t>> pending_release; ///< extents to release
374 // TODO: it should be examined what makes pending_release immune to
375 // eras in a way similar to dirty_files. Hints:
376 // 1) we have actually only 2 eras: log_seq and log_seq+1
377 // 2) we usually not remove extents from files. And when we do, we force log-syncing.
378 } dirty;
379
380 ceph::condition_variable log_cond; ///< used for state control between log flush / log compaction
381 std::atomic<bool> log_is_compacting{false}; ///< signals that bluefs log is already ongoing compaction
382 std::atomic<bool> log_forbidden_to_expand{false}; ///< used to signal that async compaction is in state
383 /// that prohibits expansion of bluefs log
7c673cae
FG
384 /*
385 * There are up to 3 block devices:
386 *
387 * BDEV_DB db/ - the primary db device
388 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
389 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
390 */
f67539c2
TL
391 std::vector<BlockDevice*> bdev; ///< block devices we can use
392 std::vector<IOContext*> ioc; ///< IOContexts for bdevs
393 std::vector<uint64_t> block_reserved; ///< starting reserve extent per device
394 std::vector<Allocator*> alloc; ///< allocators for bdevs
395 std::vector<uint64_t> alloc_size; ///< alloc size for each device
20effc67 396
f67539c2 397 //std::vector<interval_set<uint64_t>> block_unused_too_granular;
7c673cae 398
11fdf7f2
TL
399 BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
400
9f95a23c 401 std::unique_ptr<BlueFSVolumeSelector> vselector;
11fdf7f2 402
f67539c2
TL
403 bluefs_shared_alloc_context_t* shared_alloc = nullptr;
404 unsigned shared_alloc_id = unsigned(-1);
405 inline bool is_shared_alloc(unsigned id) const {
406 return id == shared_alloc_id;
407 }
39ae355f 408 std::atomic<int64_t> cooldown_deadline = 0;
f67539c2 409
eafe8130
TL
410 class SocketHook;
411 SocketHook* asok_hook = nullptr;
cd265ab1
TL
412 // used to trigger zeros into read (debug / verify)
413 std::atomic<uint64_t> inject_read_zeros{0};
eafe8130 414
7c673cae
FG
415 void _init_logger();
416 void _shutdown_logger();
417 void _update_logger_stats();
418
419 void _init_alloc();
420 void _stop_alloc();
421
39ae355f
TL
422 ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros
423 void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0);
f67539c2
TL
424
425 uint64_t _get_used(unsigned id) const;
426 uint64_t _get_total(unsigned id) const;
427
7c673cae
FG
428
429 FileRef _get_file(uint64_t ino);
20effc67 430 void _drop_link_D(FileRef f);
7c673cae 431
1911f103
TL
432 unsigned _get_slow_device_id() {
433 return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
434 }
eafe8130 435 const char* get_device_name(unsigned id);
7c673cae 436 int _allocate(uint8_t bdev, uint64_t len,
39ae355f
TL
437 uint64_t alloc_unit,
438 bluefs_fnode_t* node,
439 size_t alloc_attempts = 0,
440 bool permit_dev_fallback = true);
11fdf7f2 441
522d829b 442 /* signal replay log to include h->file in nearest log flush */
20effc67
TL
443 int _signal_dirty_to_log_D(FileWriter *h);
444 int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length);
445 int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered);
446 int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr);
447 uint64_t _flush_special(FileWriter *h);
448 int _fsync(FileWriter *h);
7c673cae 449
11fdf7f2 450#ifdef HAVE_LIBAIO
f67539c2 451 void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls);
20effc67 452 void _wait_for_aio(FileWriter *h); // safe to call without a lock
11fdf7f2 453#endif
7c673cae 454
20effc67
TL
455 int64_t _maybe_extend_log();
456 void _extend_log();
457 uint64_t _log_advance_seq();
458 void _consume_dirty(uint64_t seq);
459 void _clear_dirty_set_stable_D(uint64_t seq_stable);
460 void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release);
461
462 void _flush_and_sync_log_core(int64_t available_runway);
463 int _flush_and_sync_log_jump_D(uint64_t jump_to,
464 int64_t available_runway);
465 int _flush_and_sync_log_LD(uint64_t want_seq = 0);
466
39ae355f
TL
467 uint64_t _estimate_transaction_size(bluefs_transaction_t* t);
468 uint64_t _make_initial_transaction(uint64_t start_seq,
469 bluefs_fnode_t& fnode,
470 uint64_t expected_final_size,
471 bufferlist* out);
20effc67
TL
472 uint64_t _estimate_log_size_N();
473 bool _should_start_compact_log_L_N();
11fdf7f2
TL
474
475 enum {
476 REMOVE_DB = 1,
477 REMOVE_WAL = 2,
478 RENAME_SLOW2DB = 4,
479 RENAME_DB2SLOW = 8,
480 };
39ae355f
TL
481 void _compact_log_dump_metadata_NF(uint64_t start_seq,
482 bluefs_transaction_t *t,
483 int flags,
484 uint64_t capture_before_seq);
11fdf7f2 485
20effc67
TL
486 void _compact_log_sync_LNF_LD();
487 void _compact_log_async_LD_LNF_D();
488
39ae355f 489 void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
9f95a23c
TL
490 int super_dev,
491 int log_dev,
492 int new_log_dev,
493 int flags,
494 std::optional<bluefs_layout_t> layout);
7c673cae
FG
495
496 //void _aio_finish(void *priv);
497
39ae355f 498 void _flush_bdev(FileWriter *h, bool check_mutex_locked = true);
20effc67
TL
499 void _flush_bdev(); // this is safe to call without a lock
500 void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock
7c673cae
FG
501
502 int _preallocate(FileRef f, uint64_t off, uint64_t len);
503 int _truncate(FileWriter *h, uint64_t off);
504
adb31ebb 505 int64_t _read(
7c673cae 506 FileReader *h, ///< [in] read from here
7c673cae
FG
507 uint64_t offset, ///< [in] offset
508 size_t len, ///< [in] this many bytes
f67539c2 509 ceph::buffer::list *outbl, ///< [out] optional: reference the result here
7c673cae 510 char *out); ///< [out] optional: or copy it here
adb31ebb 511 int64_t _read_random(
7c673cae
FG
512 FileReader *h, ///< [in] read from here
513 uint64_t offset, ///< [in] offset
9f95a23c 514 uint64_t len, ///< [in] this many bytes
7c673cae
FG
515 char *out); ///< [out] optional: or copy it here
516
7c673cae 517 int _open_super();
11fdf7f2 518 int _write_super(int dev);
20effc67
TL
519 int _check_allocations(const bluefs_fnode_t& fnode,
520 boost::dynamic_bitset<uint64_t>* used_blocks,
521 bool is_alloc, //true when allocating, false when deallocating
522 const char* op_name);
9f95a23c
TL
523 int _verify_alloc_granularity(
524 __u8 id, uint64_t offset, uint64_t length,
39ae355f 525 uint64_t alloc_unit,
9f95a23c 526 const char *op);
11fdf7f2 527 int _replay(bool noop, bool to_stdout = false); ///< replay journal
7c673cae
FG
528
529 FileWriter *_create_writer(FileRef f);
20effc67 530 void _drain_writer(FileWriter *h);
7c673cae
FG
531 void _close_writer(FileWriter *h);
532
533 // always put the super in the second 4k block. FIXME should this be
534 // block size independent?
/// Byte offset of the superblock: the second 4k block.
unsigned get_super_offset() {
  constexpr unsigned second_4k_block = 4096;
  return second_4k_block;
}
/// Length in bytes reserved for the superblock (one 4k block).
unsigned get_super_length() {
  constexpr unsigned one_4k_block = 4096;
  return one_4k_block;
}
20effc67
TL
541 void _maybe_check_vselector_LNF() {
542 if (cct->_conf->bluefs_check_volume_selector_often) {
543 _check_vselector_LNF();
544 }
545 }
7c673cae
FG
546public:
547 BlueFS(CephContext* cct);
548 ~BlueFS();
549
550 // the super is always stored on bdev 0
9f95a23c 551 int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
7c673cae 552 int mount();
9f95a23c 553 int maybe_verify_layout(const bluefs_layout_t& layout) const;
1911f103 554 void umount(bool avoid_compact = false);
9f95a23c 555 int prepare_new_device(int id, const bluefs_layout_t& layout);
11fdf7f2
TL
556
557 int log_dump();
7c673cae 558
f67539c2
TL
559 void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id);
560 void get_devices(std::set<std::string> *ls);
eafe8130
TL
561 uint64_t get_alloc_size(int id) {
562 return alloc_size[id];
563 }
7c673cae
FG
564 int fsck();
565
11fdf7f2
TL
566 int device_migrate_to_new(
567 CephContext *cct,
f67539c2 568 const std::set<int>& devs_source,
9f95a23c
TL
569 int dev_target,
570 const bluefs_layout_t& layout);
11fdf7f2
TL
571 int device_migrate_to_existing(
572 CephContext *cct,
f67539c2 573 const std::set<int>& devs_source,
9f95a23c
TL
574 int dev_target,
575 const bluefs_layout_t& layout);
11fdf7f2
TL
576
577 uint64_t get_used();
7c673cae
FG
578 uint64_t get_total(unsigned id);
579 uint64_t get_free(unsigned id);
f67539c2
TL
580 uint64_t get_used(unsigned id);
581 void dump_perf_counters(ceph::Formatter *f);
7c673cae 582
f67539c2 583 void dump_block_extents(std::ostream& out);
3efd9988 584
7c673cae 585 /// get current extents that we own for given block device
1e59de90
TL
586 void foreach_block_extents(
587 unsigned id,
588 std::function<void(uint64_t, uint32_t)> cb);
7c673cae
FG
589
590 int open_for_write(
b3b6e05e
TL
591 std::string_view dir,
592 std::string_view file,
7c673cae
FG
593 FileWriter **h,
594 bool overwrite);
595
596 int open_for_read(
b3b6e05e
TL
597 std::string_view dir,
598 std::string_view file,
7c673cae
FG
599 FileReader **h,
600 bool random = false);
601
20effc67
TL
602 // data added after last fsync() is lost
603 void close_writer(FileWriter *h);
7c673cae 604
b3b6e05e
TL
605 int rename(std::string_view old_dir, std::string_view old_file,
606 std::string_view new_dir, std::string_view new_file);
7c673cae 607
b3b6e05e 608 int readdir(std::string_view dirname, std::vector<std::string> *ls);
7c673cae 609
b3b6e05e
TL
610 int unlink(std::string_view dirname, std::string_view filename);
611 int mkdir(std::string_view dirname);
612 int rmdir(std::string_view dirname);
d2e6a577 613 bool wal_is_rotational();
1d09f67e 614 bool db_is_rotational();
7c673cae 615
b3b6e05e
TL
616 bool dir_exists(std::string_view dirname);
617 int stat(std::string_view dirname, std::string_view filename,
7c673cae
FG
618 uint64_t *size, utime_t *mtime);
619
b3b6e05e 620 int lock_file(std::string_view dirname, std::string_view filename, FileLock **p);
7c673cae
FG
621 int unlock_file(FileLock *l);
622
7c673cae
FG
623 void compact_log();
624
625 /// sync any uncommitted state to disk
1911f103 626 void sync_metadata(bool avoid_compact);
7c673cae 627
9f95a23c
TL
628 void set_volume_selector(BlueFSVolumeSelector* s) {
629 vselector.reset(s);
630 }
f67539c2 631 void dump_volume_selector(std::ostream& sout) {
9f95a23c
TL
632 vselector->dump(sout);
633 }
634 void get_vselector_paths(const std::string& base,
635 BlueFSVolumeSelector::paths& res) const {
636 return vselector->get_paths(base, res);
637 }
638
f67539c2
TL
639 int add_block_device(unsigned bdev, const std::string& path, bool trim,
640 uint64_t reserved,
641 bluefs_shared_alloc_context_t* _shared_alloc = nullptr);
7c673cae 642 bool bdev_support_label(unsigned id);
f67539c2 643 uint64_t get_block_device_size(unsigned bdev) const;
7c673cae 644
11fdf7f2
TL
645 // handler for discard event
646 void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
647
20effc67 648 void flush(FileWriter *h, bool force = false);
cd265ab1 649
20effc67
TL
650 void append_try_flush(FileWriter *h, const char* buf, size_t len);
651 void flush_range(FileWriter *h, uint64_t offset, uint64_t length);
652 int fsync(FileWriter *h);
f67539c2
TL
653 int64_t read(FileReader *h, uint64_t offset, size_t len,
654 ceph::buffer::list *outbl, char *out) {
7c673cae
FG
655 // no need to hold the global lock here; we only touch h and
656 // h->file, and read vs write or delete is already protected (via
657 // atomics and asserts).
f67539c2 658 return _read(h, offset, len, outbl, out);
7c673cae 659 }
adb31ebb 660 int64_t read_random(FileReader *h, uint64_t offset, size_t len,
7c673cae
FG
661 char *out) {
662 // no need to hold the global lock here; we only touch h and
663 // h->file, and read vs write or delete is already protected (via
664 // atomics and asserts).
665 return _read_random(h, offset, len, out);
666 }
20effc67
TL
667 void invalidate_cache(FileRef f, uint64_t offset, uint64_t len);
668 int preallocate(FileRef f, uint64_t offset, uint64_t len);
669 int truncate(FileWriter *h, uint64_t offset);
7c673cae 670
f67539c2
TL
671 size_t probe_alloc_avail(int dev, uint64_t alloc_size);
672
9f95a23c 673 /// test purpose methods
9f95a23c
TL
674 const PerfCounters* get_perf_counters() const {
675 return logger;
676 }
522d829b
TL
677 uint64_t debug_get_dirty_seq(FileWriter *h);
678 bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev);
cd265ab1
TL
679
680private:
681 // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
682 // They are used for checking if read values are all 0, and reread if so.
20effc67 683 int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
cd265ab1 684 ceph::buffer::list *pbl, IOContext *ioc, bool buffered);
20effc67
TL
685 int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered);
686
687 int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
688 ceph::buffer::list* pbl, IOContext* ioc, bool buffered);
689 int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered);
690
691 /// test and compact log, if necessary
692 void _maybe_compact_log_LNF_NF_LD_D();
693 int _do_replay_recovery_read(FileReader *log,
694 size_t log_pos,
695 size_t read_offset,
696 size_t read_len,
697 bufferlist* bl);
698 void _check_vselector_LNF();
9f95a23c
TL
699};
700
701class OriginalVolumeSelector : public BlueFSVolumeSelector {
702 uint64_t wal_total;
703 uint64_t db_total;
704 uint64_t slow_total;
705
706public:
707 OriginalVolumeSelector(
708 uint64_t _wal_total,
709 uint64_t _db_total,
710 uint64_t _slow_total)
711 : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}
712
f6b5b4d7 713 void* get_hint_for_log() const override;
b3b6e05e 714 void* get_hint_by_dir(std::string_view dirname) const override;
9f95a23c
TL
715
716 void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
717 // do nothing
718 return;
719 }
720 void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
721 // do nothing
722 return;
723 }
724 void add_usage(void* hint, uint64_t fsize) override {
725 // do nothing
726 return;
727 }
728 void sub_usage(void* hint, uint64_t fsize) override {
729 // do nothing
730 return;
731 }
732
733 uint8_t select_prefer_bdev(void* hint) override;
734 void get_paths(const std::string& base, paths& res) const override;
f67539c2
TL
735 void dump(std::ostream& sout) override;
736};
737
738class FitToFastVolumeSelector : public OriginalVolumeSelector {
739public:
740 FitToFastVolumeSelector(
741 uint64_t _wal_total,
742 uint64_t _db_total,
743 uint64_t _slow_total)
744 : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {}
745
746 void get_paths(const std::string& base, paths& res) const override;
7c673cae 747};
20effc67
TL
/**
 * Directed graph of locks.
 * Vertices - locks. Edges (directed) - locking progression.
 * Edge A->B exists if the last taken lock was A and the next taken lock is B.
 *
 * Row represents the last lock taken.
 * Column represents the next lock taken.
 *
 *     >        | W | L | N | D | F
 * -------------|---|---|---|---|---
 * FileWriter W |   | > | > | > | >
 * log        L |   |   | > | > | >
 * nodes      N |   |   |   | > | >
 * dirty      D |   |   |   |   | >
 * File       F |   |   |   |   |
 *
 * Claim: Deadlock is possible IFF the graph contains cycles.
 */
7c673cae 766#endif