]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.h
bump version to 19.2.0-pve1
[ceph.git] / ceph / src / os / bluestore / BlueFS.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4#define CEPH_OS_BLUESTORE_BLUEFS_H
5
6#include <atomic>
7#include <mutex>
cd265ab1 8#include <limits>
7c673cae
FG
9
10#include "bluefs_types.h"
f67539c2 11#include "blk/BlockDevice.h"
7c673cae 12
9f95a23c
TL
13#include "common/RefCountedObj.h"
14#include "common/ceph_context.h"
15#include "global/global_context.h"
16#include "include/common_fwd.h"
7c673cae 17
9f95a23c
TL
18#include "boost/intrusive/list.hpp"
19#include "boost/dynamic_bitset.hpp"
7c673cae
FG
20
21class Allocator;
22
/// Identifiers of the BlueFS performance counters.
///
/// Values are assigned sequentially, starting at l_bluefs_first and ending at
/// l_bluefs_last, so inserting or removing an enumerator shifts the numeric
/// value of every enumerator that follows it.
enum {
  l_bluefs_first = 732600,
  // capacity / usage
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,
  l_bluefs_num_files,
  // log
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_log_write_count,
  l_bluefs_logged_bytes,
  // writes, split by writer type
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_write_count_wal,
  l_bluefs_write_count_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,
  // high-water marks and allocation units per device
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,
  l_bluefs_slow_alloc_unit,
  l_bluefs_db_alloc_unit,
  l_bluefs_wal_alloc_unit,
  // random reads
  l_bluefs_read_random_lat,
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_disk_bytes_wal,
  l_bluefs_read_random_disk_bytes_db,
  l_bluefs_read_random_disk_bytes_slow,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  // sequential reads
  l_bluefs_read_lat,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_disk_count,
  l_bluefs_read_disk_bytes,
  l_bluefs_read_disk_bytes_wal,
  l_bluefs_read_disk_bytes_db,
  l_bluefs_read_disk_bytes_slow,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,
  // writes
  l_bluefs_write_count,
  l_bluefs_write_disk_count,
  l_bluefs_write_bytes,
  // latencies
  l_bluefs_compaction_lat,
  l_bluefs_compaction_lock_lat,
  l_bluefs_fsync_lat,
  l_bluefs_flush_lat,
  l_bluefs_unlink_lat,
  l_bluefs_truncate_lat,
  // allocation fallbacks
  l_bluefs_alloc_shared_dev_fallbacks,
  l_bluefs_alloc_shared_size_fallbacks,
  // zero-read detection
  l_bluefs_read_zeros_candidate,
  l_bluefs_read_zeros_errors,
  // allocation latencies per device
  l_bluefs_wal_alloc_lat,
  l_bluefs_db_alloc_lat,
  l_bluefs_slow_alloc_lat,
  l_bluefs_wal_alloc_max_lat,
  l_bluefs_db_alloc_max_lat,
  l_bluefs_slow_alloc_max_lat,
  l_bluefs_last,
};

91class BlueFSVolumeSelector {
92public:
93 typedef std::vector<std::pair<std::string, uint64_t>> paths;
94
95 virtual ~BlueFSVolumeSelector() {
96 }
f51cf556
TL
97 /**
98 * Method to learn a hint (aka logic level discriminator) specific for
99 * BlueFS log
100 *
101 */
f6b5b4d7 102 virtual void* get_hint_for_log() const = 0;
f51cf556
TL
103 /**
104 * Method to learn a hint (aka logic level discriminator) provided directory
105 * bound to.
106 *
107 */
b3b6e05e 108 virtual void* get_hint_by_dir(std::string_view dirname) const = 0;
9f95a23c 109
f51cf556
TL
110 /**
111 * Increments stats for a given logical level using provided fnode as a delta,
112 * Parameters:
113 * hint: logical level discriminator
114 * fnode: fnode metadata to be used as a complex delta value:
115 * (+1 file count, +file size, +all the extents)
116 *
117 */
118 void add_usage(void* hint, const bluefs_fnode_t& fnode) {
119 for (auto& e : fnode.extents) {
120 add_usage(hint, e);
121 }
122 add_usage(hint, fnode.size, true);
123 }
124 /**
125 * Decrements stats for a given logical level using provided fnode as a delta
126 * Parameters:
127 * hint: logical level discriminator
128 * fnode: fnode metadata to be used as a complex delta value:
129 * (-1 file count, -file size, -all the extents)
130 *
131 */
132 void sub_usage(void* hint, const bluefs_fnode_t& fnode) {
133 for (auto& e : fnode.extents) {
134 sub_usage(hint, e);
135 }
136 sub_usage(hint, fnode.size, true);
137 }
138 /**
139 * Increments stats for a given logical level using provided extent as a delta,
140 * Parameters:
141 * hint: logical level discriminator
142 * extent: bluefs extent to be used as a complex delta value:
143 * (.bdev determines physical location, +length)
144 *
145 */
146 virtual void add_usage(void* hint, const bluefs_extent_t& extent) = 0;
147 /**
148 * Decrements stats for a given logical level using provided extent as a delta,
149 * Parameters:
150 * hint: logical level discriminator
151 * extent: bluefs extent to be used as a complex delta value:
152 * (.bdev determines physical location, -length)
153 *
154 */
155 virtual void sub_usage(void* hint, const bluefs_extent_t& extent) = 0;
156 /**
157 * Increments files count and overall files size for a given logical level
158 * Parameters:
159 * hint: logical level discriminator
160 * fsize: delta value for file size
161 * upd_files: whether or not to increment file count
162 *
163 */
164 virtual void add_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0;
165 /**
166 * Decrements files count and overall files size for a given logical level
167 * Parameters:
168 * hint: logical level discriminator
169 * fsize: delta value for file size
170 * upd_files: whether or not to decrement file count
171 *
172 */
173 virtual void sub_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0;
174
175 /**
176 * Determines preferred physical device for the given logical level
177 * Parameters:
178 * hint: logical level discriminator
179 *
180 */
9f95a23c 181 virtual uint8_t select_prefer_bdev(void* hint) = 0;
f51cf556
TL
182 /**
183 * Builds path set for RocksDB to use
184 * Parameters:
185 * base: path's root
186 *
187 */
9f95a23c 188 virtual void get_paths(const std::string& base, paths& res) const = 0;
f51cf556
TL
189 /**
190 * Dumps VSelector's state
191 *
192 */
f67539c2 193 virtual void dump(std::ostream& sout) = 0;
20effc67
TL
194
195 /* used for sanity checking of vselector */
196 virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; }
197 virtual bool compare(BlueFSVolumeSelector* other) { return true; };
f67539c2
TL
198};
199
/**
 * State describing an allocator shared with BlueFS: the allocator itself,
 * its allocation unit, and a counter of the space BlueFS is using from it.
 */
struct bluefs_shared_alloc_context_t {
  bool need_init = false;           ///< set by set(); not cleared by reset()
  // Elaborated type specifier: refers to the Allocator class declared
  // elsewhere and keeps this struct compilable on its own.
  class Allocator* a = nullptr;     ///< shared allocator (not owned here)
  uint64_t alloc_unit = 0;          ///< allocation unit passed to set()

  std::atomic<uint64_t> bluefs_used = 0;  ///< zeroed by set()

  /// Attach allocator _a with allocation unit _au, flag the context as
  /// needing initialization and zero the usage counter.
  void set(Allocator* _a, uint64_t _au) {
    a = _a;
    alloc_unit = _au;
    need_init = true;
    bluefs_used = 0;
  }

  /// Detach the allocator and clear the allocation unit.
  /// need_init and bluefs_used are left untouched.
  void reset() {
    a = nullptr;
    alloc_unit = 0;
  }
};

219class BlueFS {
220public:
221 CephContext* cct;
11fdf7f2 222 static constexpr unsigned MAX_BDEV = 5;
7c673cae
FG
223 static constexpr unsigned BDEV_WAL = 0;
224 static constexpr unsigned BDEV_DB = 1;
225 static constexpr unsigned BDEV_SLOW = 2;
11fdf7f2
TL
226 static constexpr unsigned BDEV_NEWWAL = 3;
227 static constexpr unsigned BDEV_NEWDB = 4;
7c673cae
FG
228
229 enum {
230 WRITER_UNKNOWN,
231 WRITER_WAL,
232 WRITER_SST,
233 };
234
235 struct File : public RefCountedObject {
236 MEMPOOL_CLASS_HELPERS();
237
238 bluefs_fnode_t fnode;
239 int refs;
240 uint64_t dirty_seq;
241 bool locked;
242 bool deleted;
522d829b 243 bool is_dirty;
7c673cae
FG
244 boost::intrusive::list_member_hook<> dirty_item;
245
246 std::atomic_int num_readers, num_writers;
247 std::atomic_int num_reading;
248
9f95a23c 249 void* vselector_hint = nullptr;
20effc67
TL
250 /* lock protects fnode and other the parts that can be modified during read & write operations.
251 Does not protect values that are fixed
252 Does not need to be taken when doing one-time operations:
253 _replay, device_migrate_to_existing, device_migrate_to_new */
254 ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock");
9f95a23c
TL
255
256 private:
257 FRIEND_MAKE_REF(File);
7c673cae 258 File()
9f95a23c 259 :
7c673cae
FG
260 refs(0),
261 dirty_seq(0),
262 locked(false),
263 deleted(false),
522d829b 264 is_dirty(false),
7c673cae
FG
265 num_readers(0),
266 num_writers(0),
9f95a23c
TL
267 num_reading(0),
268 vselector_hint(nullptr)
7c673cae
FG
269 {}
270 ~File() override {
11fdf7f2
TL
271 ceph_assert(num_readers.load() == 0);
272 ceph_assert(num_writers.load() == 0);
273 ceph_assert(num_reading.load() == 0);
274 ceph_assert(!locked);
7c673cae 275 }
7c673cae 276 };
9f95a23c 277 using FileRef = ceph::ref_t<File>;
7c673cae
FG
278
279 typedef boost::intrusive::list<
280 File,
281 boost::intrusive::member_hook<
282 File,
283 boost::intrusive::list_member_hook<>,
284 &File::dirty_item> > dirty_file_list_t;
285
286 struct Dir : public RefCountedObject {
287 MEMPOOL_CLASS_HELPERS();
288
b3b6e05e 289 mempool::bluefs::map<std::string, FileRef, std::less<>> file_map;
7c673cae 290
9f95a23c
TL
291 private:
292 FRIEND_MAKE_REF(Dir);
293 Dir() = default;
7c673cae 294 };
9f95a23c 295 using DirRef = ceph::ref_t<Dir>;
7c673cae
FG
296
297 struct FileWriter {
298 MEMPOOL_CLASS_HELPERS();
299
300 FileRef file;
9f95a23c 301 uint64_t pos = 0; ///< start offset for buffer
f67539c2
TL
302 private:
303 ceph::buffer::list buffer; ///< new data to write (at end of file)
304 ceph::buffer::list tail_block; ///< existing partial block at end of file, if any
305 public:
306 unsigned get_buffer_length() const {
307 return buffer.length();
308 }
309 ceph::bufferlist flush_buffer(
310 CephContext* cct,
311 const bool partial,
312 const unsigned length,
313 const bluefs_super_t& super);
314 ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only
315 public:
7c673cae 316 int writer_type = 0; ///< WRITER_*
11fdf7f2 317 int write_hint = WRITE_LIFE_NOT_SET;
7c673cae 318
11fdf7f2 319 ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
7c673cae 320 std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
11fdf7f2 321 std::array<bool, MAX_BDEV> dirty_devs;
7c673cae
FG
322
323 FileWriter(FileRef f)
9f95a23c 324 : file(std::move(f)),
f67539c2
TL
325 buffer_appender(buffer.get_page_aligned_appender(
326 g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
7c673cae
FG
327 ++file->num_writers;
328 iocv.fill(nullptr);
11fdf7f2 329 dirty_devs.fill(false);
9f95a23c 330 if (file->fnode.ino == 1) {
11fdf7f2
TL
331 write_hint = WRITE_LIFE_MEDIUM;
332 }
7c673cae
FG
333 }
334 // NOTE: caller must call BlueFS::close_writer()
335 ~FileWriter() {
336 --file->num_writers;
337 }
338
339 // note: BlueRocksEnv uses this append exclusively, so it's safe
20effc67 340 // to use buffer_appender exclusively here (e.g., its notion of
7c673cae
FG
341 // offset will remain accurate).
342 void append(const char *buf, size_t len) {
f67539c2 343 uint64_t l0 = get_buffer_length();
cd265ab1 344 ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
7c673cae
FG
345 buffer_appender.append(buf, len);
346 }
347
20effc67
TL
348 void append(const std::byte *buf, size_t len) {
349 // allow callers to use byte type instead of char* as we simply pass byte array
350 append((const char*)buf, len);
351 }
352
7c673cae 353 // note: used internally only, for ino 1 or 0.
cd265ab1 354 void append(ceph::buffer::list& bl) {
f67539c2 355 uint64_t l0 = get_buffer_length();
cd265ab1 356 ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max());
7c673cae
FG
357 buffer.claim_append(bl);
358 }
359
f67539c2
TL
360 void append_zero(size_t len) {
361 uint64_t l0 = get_buffer_length();
362 ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
363 buffer_appender.append_zero(len);
364 }
365
7c673cae 366 uint64_t get_effective_write_pos() {
7c673cae
FG
367 return pos + buffer.length();
368 }
369 };
370
371 struct FileReaderBuffer {
372 MEMPOOL_CLASS_HELPERS();
373
9f95a23c 374 uint64_t bl_off = 0; ///< prefetch buffer logical offset
f67539c2 375 ceph::buffer::list bl; ///< prefetch buffer
9f95a23c 376 uint64_t pos = 0; ///< current logical offset
7c673cae
FG
377 uint64_t max_prefetch; ///< max allowed prefetch
378
379 explicit FileReaderBuffer(uint64_t mpf)
9f95a23c 380 : max_prefetch(mpf) {}
7c673cae 381
9f95a23c 382 uint64_t get_buf_end() const {
7c673cae
FG
383 return bl_off + bl.length();
384 }
9f95a23c 385 uint64_t get_buf_remaining(uint64_t p) const {
7c673cae
FG
386 if (p >= bl_off && p < bl_off + bl.length())
387 return bl_off + bl.length() - p;
388 return 0;
389 }
390
391 void skip(size_t n) {
392 pos += n;
393 }
f67539c2
TL
394
395 // For the sake of simplicity, we invalidate completed rather than
396 // for the provided extent
397 void invalidate_cache(uint64_t offset, uint64_t length) {
398 if (offset >= bl_off && offset < get_buf_end()) {
399 bl.clear();
400 bl_off = 0;
401 }
7c673cae
FG
402 }
403 };
404
405 struct FileReader {
406 MEMPOOL_CLASS_HELPERS();
407
408 FileRef file;
409 FileReaderBuffer buf;
410 bool random;
411 bool ignore_eof; ///< used when reading our log file
412
494da23a
TL
413 ceph::shared_mutex lock {
414 ceph::make_shared_mutex(std::string(), false, false, false)
415 };
416
417
7c673cae
FG
418 FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
419 : file(f),
420 buf(mpf),
421 random(rand),
422 ignore_eof(ie) {
423 ++file->num_readers;
424 }
425 ~FileReader() {
426 --file->num_readers;
427 }
428 };
429
430 struct FileLock {
431 MEMPOOL_CLASS_HELPERS();
432
433 FileRef file;
9f95a23c 434 explicit FileLock(FileRef f) : file(std::move(f)) {}
7c673cae
FG
435 };
436
437private:
7c673cae
FG
438 PerfCounters *logger = nullptr;
439
11fdf7f2
TL
440 uint64_t max_bytes[MAX_BDEV] = {0};
441 uint64_t max_bytes_pcounters[MAX_BDEV] = {
442 l_bluefs_max_bytes_wal,
443 l_bluefs_max_bytes_db,
444 l_bluefs_max_bytes_slow,
39ae355f
TL
445 l_bluefs_max_bytes_wal,
446 l_bluefs_max_bytes_db,
11fdf7f2
TL
447 };
448
f51cf556
TL
449 ceph::timespan max_alloc_lat[MAX_BDEV] = {ceph::make_timespan(0)};
450
7c673cae 451 // cache
20effc67
TL
452 struct {
453 ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock");
454 mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map; ///< dirname -> Dir
455 mempool::bluefs::unordered_map<uint64_t, FileRef> file_map; ///< ino -> File
456 } nodes;
7c673cae
FG
457
458 bluefs_super_t super; ///< latest superblock (as last written)
459 uint64_t ino_last = 0; ///< last assigned ino (this one is in use)
7c673cae 460
20effc67
TL
461 struct {
462 ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock");
463 uint64_t seq_live = 1; //seq that log is currently writing to; mirrors dirty.seq_live
464 FileWriter *writer = 0;
465 bluefs_transaction_t t;
466 } log;
467
468 struct {
469 ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock");
470 uint64_t seq_stable = 0; //seq that is now stable on disk
471 uint64_t seq_live = 1; //seq that is ongoing and dirty files will be written to
472 // map of dirty files, files of same dirty_seq are grouped into list.
473 std::map<uint64_t, dirty_file_list_t> files;
474 std::vector<interval_set<uint64_t>> pending_release; ///< extents to release
475 // TODO: it should be examined what makes pending_release immune to
476 // eras in a way similar to dirty_files. Hints:
477 // 1) we have actually only 2 eras: log_seq and log_seq+1
478 // 2) we usually not remove extents from files. And when we do, we force log-syncing.
479 } dirty;
480
481 ceph::condition_variable log_cond; ///< used for state control between log flush / log compaction
482 std::atomic<bool> log_is_compacting{false}; ///< signals that bluefs log is already ongoing compaction
483 std::atomic<bool> log_forbidden_to_expand{false}; ///< used to signal that async compaction is in state
484 /// that prohibits expansion of bluefs log
7c673cae
FG
485 /*
486 * There are up to 3 block devices:
487 *
488 * BDEV_DB db/ - the primary db device
489 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
490 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
491 */
f67539c2
TL
492 std::vector<BlockDevice*> bdev; ///< block devices we can use
493 std::vector<IOContext*> ioc; ///< IOContexts for bdevs
494 std::vector<uint64_t> block_reserved; ///< starting reserve extent per device
495 std::vector<Allocator*> alloc; ///< allocators for bdevs
496 std::vector<uint64_t> alloc_size; ///< alloc size for each device
20effc67 497
f67539c2 498 //std::vector<interval_set<uint64_t>> block_unused_too_granular;
7c673cae 499
11fdf7f2
TL
500 BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
501
9f95a23c 502 std::unique_ptr<BlueFSVolumeSelector> vselector;
11fdf7f2 503
f67539c2
TL
504 bluefs_shared_alloc_context_t* shared_alloc = nullptr;
505 unsigned shared_alloc_id = unsigned(-1);
506 inline bool is_shared_alloc(unsigned id) const {
507 return id == shared_alloc_id;
508 }
39ae355f 509 std::atomic<int64_t> cooldown_deadline = 0;
f67539c2 510
eafe8130
TL
511 class SocketHook;
512 SocketHook* asok_hook = nullptr;
cd265ab1
TL
513 // used to trigger zeros into read (debug / verify)
514 std::atomic<uint64_t> inject_read_zeros{0};
eafe8130 515
7c673cae
FG
516 void _init_logger();
517 void _shutdown_logger();
518 void _update_logger_stats();
519
520 void _init_alloc();
521 void _stop_alloc();
522
39ae355f
TL
523 ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros
524 void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0);
f67539c2
TL
525
526 uint64_t _get_used(unsigned id) const;
527 uint64_t _get_total(unsigned id) const;
528
7c673cae
FG
529
530 FileRef _get_file(uint64_t ino);
20effc67 531 void _drop_link_D(FileRef f);
7c673cae 532
1911f103
TL
533 unsigned _get_slow_device_id() {
534 return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
535 }
eafe8130 536 const char* get_device_name(unsigned id);
f51cf556
TL
537
538 typedef std::function<void(const bluefs_extent_t)> update_fn_t;
539 void _update_allocate_stats(uint8_t id, const ceph::timespan& d);
7c673cae 540 int _allocate(uint8_t bdev, uint64_t len,
39ae355f
TL
541 uint64_t alloc_unit,
542 bluefs_fnode_t* node,
f51cf556 543 update_fn_t cb = nullptr,
39ae355f
TL
544 size_t alloc_attempts = 0,
545 bool permit_dev_fallback = true);
11fdf7f2 546
522d829b 547 /* signal replay log to include h->file in nearest log flush */
20effc67
TL
548 int _signal_dirty_to_log_D(FileWriter *h);
549 int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length);
550 int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered);
551 int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr);
552 uint64_t _flush_special(FileWriter *h);
7c673cae 553
11fdf7f2 554#ifdef HAVE_LIBAIO
f67539c2 555 void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls);
20effc67 556 void _wait_for_aio(FileWriter *h); // safe to call without a lock
11fdf7f2 557#endif
7c673cae 558
20effc67 559 int64_t _maybe_extend_log();
f51cf556 560 void _extend_log(uint64_t amount);
20effc67
TL
561 uint64_t _log_advance_seq();
562 void _consume_dirty(uint64_t seq);
563 void _clear_dirty_set_stable_D(uint64_t seq_stable);
564 void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release);
565
f51cf556
TL
566 void _flush_and_sync_log_core();
567 int _flush_and_sync_log_jump_D(uint64_t jump_to);
20effc67
TL
568 int _flush_and_sync_log_LD(uint64_t want_seq = 0);
569
39ae355f
TL
570 uint64_t _estimate_transaction_size(bluefs_transaction_t* t);
571 uint64_t _make_initial_transaction(uint64_t start_seq,
572 bluefs_fnode_t& fnode,
573 uint64_t expected_final_size,
574 bufferlist* out);
20effc67
TL
575 uint64_t _estimate_log_size_N();
576 bool _should_start_compact_log_L_N();
11fdf7f2
TL
577
578 enum {
579 REMOVE_DB = 1,
580 REMOVE_WAL = 2,
581 RENAME_SLOW2DB = 4,
582 RENAME_DB2SLOW = 8,
583 };
39ae355f
TL
584 void _compact_log_dump_metadata_NF(uint64_t start_seq,
585 bluefs_transaction_t *t,
586 int flags,
587 uint64_t capture_before_seq);
11fdf7f2 588
20effc67
TL
589 void _compact_log_sync_LNF_LD();
590 void _compact_log_async_LD_LNF_D();
591
39ae355f 592 void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
9f95a23c
TL
593 int super_dev,
594 int log_dev,
595 int new_log_dev,
596 int flags,
597 std::optional<bluefs_layout_t> layout);
7c673cae
FG
598
599 //void _aio_finish(void *priv);
600
39ae355f 601 void _flush_bdev(FileWriter *h, bool check_mutex_locked = true);
20effc67
TL
602 void _flush_bdev(); // this is safe to call without a lock
603 void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock
7c673cae
FG
604
605 int _preallocate(FileRef f, uint64_t off, uint64_t len);
606 int _truncate(FileWriter *h, uint64_t off);
607
adb31ebb 608 int64_t _read(
7c673cae 609 FileReader *h, ///< [in] read from here
7c673cae
FG
610 uint64_t offset, ///< [in] offset
611 size_t len, ///< [in] this many bytes
f67539c2 612 ceph::buffer::list *outbl, ///< [out] optional: reference the result here
7c673cae 613 char *out); ///< [out] optional: or copy it here
adb31ebb 614 int64_t _read_random(
7c673cae
FG
615 FileReader *h, ///< [in] read from here
616 uint64_t offset, ///< [in] offset
9f95a23c 617 uint64_t len, ///< [in] this many bytes
7c673cae
FG
618 char *out); ///< [out] optional: or copy it here
619
7c673cae 620 int _open_super();
11fdf7f2 621 int _write_super(int dev);
20effc67
TL
622 int _check_allocations(const bluefs_fnode_t& fnode,
623 boost::dynamic_bitset<uint64_t>* used_blocks,
624 bool is_alloc, //true when allocating, false when deallocating
625 const char* op_name);
9f95a23c
TL
626 int _verify_alloc_granularity(
627 __u8 id, uint64_t offset, uint64_t length,
39ae355f 628 uint64_t alloc_unit,
9f95a23c 629 const char *op);
11fdf7f2 630 int _replay(bool noop, bool to_stdout = false); ///< replay journal
7c673cae
FG
631
632 FileWriter *_create_writer(FileRef f);
20effc67 633 void _drain_writer(FileWriter *h);
7c673cae
FG
634 void _close_writer(FileWriter *h);
635
636 // always put the super in the second 4k block. FIXME should this be
637 // block size independent?
638 unsigned get_super_offset() {
639 return 4096;
640 }
641 unsigned get_super_length() {
642 return 4096;
643 }
20effc67
TL
644 void _maybe_check_vselector_LNF() {
645 if (cct->_conf->bluefs_check_volume_selector_often) {
646 _check_vselector_LNF();
647 }
648 }
7c673cae
FG
649public:
650 BlueFS(CephContext* cct);
651 ~BlueFS();
652
653 // the super is always stored on bdev 0
9f95a23c 654 int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
7c673cae 655 int mount();
9f95a23c 656 int maybe_verify_layout(const bluefs_layout_t& layout) const;
1911f103 657 void umount(bool avoid_compact = false);
9f95a23c 658 int prepare_new_device(int id, const bluefs_layout_t& layout);
11fdf7f2
TL
659
660 int log_dump();
7c673cae 661
f67539c2
TL
662 void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id);
663 void get_devices(std::set<std::string> *ls);
eafe8130
TL
664 uint64_t get_alloc_size(int id) {
665 return alloc_size[id];
666 }
7c673cae
FG
667 int fsck();
668
11fdf7f2
TL
669 int device_migrate_to_new(
670 CephContext *cct,
f67539c2 671 const std::set<int>& devs_source,
9f95a23c
TL
672 int dev_target,
673 const bluefs_layout_t& layout);
11fdf7f2
TL
674 int device_migrate_to_existing(
675 CephContext *cct,
f67539c2 676 const std::set<int>& devs_source,
9f95a23c
TL
677 int dev_target,
678 const bluefs_layout_t& layout);
11fdf7f2
TL
679
680 uint64_t get_used();
7c673cae
FG
681 uint64_t get_total(unsigned id);
682 uint64_t get_free(unsigned id);
f67539c2
TL
683 uint64_t get_used(unsigned id);
684 void dump_perf_counters(ceph::Formatter *f);
7c673cae 685
f67539c2 686 void dump_block_extents(std::ostream& out);
3efd9988 687
7c673cae 688 /// get current extents that we own for given block device
1e59de90
TL
689 void foreach_block_extents(
690 unsigned id,
691 std::function<void(uint64_t, uint32_t)> cb);
7c673cae
FG
692
693 int open_for_write(
b3b6e05e
TL
694 std::string_view dir,
695 std::string_view file,
7c673cae
FG
696 FileWriter **h,
697 bool overwrite);
698
699 int open_for_read(
b3b6e05e
TL
700 std::string_view dir,
701 std::string_view file,
7c673cae
FG
702 FileReader **h,
703 bool random = false);
704
20effc67
TL
705 // data added after last fsync() is lost
706 void close_writer(FileWriter *h);
7c673cae 707
b3b6e05e
TL
708 int rename(std::string_view old_dir, std::string_view old_file,
709 std::string_view new_dir, std::string_view new_file);
7c673cae 710
b3b6e05e 711 int readdir(std::string_view dirname, std::vector<std::string> *ls);
7c673cae 712
b3b6e05e
TL
713 int unlink(std::string_view dirname, std::string_view filename);
714 int mkdir(std::string_view dirname);
715 int rmdir(std::string_view dirname);
d2e6a577 716 bool wal_is_rotational();
1d09f67e 717 bool db_is_rotational();
7c673cae 718
b3b6e05e
TL
719 bool dir_exists(std::string_view dirname);
720 int stat(std::string_view dirname, std::string_view filename,
7c673cae
FG
721 uint64_t *size, utime_t *mtime);
722
b3b6e05e 723 int lock_file(std::string_view dirname, std::string_view filename, FileLock **p);
7c673cae
FG
724 int unlock_file(FileLock *l);
725
7c673cae
FG
726 void compact_log();
727
728 /// sync any uncommitted state to disk
1911f103 729 void sync_metadata(bool avoid_compact);
7c673cae 730
9f95a23c
TL
731 void set_volume_selector(BlueFSVolumeSelector* s) {
732 vselector.reset(s);
733 }
f67539c2 734 void dump_volume_selector(std::ostream& sout) {
9f95a23c
TL
735 vselector->dump(sout);
736 }
737 void get_vselector_paths(const std::string& base,
738 BlueFSVolumeSelector::paths& res) const {
739 return vselector->get_paths(base, res);
740 }
741
f67539c2 742 int add_block_device(unsigned bdev, const std::string& path, bool trim,
f67539c2 743 bluefs_shared_alloc_context_t* _shared_alloc = nullptr);
7c673cae 744 bool bdev_support_label(unsigned id);
f67539c2 745 uint64_t get_block_device_size(unsigned bdev) const;
7c673cae 746
11fdf7f2
TL
747 // handler for discard event
748 void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
749
20effc67 750 void flush(FileWriter *h, bool force = false);
cd265ab1 751
20effc67
TL
752 void append_try_flush(FileWriter *h, const char* buf, size_t len);
753 void flush_range(FileWriter *h, uint64_t offset, uint64_t length);
754 int fsync(FileWriter *h);
f67539c2
TL
755 int64_t read(FileReader *h, uint64_t offset, size_t len,
756 ceph::buffer::list *outbl, char *out) {
7c673cae
FG
757 // no need to hold the global lock here; we only touch h and
758 // h->file, and read vs write or delete is already protected (via
759 // atomics and asserts).
f67539c2 760 return _read(h, offset, len, outbl, out);
7c673cae 761 }
adb31ebb 762 int64_t read_random(FileReader *h, uint64_t offset, size_t len,
7c673cae
FG
763 char *out) {
764 // no need to hold the global lock here; we only touch h and
765 // h->file, and read vs write or delete is already protected (via
766 // atomics and asserts).
767 return _read_random(h, offset, len, out);
768 }
20effc67
TL
769 void invalidate_cache(FileRef f, uint64_t offset, uint64_t len);
770 int preallocate(FileRef f, uint64_t offset, uint64_t len);
771 int truncate(FileWriter *h, uint64_t offset);
7c673cae 772
f67539c2
TL
773 size_t probe_alloc_avail(int dev, uint64_t alloc_size);
774
9f95a23c 775 /// test purpose methods
9f95a23c
TL
776 const PerfCounters* get_perf_counters() const {
777 return logger;
778 }
522d829b
TL
779 uint64_t debug_get_dirty_seq(FileWriter *h);
780 bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev);
cd265ab1
TL
781
782private:
783 // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
784 // They are used for checking if read values are all 0, and reread if so.
20effc67 785 int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
cd265ab1 786 ceph::buffer::list *pbl, IOContext *ioc, bool buffered);
20effc67
TL
787 int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered);
788
789 int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
790 ceph::buffer::list* pbl, IOContext* ioc, bool buffered);
791 int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered);
792
793 /// test and compact log, if necessary
794 void _maybe_compact_log_LNF_NF_LD_D();
795 int _do_replay_recovery_read(FileReader *log,
796 size_t log_pos,
797 size_t read_offset,
798 size_t read_len,
799 bufferlist* bl);
800 void _check_vselector_LNF();
9f95a23c
TL
801};
802
803class OriginalVolumeSelector : public BlueFSVolumeSelector {
804 uint64_t wal_total;
805 uint64_t db_total;
806 uint64_t slow_total;
807
808public:
809 OriginalVolumeSelector(
810 uint64_t _wal_total,
811 uint64_t _db_total,
812 uint64_t _slow_total)
813 : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}
814
f6b5b4d7 815 void* get_hint_for_log() const override;
b3b6e05e 816 void* get_hint_by_dir(std::string_view dirname) const override;
9f95a23c 817
f51cf556 818 void add_usage(void* hint, const bluefs_extent_t& extent) override {
9f95a23c
TL
819 // do nothing
820 return;
821 }
f51cf556 822 void sub_usage(void* hint, const bluefs_extent_t& extent) override {
9f95a23c
TL
823 // do nothing
824 return;
825 }
f51cf556 826 void add_usage(void*, uint64_t, bool) override {
9f95a23c
TL
827 // do nothing
828 return;
829 }
f51cf556 830 void sub_usage(void*, uint64_t, bool) override {
9f95a23c
TL
831 // do nothing
832 return;
833 }
834
835 uint8_t select_prefer_bdev(void* hint) override;
836 void get_paths(const std::string& base, paths& res) const override;
f67539c2
TL
837 void dump(std::ostream& sout) override;
838};
839
840class FitToFastVolumeSelector : public OriginalVolumeSelector {
841public:
842 FitToFastVolumeSelector(
843 uint64_t _wal_total,
844 uint64_t _db_total,
845 uint64_t _slow_total)
846 : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {}
847
848 void get_paths(const std::string& base, paths& res) const override;
7c673cae 849};
/**
 * Directed graph of locks.
 * Vertices - locks. Edges (directed) - locking progression.
 * Edge A->B exists if the last taken lock was A and the next taken lock is B.
 *
 * Row represents last lock taken.
 * Column represents next lock taken.
 *
 *     >        | W | L | N | D | F
 * -------------|---|---|---|---|---
 * FileWriter W |   | > | > | > | >
 * log        L |   |   | > | > | >
 * nodes      N |   |   |   | > | >
 * dirty      D |   |   |   |   | >
 * File       F |   |   |   |   |
 *
 * Claim: Deadlock is possible IFF graph contains cycles.
 */
7c673cae 868#endif