1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4 #define CEPH_OS_BLUESTORE_BLUEFS_H
10 #include "bluefs_types.h"
11 #include "blk/BlockDevice.h"
13 #include "common/RefCountedObj.h"
14 #include "common/ceph_context.h"
15 #include "global/global_context.h"
16 #include "include/common_fwd.h"
18 #include "boost/intrusive/list.hpp"
19 #include "boost/dynamic_bitset.hpp"
// Perf-counter indices for BlueFS statistics. These appear to be enumerators
// of a perf-counter enum (the enum's opening/closing lines are not visible in
// this chunk, and the embedded original line numbers show gaps) — values are
// assigned implicitly after l_bluefs_first, so do NOT reorder or renumber.
24 l_bluefs_first
= 732600,
// per-device capacity / usage gauges (db, wal, slow)
25 l_bluefs_db_total_bytes
,
26 l_bluefs_db_used_bytes
,
27 l_bluefs_wal_total_bytes
,
28 l_bluefs_wal_used_bytes
,
29 l_bluefs_slow_total_bytes
,
30 l_bluefs_slow_used_bytes
,
// journal / write activity counters
33 l_bluefs_log_compactions
,
34 l_bluefs_logged_bytes
,
35 l_bluefs_files_written_wal
,
36 l_bluefs_files_written_sst
,
37 l_bluefs_bytes_written_wal
,
38 l_bluefs_bytes_written_sst
,
39 l_bluefs_bytes_written_slow
,
// high-water marks per device class (fed from max_bytes_pcounters below)
40 l_bluefs_max_bytes_wal
,
41 l_bluefs_max_bytes_db
,
42 l_bluefs_max_bytes_slow
,
// random-read statistics: totals, disk-served, and buffer/cache-served
43 l_bluefs_read_random_count
,
44 l_bluefs_read_random_bytes
,
45 l_bluefs_read_random_disk_count
,
46 l_bluefs_read_random_disk_bytes
,
47 l_bluefs_read_random_buffer_count
,
48 l_bluefs_read_random_buffer_bytes
,
// prefetched (sequential) read statistics
51 l_bluefs_read_prefetch_count
,
52 l_bluefs_read_prefetch_bytes
,
// all-zero read detection (debug/verify; see inject_read_zeros member)
53 l_bluefs_read_zeros_candidate
,
54 l_bluefs_read_zeros_errors
,
// Abstract policy interface deciding which block device (wal/db/slow) backs
// a given BlueFS file, and tracking per-device space usage. Concrete
// implementations (e.g. OriginalVolumeSelector below) provide the policy.
// NOTE(review): the class's access-specifier and closing lines are not
// visible in this chunk.
59 class BlueFSVolumeSelector
{
// (path, size) pairs as reported by get_paths()
61 typedef std::vector
<std::pair
<std::string
, uint64_t>> paths
;
63 virtual ~BlueFSVolumeSelector() {
// placement hint for the BlueFS journal/log file
65 virtual void* get_hint_for_log() const = 0;
// placement hint derived from the directory a file lives in
66 virtual void* get_hint_by_dir(std::string_view dirname
) const = 0;
// account an entire fnode's extents for the hinted file
68 virtual void add_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
69 virtual void sub_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
// account a raw byte delta for the hinted file
70 virtual void add_usage(void* file_hint
, uint64_t fsize
) = 0;
71 virtual void sub_usage(void* file_hint
, uint64_t fsize
) = 0;
// pick the preferred bdev id for a new allocation with this hint
72 virtual uint8_t select_prefer_bdev(void* hint
) = 0;
// report the selector's (path, size) pairs, rooted at `base`
73 virtual void get_paths(const std::string
& base
, paths
& res
) const = 0;
74 virtual void dump(std::ostream
& sout
) = 0;
// Allocation context shared between BlueFS and its user when both carve
// space out of the same device/allocator (see is_shared_alloc() and the
// shared_alloc member of BlueFS below).
77 struct bluefs_shared_alloc_context_t
{
78 bool need_init
= false;
// borrowed allocator pointer — presumably not owned here; TODO confirm
79 Allocator
* a
= nullptr;
// bytes of the shared device currently consumed by BlueFS
81 std::atomic
<uint64_t> bluefs_used
= 0;
// bind this context to an allocator (body not visible in this chunk)
83 void set(Allocator
* _a
) {
// Logical block-device slots used throughout BlueFS. BDEV_NEWWAL/BDEV_NEWDB
// are extra slots — presumably used while migrating to replacement devices
// (see prepare_new_device()/device_migrate_* below) — TODO confirm.
96 static constexpr unsigned MAX_BDEV
= 5;
97 static constexpr unsigned BDEV_WAL
= 0;
98 static constexpr unsigned BDEV_DB
= 1;
99 static constexpr unsigned BDEV_SLOW
= 2;
100 static constexpr unsigned BDEV_NEWWAL
= 3;
101 static constexpr unsigned BDEV_NEWDB
= 4;
// Ref-counted in-memory representation of a single BlueFS file.
109 struct File
: public RefCountedObject
{
110 MEMPOOL_CLASS_HELPERS();
// on-disk metadata for this file (ino, extents, ...)
112 bluefs_fnode_t fnode
;
// hook for membership in a dirty_file_list_t (declared just below)
117 boost::intrusive::list_member_hook
<> dirty_item
;
// open-handle accounting; asserted to be zero in the destructor fragment
119 std::atomic_int num_readers
, num_writers
;
120 std::atomic_int num_reading
;
// opaque per-file hint owned/interpreted by the volume selector
122 void* vselector_hint
= nullptr;
125 FRIEND_MAKE_REF(File
);
// NOTE(review): constructor initializer-list and destructor fragments
// follow; the intervening lines are missing from this chunk.
135 vselector_hint(nullptr)
138 ceph_assert(num_readers
.load() == 0);
139 ceph_assert(num_writers
.load() == 0);
140 ceph_assert(num_reading
.load() == 0);
141 ceph_assert(!locked
);
// intrusive-refcount smart pointer to a File
144 using FileRef
= ceph::ref_t
<File
>;
146 typedef boost::intrusive::list
<
148 boost::intrusive::member_hook
<
150 boost::intrusive::list_member_hook
<>,
151 &File::dirty_item
> > dirty_file_list_t
;
153 struct Dir
: public RefCountedObject
{
154 MEMPOOL_CLASS_HELPERS();
156 mempool::bluefs::map
<std::string
, FileRef
, std::less
<>> file_map
;
159 FRIEND_MAKE_REF(Dir
);
162 using DirRef
= ceph::ref_t
<Dir
>;
165 MEMPOOL_CLASS_HELPERS();
168 uint64_t pos
= 0; ///< start offset for buffer
170 ceph::buffer::list buffer
; ///< new data to write (at end of file)
171 ceph::buffer::list tail_block
; ///< existing partial block at end of file, if any
173 unsigned get_buffer_length() const {
174 return buffer
.length();
176 ceph::bufferlist
flush_buffer(
179 const unsigned length
,
180 const bluefs_super_t
& super
);
181 ceph::buffer::list::page_aligned_appender buffer_appender
; //< for const char* only
183 int writer_type
= 0; ///< WRITER_*
184 int write_hint
= WRITE_LIFE_NOT_SET
;
186 ceph::mutex lock
= ceph::make_mutex("BlueFS::FileWriter::lock");
187 std::array
<IOContext
*,MAX_BDEV
> iocv
; ///< for each bdev
188 std::array
<bool, MAX_BDEV
> dirty_devs
;
190 FileWriter(FileRef f
)
191 : file(std::move(f
)),
192 buffer_appender(buffer
.get_page_aligned_appender(
193 g_conf()->bluefs_alloc_size
/ CEPH_PAGE_SIZE
)) {
196 dirty_devs
.fill(false);
197 if (file
->fnode
.ino
== 1) {
198 write_hint
= WRITE_LIFE_MEDIUM
;
201 // NOTE: caller must call BlueFS::close_writer()
206 // note: BlueRocksEnv uses this append exclusively, so it's safe
207 // to use buffer_appender exclusively here (e.g., it's notion of
208 // offset will remain accurate).
209 void append(const char *buf
, size_t len
) {
210 uint64_t l0
= get_buffer_length();
211 ceph_assert(l0
+ len
<= std::numeric_limits
<unsigned>::max());
212 buffer_appender
.append(buf
, len
);
215 // note: used internally only, for ino 1 or 0.
216 void append(ceph::buffer::list
& bl
) {
217 uint64_t l0
= get_buffer_length();
218 ceph_assert(l0
+ bl
.length() <= std::numeric_limits
<unsigned>::max());
219 buffer
.claim_append(bl
);
222 void append_zero(size_t len
) {
223 uint64_t l0
= get_buffer_length();
224 ceph_assert(l0
+ len
<= std::numeric_limits
<unsigned>::max());
225 buffer_appender
.append_zero(len
);
228 uint64_t get_effective_write_pos() {
229 return pos
+ buffer
.length();
// Read-ahead buffer backing a FileReader: caches the file byte range
// [bl_off, bl_off + bl.length()) in `bl`.
233 struct FileReaderBuffer
{
234 MEMPOOL_CLASS_HELPERS();
236 uint64_t bl_off
= 0; ///< prefetch buffer logical offset
237 ceph::buffer::list bl
; ///< prefetch buffer
238 uint64_t pos
= 0; ///< current logical offset
239 uint64_t max_prefetch
; ///< max allowed prefetch
241 explicit FileReaderBuffer(uint64_t mpf
)
242 : max_prefetch(mpf
) {}
// logical offset one past the end of the cached data
244 uint64_t get_buf_end() const {
245 return bl_off
+ bl
.length();
// bytes of cached data available at logical offset p (0 if p is outside
// the cached range — tail of the function not visible in this chunk)
247 uint64_t get_buf_remaining(uint64_t p
) const {
248 if (p
>= bl_off
&& p
< bl_off
+ bl
.length())
249 return bl_off
+ bl
.length() - p
;
// advance the current logical offset (body not visible in this chunk)
253 void skip(size_t n
) {
257 // For the sake of simplicity, we invalidate completed rather than
258 // for the provided extent
259 void invalidate_cache(uint64_t offset
, uint64_t length
) {
// only act when the invalidated offset overlaps the cached range
260 if (offset
>= bl_off
&& offset
< get_buf_end()) {
268 MEMPOOL_CLASS_HELPERS();
271 FileReaderBuffer buf
;
273 bool ignore_eof
; ///< used when reading our log file
275 ceph::shared_mutex lock
{
276 ceph::make_shared_mutex(std::string(), false, false, false)
280 FileReader(FileRef f
, uint64_t mpf
, bool rand
, bool ie
)
293 MEMPOOL_CLASS_HELPERS();
296 explicit FileLock(FileRef f
) : file(std::move(f
)) {}
300 ceph::mutex lock
= ceph::make_mutex("BlueFS::lock");
302 PerfCounters
*logger
= nullptr;
304 uint64_t max_bytes
[MAX_BDEV
] = {0};
305 uint64_t max_bytes_pcounters
[MAX_BDEV
] = {
306 l_bluefs_max_bytes_wal
,
307 l_bluefs_max_bytes_db
,
308 l_bluefs_max_bytes_slow
,
312 mempool::bluefs::map
<std::string
, DirRef
, std::less
<>> dir_map
; ///< dirname -> Dir
313 mempool::bluefs::unordered_map
<uint64_t, FileRef
> file_map
; ///< ino -> File
315 // map of dirty files, files of same dirty_seq are grouped into list.
316 std::map
<uint64_t, dirty_file_list_t
> dirty_files
;
318 bluefs_super_t super
; ///< latest superblock (as last written)
319 uint64_t ino_last
= 0; ///< last assigned ino (this one is in use)
320 uint64_t log_seq
= 0; ///< last used log seq (by current pending log_t)
321 uint64_t log_seq_stable
= 0; ///< last stable/synced log seq
322 FileWriter
*log_writer
= 0; ///< writer for the log
323 bluefs_transaction_t log_t
; ///< pending, unwritten log transaction
324 bool log_flushing
= false; ///< true while flushing the log
325 ceph::condition_variable log_cond
;
327 uint64_t new_log_jump_to
= 0;
328 uint64_t old_log_jump_to
= 0;
329 FileRef new_log
= nullptr;
330 FileWriter
*new_log_writer
= nullptr;
333 * There are up to 3 block devices:
335 * BDEV_DB db/ - the primary db device
336 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
337 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
339 std::vector
<BlockDevice
*> bdev
; ///< block devices we can use
340 std::vector
<IOContext
*> ioc
; ///< IOContexts for bdevs
341 std::vector
<uint64_t> block_reserved
; ///< starting reserve extent per device
342 std::vector
<Allocator
*> alloc
; ///< allocators for bdevs
343 std::vector
<uint64_t> alloc_size
; ///< alloc size for each device
344 std::vector
<interval_set
<uint64_t>> pending_release
; ///< extents to release
345 //std::vector<interval_set<uint64_t>> block_unused_too_granular;
347 BlockDevice::aio_callback_t discard_cb
[3]; //discard callbacks for each dev
349 std::unique_ptr
<BlueFSVolumeSelector
> vselector
;
351 bluefs_shared_alloc_context_t
* shared_alloc
= nullptr;
352 unsigned shared_alloc_id
= unsigned(-1);
353 inline bool is_shared_alloc(unsigned id
) const {
354 return id
== shared_alloc_id
;
358 SocketHook
* asok_hook
= nullptr;
359 // used to trigger zeros into read (debug / verify)
360 std::atomic
<uint64_t> inject_read_zeros
{0};
363 void _shutdown_logger();
364 void _update_logger_stats();
369 void _pad_bl(ceph::buffer::list
& bl
); ///< pad ceph::buffer::list to block size w/ zeros
371 uint64_t _get_used(unsigned id
) const;
372 uint64_t _get_total(unsigned id
) const;
375 FileRef
_get_file(uint64_t ino
);
376 void _drop_link(FileRef f
);
378 unsigned _get_slow_device_id() {
379 return bdev
[BDEV_SLOW
] ? BDEV_SLOW
: BDEV_DB
;
381 const char* get_device_name(unsigned id
);
382 int _allocate(uint8_t bdev
, uint64_t len
,
383 bluefs_fnode_t
* node
);
384 int _allocate_without_fallback(uint8_t id
, uint64_t len
,
385 PExtentVector
* extents
);
387 int _flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
);
388 int _flush(FileWriter
*h
, bool force
, std::unique_lock
<ceph::mutex
>& l
);
389 int _flush(FileWriter
*h
, bool force
, bool *flushed
= nullptr);
390 int _fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
);
393 void _claim_completed_aios(FileWriter
*h
, std::list
<aio_t
> *ls
);
394 void wait_for_aio(FileWriter
*h
); // safe to call without a lock
397 int _flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
398 uint64_t want_seq
= 0,
399 uint64_t jump_to
= 0);
400 uint64_t _estimate_log_size();
401 bool _should_compact_log();
409 void _compact_log_dump_metadata(bluefs_transaction_t
*t
,
411 void _compact_log_sync();
412 void _compact_log_async(std::unique_lock
<ceph::mutex
>& l
);
414 void _rewrite_log_and_layout_sync(bool allocate_with_fallback
,
419 std::optional
<bluefs_layout_t
> layout
);
421 //void _aio_finish(void *priv);
423 void _flush_bdev_safely(FileWriter
*h
);
424 void flush_bdev(); // this is safe to call without a lock
425 void flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
); // this is safe to call without a lock
427 int _preallocate(FileRef f
, uint64_t off
, uint64_t len
);
428 int _truncate(FileWriter
*h
, uint64_t off
);
431 FileReader
*h
, ///< [in] read from here
432 uint64_t offset
, ///< [in] offset
433 size_t len
, ///< [in] this many bytes
434 ceph::buffer::list
*outbl
, ///< [out] optional: reference the result here
435 char *out
); ///< [out] optional: or copy it here
436 int64_t _read_random(
437 FileReader
*h
, ///< [in] read from here
438 uint64_t offset
, ///< [in] offset
439 uint64_t len
, ///< [in] this many bytes
440 char *out
); ///< [out] optional: or copy it here
442 void _invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
);
445 int _write_super(int dev
);
446 int _check_new_allocations(const bluefs_fnode_t
& fnode
,
448 boost::dynamic_bitset
<uint64_t>* used_blocks
);
449 int _verify_alloc_granularity(
450 __u8 id
, uint64_t offset
, uint64_t length
,
452 int _replay(bool noop
, bool to_stdout
= false); ///< replay journal
454 FileWriter
*_create_writer(FileRef f
);
455 void _close_writer(FileWriter
*h
);
457 // always put the super in the second 4k block. FIXME should this be
458 // block size independent?
459 unsigned get_super_offset() {
462 unsigned get_super_length() {
467 BlueFS(CephContext
* cct
);
470 // the super is always stored on bdev 0
471 int mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
);
473 int maybe_verify_layout(const bluefs_layout_t
& layout
) const;
474 void umount(bool avoid_compact
= false);
475 int prepare_new_device(int id
, const bluefs_layout_t
& layout
);
479 void collect_metadata(std::map
<std::string
,std::string
> *pm
, unsigned skip_bdev_id
);
480 void get_devices(std::set
<std::string
> *ls
);
481 uint64_t get_alloc_size(int id
) {
482 return alloc_size
[id
];
486 int device_migrate_to_new(
488 const std::set
<int>& devs_source
,
490 const bluefs_layout_t
& layout
);
491 int device_migrate_to_existing(
493 const std::set
<int>& devs_source
,
495 const bluefs_layout_t
& layout
);
498 uint64_t get_total(unsigned id
);
499 uint64_t get_free(unsigned id
);
500 uint64_t get_used(unsigned id
);
501 void dump_perf_counters(ceph::Formatter
*f
);
503 void dump_block_extents(std::ostream
& out
);
505 /// get current extents that we own for given block device
506 int get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
);
509 std::string_view dir
,
510 std::string_view file
,
515 std::string_view dir
,
516 std::string_view file
,
518 bool random
= false);
520 void close_writer(FileWriter
*h
) {
521 std::lock_guard
l(lock
);
525 int rename(std::string_view old_dir
, std::string_view old_file
,
526 std::string_view new_dir
, std::string_view new_file
);
528 int readdir(std::string_view dirname
, std::vector
<std::string
> *ls
);
530 int unlink(std::string_view dirname
, std::string_view filename
);
531 int mkdir(std::string_view dirname
);
532 int rmdir(std::string_view dirname
);
533 bool wal_is_rotational();
535 bool dir_exists(std::string_view dirname
);
536 int stat(std::string_view dirname
, std::string_view filename
,
537 uint64_t *size
, utime_t
*mtime
);
539 int lock_file(std::string_view dirname
, std::string_view filename
, FileLock
**p
);
540 int unlock_file(FileLock
*l
);
544 /// sync any uncommitted state to disk
545 void sync_metadata(bool avoid_compact
);
546 /// test and compact log, if necessary
547 void _maybe_compact_log(std::unique_lock
<ceph::mutex
>& l
);
549 void set_volume_selector(BlueFSVolumeSelector
* s
) {
552 void dump_volume_selector(std::ostream
& sout
) {
553 vselector
->dump(sout
);
555 void get_vselector_paths(const std::string
& base
,
556 BlueFSVolumeSelector::paths
& res
) const {
557 return vselector
->get_paths(base
, res
);
560 int add_block_device(unsigned bdev
, const std::string
& path
, bool trim
,
562 bluefs_shared_alloc_context_t
* _shared_alloc
= nullptr);
563 bool bdev_support_label(unsigned id
);
564 uint64_t get_block_device_size(unsigned bdev
) const;
566 // handler for discard event
567 void handle_discard(unsigned dev
, interval_set
<uint64_t>& to_release
);
// Public flush entry point: takes the global BlueFS lock, then delegates
// to _flush() (which receives the lock so it may drop/reacquire it).
569 void flush(FileWriter
*h
, bool force
= false) {
570 std::unique_lock
l(lock
);
571 int r
= _flush(h
, force
, l
);
// Append `len` bytes to writer `h`, flushing as needed so the in-memory
// buffer never reaches max_size (asserted at the bottom). Presumably loops
// over the input in chunks — the loop's interior lines are missing from
// this chunk, so confirm against the full source.
575 void append_try_flush(FileWriter
*h
, const char* buf
, size_t len
) {
576 size_t max_size
= 1ull << 30; // cap to 1GB
578 bool need_flush
= true;
579 auto l0
= h
->get_buffer_length();
// take at most what still fits below the cap this iteration
581 size_t l
= std::min(len
, max_size
- l0
);
// flush once buffered data crosses the configured minimum flush size
585 need_flush
= h
->get_buffer_length() >= cct
->_conf
->bluefs_min_flush_size
;
589 // make sure we've made any progress with flush hence the
590 // loop doesn't iterate forever
591 ceph_assert(h
->get_buffer_length() < max_size
);
595 void flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
) {
596 std::lock_guard
l(lock
);
597 _flush_range(h
, offset
, length
);
599 int fsync(FileWriter
*h
) {
600 std::unique_lock
l(lock
);
601 int r
= _fsync(h
, l
);
602 _maybe_compact_log(l
);
605 int64_t read(FileReader
*h
, uint64_t offset
, size_t len
,
606 ceph::buffer::list
*outbl
, char *out
) {
607 // no need to hold the global lock here; we only touch h and
608 // h->file, and read vs write or delete is already protected (via
609 // atomics and asserts).
610 return _read(h
, offset
, len
, outbl
, out
);
612 int64_t read_random(FileReader
*h
, uint64_t offset
, size_t len
,
614 // no need to hold the global lock here; we only touch h and
615 // h->file, and read vs write or delete is already protected (via
616 // atomics and asserts).
617 return _read_random(h
, offset
, len
, out
);
619 void invalidate_cache(FileRef f
, uint64_t offset
, uint64_t len
) {
620 std::lock_guard
l(lock
);
621 _invalidate_cache(f
, offset
, len
);
623 int preallocate(FileRef f
, uint64_t offset
, uint64_t len
) {
624 std::lock_guard
l(lock
);
625 return _preallocate(f
, offset
, len
);
627 int truncate(FileWriter
*h
, uint64_t offset
) {
628 std::lock_guard
l(lock
);
629 return _truncate(h
, offset
);
631 int do_replay_recovery_read(FileReader
*log
,
637 size_t probe_alloc_avail(int dev
, uint64_t alloc_size
);
639 /// test purpose methods
640 const PerfCounters
* get_perf_counters() const {
645 // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
646 // They are used for checking if read values are all 0, and reread if so.
647 int read(uint8_t ndev
, uint64_t off
, uint64_t len
,
648 ceph::buffer::list
*pbl
, IOContext
*ioc
, bool buffered
);
649 int read_random(uint8_t ndev
, uint64_t off
, uint64_t len
, char *buf
, bool buffered
);
652 class OriginalVolumeSelector
: public BlueFSVolumeSelector
{
658 OriginalVolumeSelector(
661 uint64_t _slow_total
)
662 : wal_total(_wal_total
), db_total(_db_total
), slow_total(_slow_total
) {}
664 void* get_hint_for_log() const override
;
665 void* get_hint_by_dir(std::string_view dirname
) const override
;
667 void add_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
671 void sub_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
675 void add_usage(void* hint
, uint64_t fsize
) override
{
679 void sub_usage(void* hint
, uint64_t fsize
) override
{
684 uint8_t select_prefer_bdev(void* hint
) override
;
685 void get_paths(const std::string
& base
, paths
& res
) const override
;
686 void dump(std::ostream
& sout
) override
;
689 class FitToFastVolumeSelector
: public OriginalVolumeSelector
{
691 FitToFastVolumeSelector(
694 uint64_t _slow_total
)
695 : OriginalVolumeSelector(_wal_total
, _db_total
, _slow_total
) {}
697 void get_paths(const std::string
& base
, paths
& res
) const override
;