1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4 #define CEPH_OS_BLUESTORE_BLUEFS_H
10 #include "bluefs_types.h"
11 #include "BlockDevice.h"
13 #include "common/RefCountedObj.h"
14 #include "common/ceph_context.h"
15 #include "global/global_context.h"
16 #include "include/common_fwd.h"
18 #include "boost/intrusive/list.hpp"
19 #include "boost/dynamic_bitset.hpp"
24 l_bluefs_first
= 732600,
26 l_bluefs_reclaim_bytes
,
27 l_bluefs_db_total_bytes
,
28 l_bluefs_db_used_bytes
,
29 l_bluefs_wal_total_bytes
,
30 l_bluefs_wal_used_bytes
,
31 l_bluefs_slow_total_bytes
,
32 l_bluefs_slow_used_bytes
,
35 l_bluefs_log_compactions
,
36 l_bluefs_logged_bytes
,
37 l_bluefs_files_written_wal
,
38 l_bluefs_files_written_sst
,
39 l_bluefs_bytes_written_wal
,
40 l_bluefs_bytes_written_sst
,
41 l_bluefs_bytes_written_slow
,
42 l_bluefs_max_bytes_wal
,
43 l_bluefs_max_bytes_db
,
44 l_bluefs_max_bytes_slow
,
45 l_bluefs_read_random_count
,
46 l_bluefs_read_random_bytes
,
47 l_bluefs_read_random_disk_count
,
48 l_bluefs_read_random_disk_bytes
,
49 l_bluefs_read_random_buffer_count
,
50 l_bluefs_read_random_buffer_bytes
,
53 l_bluefs_read_prefetch_count
,
54 l_bluefs_read_prefetch_bytes
,
55 l_bluefs_read_zeros_candidate
,
56 l_bluefs_read_zeros_errors
,
61 class BlueFSDeviceExpander
{
63 ~BlueFSDeviceExpander() {}
65 virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free
,
66 uint64_t bluefs_total
) = 0;
67 virtual int allocate_freespace(
70 PExtentVector
& extents
) = 0;
71 /** Reports the amount of space that can be transferred to BlueFS.
72 * This gives either the current state, when alloc_size is the allocation
73 * size currently used by BlueFS, or a simulation when alloc_size is different.
75 * alloc_size - allocation unit size to check
77 virtual uint64_t available_freespace(uint64_t alloc_size
) = 0;
80 class BlueFSVolumeSelector
{
82 typedef std::vector
<std::pair
<std::string
, uint64_t>> paths
;
84 virtual ~BlueFSVolumeSelector() {
86 virtual void* get_hint_for_log() const = 0;
87 virtual void* get_hint_by_dir(std::string_view dirname
) const = 0;
89 virtual void add_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
90 virtual void sub_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
91 virtual void add_usage(void* file_hint
, uint64_t fsize
) = 0;
92 virtual void sub_usage(void* file_hint
, uint64_t fsize
) = 0;
93 virtual uint8_t select_prefer_bdev(void* hint
) = 0;
94 virtual void get_paths(const std::string
& base
, paths
& res
) const = 0;
95 virtual void dump(ostream
& sout
) = 0;
// Block device slot ids; they index the per-device containers declared
// further down (bdev, ioc, alloc, ...).
static constexpr unsigned BDEV_WAL{0};     ///< db.wal/ - small, fast device for the WAL
static constexpr unsigned BDEV_DB{1};      ///< db/ - the primary db device
static constexpr unsigned BDEV_SLOW{2};    ///< db.slow/ - big, slow device to spill over to
// NOTE(review): the two NEW* slots are presumably temporary targets used
// while migrating to a new WAL/DB device - confirm against BlueFS.cc.
static constexpr unsigned BDEV_NEWWAL{3};
static constexpr unsigned BDEV_NEWDB{4};
/// Number of device slots: one past the highest slot id (evaluates to 5).
static constexpr unsigned MAX_BDEV{BDEV_NEWDB + 1};
115 struct File
: public RefCountedObject
{
116 MEMPOOL_CLASS_HELPERS();
118 bluefs_fnode_t fnode
;
124 boost::intrusive::list_member_hook
<> dirty_item
;
126 std::atomic_int num_readers
, num_writers
;
127 std::atomic_int num_reading
;
129 void* vselector_hint
= nullptr;
132 FRIEND_MAKE_REF(File
);
143 vselector_hint(nullptr)
146 ceph_assert(num_readers
.load() == 0);
147 ceph_assert(num_writers
.load() == 0);
148 ceph_assert(num_reading
.load() == 0);
149 ceph_assert(!locked
);
152 using FileRef
= ceph::ref_t
<File
>;
154 typedef boost::intrusive::list
<
156 boost::intrusive::member_hook
<
158 boost::intrusive::list_member_hook
<>,
159 &File::dirty_item
> > dirty_file_list_t
;
161 struct Dir
: public RefCountedObject
{
162 MEMPOOL_CLASS_HELPERS();
164 mempool::bluefs::map
<std::string
, FileRef
, std::less
<>> file_map
;
167 FRIEND_MAKE_REF(Dir
);
170 using DirRef
= ceph::ref_t
<Dir
>;
173 MEMPOOL_CLASS_HELPERS();
176 uint64_t pos
= 0; ///< start offset for buffer
177 bufferlist buffer
; ///< new data to write (at end of file)
178 bufferlist tail_block
; ///< existing partial block at end of file, if any
179 bufferlist::page_aligned_appender buffer_appender
; //< for const char* only
180 int writer_type
= 0; ///< WRITER_*
181 int write_hint
= WRITE_LIFE_NOT_SET
;
183 ceph::mutex lock
= ceph::make_mutex("BlueFS::FileWriter::lock");
184 std::array
<IOContext
*,MAX_BDEV
> iocv
; ///< for each bdev
185 std::array
<bool, MAX_BDEV
> dirty_devs
;
187 FileWriter(FileRef f
)
188 : file(std::move(f
)),
189 buffer_appender(buffer
.get_page_aligned_appender(
190 g_conf()->bluefs_alloc_size
/ CEPH_PAGE_SIZE
)) {
193 dirty_devs
.fill(false);
194 if (file
->fnode
.ino
== 1) {
195 write_hint
= WRITE_LIFE_MEDIUM
;
198 // NOTE: caller must call BlueFS::close_writer()
203 // note: BlueRocksEnv uses this append exclusively, so it's safe
204 // to use buffer_appender exclusively here (e.g., its notion of
205 // offset will remain accurate).
206 void append(const char *buf
, size_t len
) {
207 uint64_t l0
= buffer
.length();
208 ceph_assert(l0
+ len
<= std::numeric_limits
<unsigned>::max());
209 buffer_appender
.append(buf
, len
);
212 // note: used internally only, for ino 1 or 0.
213 void append(ceph::buffer::list
& bl
) {
214 uint64_t l0
= buffer
.length();
215 ceph_assert(l0
+ bl
.length() <= std::numeric_limits
<unsigned>::max());
216 buffer
.claim_append(bl
);
219 uint64_t get_effective_write_pos() {
220 buffer_appender
.flush();
221 return pos
+ buffer
.length();
225 struct FileReaderBuffer
{
226 MEMPOOL_CLASS_HELPERS();
228 uint64_t bl_off
= 0; ///< prefetch buffer logical offset
229 bufferlist bl
; ///< prefetch buffer
230 uint64_t pos
= 0; ///< current logical offset
231 uint64_t max_prefetch
; ///< max allowed prefetch
233 explicit FileReaderBuffer(uint64_t mpf
)
234 : max_prefetch(mpf
) {}
236 uint64_t get_buf_end() const {
237 return bl_off
+ bl
.length();
239 uint64_t get_buf_remaining(uint64_t p
) const {
240 if (p
>= bl_off
&& p
< bl_off
+ bl
.length())
241 return bl_off
+ bl
.length() - p
;
245 void skip(size_t n
) {
248 void seek(uint64_t offset
) {
254 MEMPOOL_CLASS_HELPERS();
257 FileReaderBuffer buf
;
259 bool ignore_eof
; ///< used when reading our log file
261 ceph::shared_mutex lock
{
262 ceph::make_shared_mutex(std::string(), false, false, false)
266 FileReader(FileRef f
, uint64_t mpf
, bool rand
, bool ie
)
279 MEMPOOL_CLASS_HELPERS();
282 explicit FileLock(FileRef f
) : file(std::move(f
)) {}
286 ceph::mutex lock
= ceph::make_mutex("BlueFS::lock");
288 PerfCounters
*logger
= nullptr;
290 uint64_t max_bytes
[MAX_BDEV
] = {0};
291 uint64_t max_bytes_pcounters
[MAX_BDEV
] = {
292 l_bluefs_max_bytes_wal
,
293 l_bluefs_max_bytes_db
,
294 l_bluefs_max_bytes_slow
,
298 mempool::bluefs::map
<std::string
, DirRef
, std::less
<>> dir_map
; ///< dirname -> Dir
299 mempool::bluefs::unordered_map
<uint64_t, FileRef
> file_map
; ///< ino -> File
301 // map of dirty files, files of same dirty_seq are grouped into list.
302 map
<uint64_t, dirty_file_list_t
> dirty_files
;
304 bluefs_super_t super
; ///< latest superblock (as last written)
305 uint64_t ino_last
= 0; ///< last assigned ino (this one is in use)
306 uint64_t log_seq
= 0; ///< last used log seq (by current pending log_t)
307 uint64_t log_seq_stable
= 0; ///< last stable/synced log seq
308 FileWriter
*log_writer
= 0; ///< writer for the log
309 bluefs_transaction_t log_t
; ///< pending, unwritten log transaction
310 bool log_flushing
= false; ///< true while flushing the log
311 ceph::condition_variable log_cond
;
313 uint64_t new_log_jump_to
= 0;
314 uint64_t old_log_jump_to
= 0;
315 FileRef new_log
= nullptr;
316 FileWriter
*new_log_writer
= nullptr;
319 * There are up to 3 block devices:
321 * BDEV_DB db/ - the primary db device
322 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
323 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
325 vector
<BlockDevice
*> bdev
; ///< block devices we can use
326 vector
<IOContext
*> ioc
; ///< IOContexts for bdevs
327 vector
<interval_set
<uint64_t> > block_all
; ///< extents in bdev we own
328 vector
<Allocator
*> alloc
; ///< allocators for bdevs
329 vector
<uint64_t> alloc_size
; ///< alloc size for each device
330 vector
<interval_set
<uint64_t>> pending_release
; ///< extents to release
331 vector
<interval_set
<uint64_t>> block_unused_too_granular
;
333 BlockDevice::aio_callback_t discard_cb
[3]; //discard callbacks for each dev
335 BlueFSDeviceExpander
* slow_dev_expander
= nullptr;
336 std::unique_ptr
<BlueFSVolumeSelector
> vselector
;
339 SocketHook
* asok_hook
= nullptr;
340 // used to trigger zeros into read (debug / verify)
341 std::atomic
<uint64_t> inject_read_zeros
{0};
344 void _shutdown_logger();
345 void _update_logger_stats();
350 void _pad_bl(bufferlist
& bl
); ///< pad bufferlist to block size w/ zeros
352 FileRef
_get_file(uint64_t ino
);
353 void _drop_link(FileRef f
);
355 unsigned _get_slow_device_id() {
356 return bdev
[BDEV_SLOW
] ? BDEV_SLOW
: BDEV_DB
;
358 const char* get_device_name(unsigned id
);
359 int _expand_slow_device(uint64_t min_size
, PExtentVector
& extents
);
360 int _allocate(uint8_t bdev
, uint64_t len
,
361 bluefs_fnode_t
* node
);
362 int _allocate_without_fallback(uint8_t id
, uint64_t len
,
363 PExtentVector
* extents
);
365 /* signal replay log to include h->file in nearest log flush */
366 int _signal_dirty_to_log(FileWriter
*h
);
367 int _flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
);
368 int _flush(FileWriter
*h
, bool focce
, std::unique_lock
<ceph::mutex
>& l
);
369 int _flush(FileWriter
*h
, bool force
, bool *flushed
= nullptr);
370 int _fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
);
373 void _claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
);
374 void wait_for_aio(FileWriter
*h
); // safe to call without a lock
377 int _flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
378 uint64_t want_seq
= 0,
379 uint64_t jump_to
= 0);
380 uint64_t _estimate_log_size();
381 bool _should_compact_log();
389 void _compact_log_dump_metadata(bluefs_transaction_t
*t
,
391 void _compact_log_sync();
392 void _compact_log_async(std::unique_lock
<ceph::mutex
>& l
);
394 void _rewrite_log_and_layout_sync(bool allocate_with_fallback
,
399 std::optional
<bluefs_layout_t
> layout
);
401 //void _aio_finish(void *priv);
403 void _flush_bdev_safely(FileWriter
*h
);
404 void flush_bdev(); // this is safe to call without a lock
405 void flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
); // this is safe to call without a lock
407 int _preallocate(FileRef f
, uint64_t off
, uint64_t len
);
408 int _truncate(FileWriter
*h
, uint64_t off
);
411 FileReader
*h
, ///< [in] read from here
412 FileReaderBuffer
*buf
, ///< [in] reader state
413 uint64_t offset
, ///< [in] offset
414 size_t len
, ///< [in] this many bytes
415 bufferlist
*outbl
, ///< [out] optional: reference the result here
416 char *out
); ///< [out] optional: or copy it here
417 int64_t _read_random(
418 FileReader
*h
, ///< [in] read from here
419 uint64_t offset
, ///< [in] offset
420 uint64_t len
, ///< [in] this many bytes
421 char *out
); ///< [out] optional: or copy it here
423 void _invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
);
426 int _write_super(int dev
);
427 int _check_new_allocations(const bluefs_fnode_t
& fnode
,
429 boost::dynamic_bitset
<uint64_t>* owned_blocks
,
430 boost::dynamic_bitset
<uint64_t>* used_blocks
);
431 int _verify_alloc_granularity(
432 __u8 id
, uint64_t offset
, uint64_t length
,
434 int _adjust_granularity(
435 __u8 id
, uint64_t *offset
, uint64_t *length
, bool alloc
);
436 int _replay(bool noop
, bool to_stdout
= false); ///< replay journal
438 FileWriter
*_create_writer(FileRef f
);
439 void _close_writer(FileWriter
*h
);
441 // always put the super in the second 4k block. FIXME should this be
442 // block size independent?
443 unsigned get_super_offset() {
446 unsigned get_super_length() {
450 void _add_block_extent(unsigned bdev
, uint64_t offset
, uint64_t len
,
454 BlueFS(CephContext
* cct
);
457 // the super is always stored on bdev 0
458 int mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
);
460 int maybe_verify_layout(const bluefs_layout_t
& layout
) const;
461 void umount(bool avoid_compact
= false);
462 int prepare_new_device(int id
, const bluefs_layout_t
& layout
);
/// Add string key/value metadata entries to *pm.
/// NOTE(review): skip_bdev_id presumably names a device slot to leave out -
/// confirm against BlueFS.cc.
void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id);
/// Add device names to *ls.
void get_devices(std::set<std::string> *ls);
468 uint64_t get_alloc_size(int id
) {
469 return alloc_size
[id
];
473 int device_migrate_to_new(
475 const set
<int>& devs_source
,
477 const bluefs_layout_t
& layout
);
478 int device_migrate_to_existing(
480 const set
<int>& devs_source
,
482 const bluefs_layout_t
& layout
);
485 uint64_t get_total(unsigned id
);
486 uint64_t get_free(unsigned id
);
487 void get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
); // [<free,total> ...]
488 void dump_perf_counters(Formatter
*f
);
490 void dump_block_extents(ostream
& out
);
492 /// get current extents that we own for given block device
493 int get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
);
496 std::string_view dir
,
497 std::string_view file
,
502 std::string_view dir
,
503 std::string_view file
,
505 bool random
= false);
507 void close_writer(FileWriter
*h
) {
508 std::lock_guard
l(lock
);
512 int rename(std::string_view old_dir
, std::string_view old_file
,
513 std::string_view new_dir
, std::string_view new_file
);
515 int readdir(std::string_view dirname
, std::vector
<std::string
> *ls
);
517 int unlink(std::string_view dirname
, std::string_view filename
);
518 int mkdir(std::string_view dirname
);
519 int rmdir(std::string_view dirname
);
520 bool wal_is_rotational();
522 bool dir_exists(std::string_view dirname
);
523 int stat(std::string_view dirname
, std::string_view filename
,
524 uint64_t *size
, utime_t
*mtime
);
526 int lock_file(std::string_view dirname
, std::string_view filename
, FileLock
**p
);
527 int unlock_file(FileLock
*l
);
531 /// sync any uncommitted state to disk
532 void sync_metadata(bool avoid_compact
);
533 /// test and compact log, if necessary
534 void _maybe_compact_log(std::unique_lock
<ceph::mutex
>& l
);
536 void set_slow_device_expander(BlueFSDeviceExpander
* a
) {
537 slow_dev_expander
= a
;
539 void set_volume_selector(BlueFSVolumeSelector
* s
) {
542 void dump_volume_selector(ostream
& sout
) {
543 vselector
->dump(sout
);
545 void get_vselector_paths(const std::string
& base
,
546 BlueFSVolumeSelector::paths
& res
) const {
547 return vselector
->get_paths(base
, res
);
/// Register the block device at path under slot bdev.
/// NOTE(review): trim presumably requests a discard of the device, and
/// shared_with_bluestore marks a device shared with the main store -
/// confirm against BlueFS.cc.
int add_block_device(unsigned bdev, const std::string& path, bool trim,
                     bool shared_with_bluestore = false);
bool bdev_support_label(unsigned id);
uint64_t get_block_device_size(unsigned bdev);
555 /// gift more block space
556 void add_block_extent(unsigned bdev
, uint64_t offset
, uint64_t len
,
558 std::unique_lock
l(lock
);
559 _add_block_extent(bdev
, offset
, len
, skip
);
560 int r
= _flush_and_sync_log(l
);
564 /// reclaim block space
565 int reclaim_blocks(unsigned bdev
, uint64_t want
,
566 PExtentVector
*extents
);
568 // handler for discard event
569 void handle_discard(unsigned dev
, interval_set
<uint64_t>& to_release
);
571 void flush(FileWriter
*h
, bool force
= false) {
572 std::unique_lock
l(lock
);
573 int r
= _flush(h
, force
, l
);
577 void append_try_flush(FileWriter
*h
, const char* buf
, size_t len
) {
578 size_t max_size
= 1ull << 30; // cap to 1GB
580 bool need_flush
= true;
581 auto l0
= h
->buffer
.length();
583 size_t l
= std::min(len
, max_size
- l0
);
587 need_flush
= h
->buffer
.length() >= cct
->_conf
->bluefs_min_flush_size
;
591 // make sure the flush has made some progress, so that the
592 // loop doesn't iterate forever
593 ceph_assert(h
->buffer
.length() < max_size
);
597 void flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
) {
598 std::lock_guard
l(lock
);
599 _flush_range(h
, offset
, length
);
601 int fsync(FileWriter
*h
) {
602 std::unique_lock
l(lock
);
603 int r
= _fsync(h
, l
);
604 _maybe_compact_log(l
);
607 int64_t read(FileReader
*h
, FileReaderBuffer
*buf
, uint64_t offset
, size_t len
,
608 bufferlist
*outbl
, char *out
) {
609 // no need to hold the global lock here; we only touch h and
610 // h->file, and read vs write or delete is already protected (via
611 // atomics and asserts).
612 return _read(h
, buf
, offset
, len
, outbl
, out
);
614 int64_t read_random(FileReader
*h
, uint64_t offset
, size_t len
,
616 // no need to hold the global lock here; we only touch h and
617 // h->file, and read vs write or delete is already protected (via
618 // atomics and asserts).
619 return _read_random(h
, offset
, len
, out
);
621 void invalidate_cache(FileRef f
, uint64_t offset
, uint64_t len
) {
622 std::lock_guard
l(lock
);
623 _invalidate_cache(f
, offset
, len
);
625 int preallocate(FileRef f
, uint64_t offset
, uint64_t len
) {
626 std::lock_guard
l(lock
);
627 return _preallocate(f
, offset
, len
);
629 int truncate(FileWriter
*h
, uint64_t offset
) {
630 std::lock_guard
l(lock
);
631 return _truncate(h
, offset
);
633 int do_replay_recovery_read(FileReader
*log
,
639 /// test purpose methods
640 void debug_inject_duplicate_gift(unsigned bdev
, uint64_t offset
, uint64_t len
);
641 const PerfCounters
* get_perf_counters() const {
644 uint64_t debug_get_dirty_seq(FileWriter
*h
);
645 bool debug_get_is_dev_dirty(FileWriter
*h
, uint8_t dev
);
648 // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
649 // They are used for checking if read values are all 0, and reread if so.
650 int read(uint8_t ndev
, uint64_t off
, uint64_t len
,
651 ceph::buffer::list
*pbl
, IOContext
*ioc
, bool buffered
);
652 int read_random(uint8_t ndev
, uint64_t off
, uint64_t len
, char *buf
, bool buffered
);
655 class OriginalVolumeSelector
: public BlueFSVolumeSelector
{
661 OriginalVolumeSelector(
664 uint64_t _slow_total
)
665 : wal_total(_wal_total
), db_total(_db_total
), slow_total(_slow_total
) {}
667 void* get_hint_for_log() const override
;
668 void* get_hint_by_dir(std::string_view dirname
) const override
;
670 void add_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
674 void sub_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
678 void add_usage(void* hint
, uint64_t fsize
) override
{
682 void sub_usage(void* hint
, uint64_t fsize
) override
{
687 uint8_t select_prefer_bdev(void* hint
) override
;
688 void get_paths(const std::string
& base
, paths
& res
) const override
;
689 void dump(ostream
& sout
) override
;