1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4 #define CEPH_OS_BLUESTORE_BLUEFS_H
10 #include "bluefs_types.h"
11 #include "BlockDevice.h"
13 #include "common/RefCountedObj.h"
14 #include "common/ceph_context.h"
15 #include "global/global_context.h"
16 #include "include/common_fwd.h"
18 #include "boost/intrusive/list.hpp"
19 #include "boost/dynamic_bitset.hpp"
24 l_bluefs_first
= 732600,
26 l_bluefs_reclaim_bytes
,
27 l_bluefs_db_total_bytes
,
28 l_bluefs_db_used_bytes
,
29 l_bluefs_wal_total_bytes
,
30 l_bluefs_wal_used_bytes
,
31 l_bluefs_slow_total_bytes
,
32 l_bluefs_slow_used_bytes
,
35 l_bluefs_log_compactions
,
36 l_bluefs_logged_bytes
,
37 l_bluefs_files_written_wal
,
38 l_bluefs_files_written_sst
,
39 l_bluefs_bytes_written_wal
,
40 l_bluefs_bytes_written_sst
,
41 l_bluefs_bytes_written_slow
,
42 l_bluefs_max_bytes_wal
,
43 l_bluefs_max_bytes_db
,
44 l_bluefs_max_bytes_slow
,
45 l_bluefs_read_random_count
,
46 l_bluefs_read_random_bytes
,
47 l_bluefs_read_random_disk_count
,
48 l_bluefs_read_random_disk_bytes
,
49 l_bluefs_read_random_buffer_count
,
50 l_bluefs_read_random_buffer_bytes
,
53 l_bluefs_read_prefetch_count
,
54 l_bluefs_read_prefetch_bytes
,
55 l_bluefs_read_zeros_candidate
,
56 l_bluefs_read_zeros_errors
,
61 class BlueFSDeviceExpander
{
63 ~BlueFSDeviceExpander() {}
65 virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free
,
66 uint64_t bluefs_total
) = 0;
67 virtual int allocate_freespace(
70 PExtentVector
& extents
) = 0;
71 /** Reports the amount of space that can be transferred to BlueFS.
72 * This reflects the current state when alloc_size is the allocation unit
73 * BlueFS currently uses, or a simulation when alloc_size is different.
75 * alloc_size - allocation unit size to check
77 virtual uint64_t available_freespace(uint64_t alloc_size
) = 0;
80 class BlueFSVolumeSelector
{
/// List of (path string, 64-bit value) pairs, filled in by get_paths().
/// NOTE(review): the uint64_t is presumably a size/limit for the path -- confirm.
using paths = std::vector<std::pair<std::string, uint64_t>>;
84 virtual ~BlueFSVolumeSelector() {
86 virtual void* get_hint_for_log() const = 0;
87 virtual void* get_hint_by_dir(const std::string
& dirname
) const = 0;
89 virtual void add_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
90 virtual void sub_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
91 virtual void add_usage(void* file_hint
, uint64_t fsize
) = 0;
92 virtual void sub_usage(void* file_hint
, uint64_t fsize
) = 0;
93 virtual uint8_t select_prefer_bdev(void* hint
) = 0;
94 virtual void get_paths(const std::string
& base
, paths
& res
) const = 0;
95 virtual void dump(ostream
& sout
) = 0;
// Block device slot ids. They index the per-device containers (e.g. bdev[])
// and MAX_BDEV sizes the fixed per-device arrays.
static constexpr unsigned BDEV_WAL = 0;     ///< small, fast device, specifically for the WAL
static constexpr unsigned BDEV_DB = 1;      ///< the primary db device
static constexpr unsigned BDEV_SLOW = 2;    ///< big, slow device to spill over to as BDEV_DB fills
static constexpr unsigned BDEV_NEWWAL = 3;  ///< NOTE(review): presumably a replacement WAL device (migration) -- confirm
static constexpr unsigned BDEV_NEWDB = 4;   ///< NOTE(review): presumably a replacement DB device (migration) -- confirm
static constexpr unsigned MAX_BDEV = 5;     ///< number of device slots
115 struct File
: public RefCountedObject
{
116 MEMPOOL_CLASS_HELPERS();
118 bluefs_fnode_t fnode
;
123 boost::intrusive::list_member_hook
<> dirty_item
;
125 std::atomic_int num_readers
, num_writers
;
126 std::atomic_int num_reading
;
128 void* vselector_hint
= nullptr;
131 FRIEND_MAKE_REF(File
);
141 vselector_hint(nullptr)
144 ceph_assert(num_readers
.load() == 0);
145 ceph_assert(num_writers
.load() == 0);
146 ceph_assert(num_reading
.load() == 0);
147 ceph_assert(!locked
);
150 using FileRef
= ceph::ref_t
<File
>;
152 typedef boost::intrusive::list
<
154 boost::intrusive::member_hook
<
156 boost::intrusive::list_member_hook
<>,
157 &File::dirty_item
> > dirty_file_list_t
;
159 struct Dir
: public RefCountedObject
{
160 MEMPOOL_CLASS_HELPERS();
162 mempool::bluefs::map
<string
,FileRef
> file_map
;
165 FRIEND_MAKE_REF(Dir
);
168 using DirRef
= ceph::ref_t
<Dir
>;
171 MEMPOOL_CLASS_HELPERS();
174 uint64_t pos
= 0; ///< start offset for buffer
175 bufferlist buffer
; ///< new data to write (at end of file)
176 bufferlist tail_block
; ///< existing partial block at end of file, if any
177 bufferlist::page_aligned_appender buffer_appender
; ///< for const char* only
178 int writer_type
= 0; ///< WRITER_*
179 int write_hint
= WRITE_LIFE_NOT_SET
;
181 ceph::mutex lock
= ceph::make_mutex("BlueFS::FileWriter::lock");
182 std::array
<IOContext
*,MAX_BDEV
> iocv
; ///< for each bdev
183 std::array
<bool, MAX_BDEV
> dirty_devs
;
185 FileWriter(FileRef f
)
186 : file(std::move(f
)),
187 buffer_appender(buffer
.get_page_aligned_appender(
188 g_conf()->bluefs_alloc_size
/ CEPH_PAGE_SIZE
)) {
191 dirty_devs
.fill(false);
192 if (file
->fnode
.ino
== 1) {
193 write_hint
= WRITE_LIFE_MEDIUM
;
196 // NOTE: caller must call BlueFS::close_writer()
201 // note: BlueRocksEnv uses this append exclusively, so it's safe
202 // to use buffer_appender exclusively here (e.g., its notion of
203 // offset will remain accurate).
204 void append(const char *buf
, size_t len
) {
205 uint64_t l0
= buffer
.length();
206 ceph_assert(l0
+ len
<= std::numeric_limits
<unsigned>::max());
207 buffer_appender
.append(buf
, len
);
210 // note: used internally only, for ino 1 or 0.
211 void append(ceph::buffer::list
& bl
) {
212 uint64_t l0
= buffer
.length();
213 ceph_assert(l0
+ bl
.length() <= std::numeric_limits
<unsigned>::max());
214 buffer
.claim_append(bl
);
217 uint64_t get_effective_write_pos() {
218 buffer_appender
.flush();
219 return pos
+ buffer
.length();
223 struct FileReaderBuffer
{
224 MEMPOOL_CLASS_HELPERS();
226 uint64_t bl_off
= 0; ///< prefetch buffer logical offset
227 bufferlist bl
; ///< prefetch buffer
228 uint64_t pos
= 0; ///< current logical offset
229 uint64_t max_prefetch
; ///< max allowed prefetch
231 explicit FileReaderBuffer(uint64_t mpf
)
232 : max_prefetch(mpf
) {}
234 uint64_t get_buf_end() const {
235 return bl_off
+ bl
.length();
237 uint64_t get_buf_remaining(uint64_t p
) const {
238 if (p
>= bl_off
&& p
< bl_off
+ bl
.length())
239 return bl_off
+ bl
.length() - p
;
243 void skip(size_t n
) {
246 void seek(uint64_t offset
) {
252 MEMPOOL_CLASS_HELPERS();
255 FileReaderBuffer buf
;
257 bool ignore_eof
; ///< used when reading our log file
259 ceph::shared_mutex lock
{
260 ceph::make_shared_mutex(std::string(), false, false, false)
264 FileReader(FileRef f
, uint64_t mpf
, bool rand
, bool ie
)
277 MEMPOOL_CLASS_HELPERS();
280 explicit FileLock(FileRef f
) : file(std::move(f
)) {}
284 ceph::mutex lock
= ceph::make_mutex("BlueFS::lock");
286 PerfCounters
*logger
= nullptr;
288 uint64_t max_bytes
[MAX_BDEV
] = {0};
289 uint64_t max_bytes_pcounters
[MAX_BDEV
] = {
290 l_bluefs_max_bytes_wal
,
291 l_bluefs_max_bytes_db
,
292 l_bluefs_max_bytes_slow
,
296 mempool::bluefs::map
<string
, DirRef
> dir_map
; ///< dirname -> Dir
297 mempool::bluefs::unordered_map
<uint64_t,FileRef
> file_map
; ///< ino -> File
299 // map of dirty files, files of same dirty_seq are grouped into list.
300 map
<uint64_t, dirty_file_list_t
> dirty_files
;
302 bluefs_super_t super
; ///< latest superblock (as last written)
303 uint64_t ino_last
= 0; ///< last assigned ino (this one is in use)
304 uint64_t log_seq
= 0; ///< last used log seq (by current pending log_t)
305 uint64_t log_seq_stable
= 0; ///< last stable/synced log seq
306 FileWriter
*log_writer
= 0; ///< writer for the log
307 bluefs_transaction_t log_t
; ///< pending, unwritten log transaction
308 bool log_flushing
= false; ///< true while flushing the log
309 ceph::condition_variable log_cond
;
311 uint64_t new_log_jump_to
= 0;
312 uint64_t old_log_jump_to
= 0;
313 FileRef new_log
= nullptr;
314 FileWriter
*new_log_writer
= nullptr;
317 * There are up to 3 block devices:
319 * BDEV_DB db/ - the primary db device
320 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
321 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
323 vector
<BlockDevice
*> bdev
; ///< block devices we can use
324 vector
<IOContext
*> ioc
; ///< IOContexts for bdevs
325 vector
<interval_set
<uint64_t> > block_all
; ///< extents in bdev we own
326 vector
<Allocator
*> alloc
; ///< allocators for bdevs
327 vector
<uint64_t> alloc_size
; ///< alloc size for each device
328 vector
<interval_set
<uint64_t>> pending_release
; ///< extents to release
329 vector
<interval_set
<uint64_t>> block_unused_too_granular
;
331 BlockDevice::aio_callback_t discard_cb
[3]; //discard callbacks for each dev
333 BlueFSDeviceExpander
* slow_dev_expander
= nullptr;
334 std::unique_ptr
<BlueFSVolumeSelector
> vselector
;
337 SocketHook
* asok_hook
= nullptr;
338 // used to inject zeros into reads (debug / verification)
339 std::atomic
<uint64_t> inject_read_zeros
{0};
342 void _shutdown_logger();
343 void _update_logger_stats();
348 void _pad_bl(bufferlist
& bl
); ///< pad bufferlist to block size w/ zeros
350 FileRef
_get_file(uint64_t ino
);
351 void _drop_link(FileRef f
);
353 unsigned _get_slow_device_id() {
354 return bdev
[BDEV_SLOW
] ? BDEV_SLOW
: BDEV_DB
;
356 const char* get_device_name(unsigned id
);
357 int _expand_slow_device(uint64_t min_size
, PExtentVector
& extents
);
358 int _allocate(uint8_t bdev
, uint64_t len
,
359 bluefs_fnode_t
* node
);
360 int _allocate_without_fallback(uint8_t id
, uint64_t len
,
361 PExtentVector
* extents
);
363 int _flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
);
364 int _flush(FileWriter
*h
, bool focce
, std::unique_lock
<ceph::mutex
>& l
);
365 int _flush(FileWriter
*h
, bool force
, bool *flushed
= nullptr);
366 int _fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
);
369 void _claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
);
370 void wait_for_aio(FileWriter
*h
); // safe to call without a lock
373 int _flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
374 uint64_t want_seq
= 0,
375 uint64_t jump_to
= 0);
376 uint64_t _estimate_log_size();
377 bool _should_compact_log();
385 void _compact_log_dump_metadata(bluefs_transaction_t
*t
,
387 void _compact_log_sync();
388 void _compact_log_async(std::unique_lock
<ceph::mutex
>& l
);
390 void _rewrite_log_and_layout_sync(bool allocate_with_fallback
,
395 std::optional
<bluefs_layout_t
> layout
);
397 //void _aio_finish(void *priv);
399 void _flush_bdev_safely(FileWriter
*h
);
400 void flush_bdev(); // this is safe to call without a lock
401 void flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
); // this is safe to call without a lock
403 int _preallocate(FileRef f
, uint64_t off
, uint64_t len
);
404 int _truncate(FileWriter
*h
, uint64_t off
);
407 FileReader
*h
, ///< [in] read from here
408 FileReaderBuffer
*buf
, ///< [in] reader state
409 uint64_t offset
, ///< [in] offset
410 size_t len
, ///< [in] this many bytes
411 bufferlist
*outbl
, ///< [out] optional: reference the result here
412 char *out
); ///< [out] optional: or copy it here
413 int64_t _read_random(
414 FileReader
*h
, ///< [in] read from here
415 uint64_t offset
, ///< [in] offset
416 uint64_t len
, ///< [in] this many bytes
417 char *out
); ///< [out] optional: or copy it here
419 void _invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
);
422 int _write_super(int dev
);
423 int _check_new_allocations(const bluefs_fnode_t
& fnode
,
425 boost::dynamic_bitset
<uint64_t>* owned_blocks
,
426 boost::dynamic_bitset
<uint64_t>* used_blocks
);
427 int _verify_alloc_granularity(
428 __u8 id
, uint64_t offset
, uint64_t length
,
430 int _adjust_granularity(
431 __u8 id
, uint64_t *offset
, uint64_t *length
, bool alloc
);
432 int _replay(bool noop
, bool to_stdout
= false); ///< replay journal
434 FileWriter
*_create_writer(FileRef f
);
435 void _close_writer(FileWriter
*h
);
437 // always put the super in the second 4k block. FIXME should this be
438 // block size independent?
439 unsigned get_super_offset() {
442 unsigned get_super_length() {
446 void _add_block_extent(unsigned bdev
, uint64_t offset
, uint64_t len
,
450 BlueFS(CephContext
* cct
);
453 // the super is always stored on bdev 0
454 int mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
);
456 int maybe_verify_layout(const bluefs_layout_t
& layout
) const;
457 void umount(bool avoid_compact
= false);
458 int prepare_new_device(int id
, const bluefs_layout_t
& layout
);
462 void collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
);
463 void get_devices(set
<string
> *ls
);
464 uint64_t get_alloc_size(int id
) {
465 return alloc_size
[id
];
469 int device_migrate_to_new(
471 const set
<int>& devs_source
,
473 const bluefs_layout_t
& layout
);
474 int device_migrate_to_existing(
476 const set
<int>& devs_source
,
478 const bluefs_layout_t
& layout
);
481 uint64_t get_total(unsigned id
);
482 uint64_t get_free(unsigned id
);
483 void get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
); // [<free,total> ...]
484 void dump_perf_counters(Formatter
*f
);
486 void dump_block_extents(ostream
& out
);
488 /// get current extents that we own for given block device
489 int get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
);
501 bool random
= false);
503 void close_writer(FileWriter
*h
) {
504 std::lock_guard
l(lock
);
508 int rename(const string
& old_dir
, const string
& old_file
,
509 const string
& new_dir
, const string
& new_file
);
511 int readdir(const string
& dirname
, vector
<string
> *ls
);
513 int unlink(const string
& dirname
, const string
& filename
);
514 int mkdir(const string
& dirname
);
515 int rmdir(const string
& dirname
);
516 bool wal_is_rotational();
518 bool dir_exists(const string
& dirname
);
519 int stat(const string
& dirname
, const string
& filename
,
520 uint64_t *size
, utime_t
*mtime
);
522 int lock_file(const string
& dirname
, const string
& filename
, FileLock
**p
);
523 int unlock_file(FileLock
*l
);
527 /// sync any uncommitted state to disk
528 void sync_metadata(bool avoid_compact
);
529 /// test and compact log, if necessary
530 void _maybe_compact_log(std::unique_lock
<ceph::mutex
>& l
);
532 void set_slow_device_expander(BlueFSDeviceExpander
* a
) {
533 slow_dev_expander
= a
;
535 void set_volume_selector(BlueFSVolumeSelector
* s
) {
538 void dump_volume_selector(ostream
& sout
) {
539 vselector
->dump(sout
);
541 void get_vselector_paths(const std::string
& base
,
542 BlueFSVolumeSelector::paths
& res
) const {
543 return vselector
->get_paths(base
, res
);
546 int add_block_device(unsigned bdev
, const string
& path
, bool trim
,
547 bool shared_with_bluestore
=false);
548 bool bdev_support_label(unsigned id
);
549 uint64_t get_block_device_size(unsigned bdev
);
551 /// gift more block space
552 void add_block_extent(unsigned bdev
, uint64_t offset
, uint64_t len
,
554 std::unique_lock
l(lock
);
555 _add_block_extent(bdev
, offset
, len
, skip
);
556 int r
= _flush_and_sync_log(l
);
560 /// reclaim block space
561 int reclaim_blocks(unsigned bdev
, uint64_t want
,
562 PExtentVector
*extents
);
564 // handler for discard event
565 void handle_discard(unsigned dev
, interval_set
<uint64_t>& to_release
);
567 void flush(FileWriter
*h
, bool force
= false) {
568 std::unique_lock
l(lock
);
569 int r
= _flush(h
, force
, l
);
573 void append_try_flush(FileWriter
*h
, const char* buf
, size_t len
) {
574 size_t max_size
= 1ull << 30; // cap to 1GB
576 bool need_flush
= true;
577 auto l0
= h
->buffer
.length();
579 size_t l
= std::min(len
, max_size
- l0
);
583 need_flush
= h
->buffer
.length() >= cct
->_conf
->bluefs_min_flush_size
;
587 // make sure the flush has made progress, so that the
588 // loop doesn't iterate forever
589 ceph_assert(h
->buffer
.length() < max_size
);
593 void flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
) {
594 std::lock_guard
l(lock
);
595 _flush_range(h
, offset
, length
);
597 int fsync(FileWriter
*h
) {
598 std::unique_lock
l(lock
);
599 int r
= _fsync(h
, l
);
600 _maybe_compact_log(l
);
603 int64_t read(FileReader
*h
, FileReaderBuffer
*buf
, uint64_t offset
, size_t len
,
604 bufferlist
*outbl
, char *out
) {
605 // no need to hold the global lock here; we only touch h and
606 // h->file, and read vs write or delete is already protected (via
607 // atomics and asserts).
608 return _read(h
, buf
, offset
, len
, outbl
, out
);
610 int64_t read_random(FileReader
*h
, uint64_t offset
, size_t len
,
612 // no need to hold the global lock here; we only touch h and
613 // h->file, and read vs write or delete is already protected (via
614 // atomics and asserts).
615 return _read_random(h
, offset
, len
, out
);
617 void invalidate_cache(FileRef f
, uint64_t offset
, uint64_t len
) {
618 std::lock_guard
l(lock
);
619 _invalidate_cache(f
, offset
, len
);
621 int preallocate(FileRef f
, uint64_t offset
, uint64_t len
) {
622 std::lock_guard
l(lock
);
623 return _preallocate(f
, offset
, len
);
625 int truncate(FileWriter
*h
, uint64_t offset
) {
626 std::lock_guard
l(lock
);
627 return _truncate(h
, offset
);
629 int do_replay_recovery_read(FileReader
*log
,
635 /// methods for testing purposes
636 void debug_inject_duplicate_gift(unsigned bdev
, uint64_t offset
, uint64_t len
);
637 const PerfCounters
* get_perf_counters() const {
642 // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
643 // They are used to check whether the values read are all 0 and, if so, to reread.
644 int read(uint8_t ndev
, uint64_t off
, uint64_t len
,
645 ceph::buffer::list
*pbl
, IOContext
*ioc
, bool buffered
);
646 int read_random(uint8_t ndev
, uint64_t off
, uint64_t len
, char *buf
, bool buffered
);
649 class OriginalVolumeSelector
: public BlueFSVolumeSelector
{
655 OriginalVolumeSelector(
658 uint64_t _slow_total
)
659 : wal_total(_wal_total
), db_total(_db_total
), slow_total(_slow_total
) {}
661 void* get_hint_for_log() const override
;
662 void* get_hint_by_dir(const std::string
& dirname
) const override
;
664 void add_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
668 void sub_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
672 void add_usage(void* hint
, uint64_t fsize
) override
{
676 void sub_usage(void* hint
, uint64_t fsize
) override
{
681 uint8_t select_prefer_bdev(void* hint
) override
;
682 void get_paths(const std::string
& base
, paths
& res
) const override
;
683 void dump(ostream
& sout
) override
;