1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4 #define CEPH_OS_BLUESTORE_BLUEFS_H
9 #include "bluefs_types.h"
10 #include "BlockDevice.h"
12 #include "common/RefCountedObj.h"
13 #include "common/ceph_context.h"
14 #include "global/global_context.h"
15 #include "include/common_fwd.h"
17 #include "boost/intrusive/list.hpp"
18 #include "boost/dynamic_bitset.hpp"
23 l_bluefs_first
= 732600,
25 l_bluefs_reclaim_bytes
,
26 l_bluefs_db_total_bytes
,
27 l_bluefs_db_used_bytes
,
28 l_bluefs_wal_total_bytes
,
29 l_bluefs_wal_used_bytes
,
30 l_bluefs_slow_total_bytes
,
31 l_bluefs_slow_used_bytes
,
34 l_bluefs_log_compactions
,
35 l_bluefs_logged_bytes
,
36 l_bluefs_files_written_wal
,
37 l_bluefs_files_written_sst
,
38 l_bluefs_bytes_written_wal
,
39 l_bluefs_bytes_written_sst
,
40 l_bluefs_bytes_written_slow
,
41 l_bluefs_max_bytes_wal
,
42 l_bluefs_max_bytes_db
,
43 l_bluefs_max_bytes_slow
,
44 l_bluefs_read_random_count
,
45 l_bluefs_read_random_bytes
,
46 l_bluefs_read_random_disk_count
,
47 l_bluefs_read_random_disk_bytes
,
48 l_bluefs_read_random_buffer_count
,
49 l_bluefs_read_random_buffer_bytes
,
52 l_bluefs_read_prefetch_count
,
53 l_bluefs_read_prefetch_bytes
,
58 class BlueFSDeviceExpander
{
60 ~BlueFSDeviceExpander() {}
62 virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free
,
63 uint64_t bluefs_total
) = 0;
64 virtual int allocate_freespace(
67 PExtentVector
& extents
) = 0;
/** Reports the amount of space that can be transferred to BlueFS.
 * Gives either the current state, when alloc_size is the allocation unit
 * BlueFS currently uses, or a simulation when alloc_size differs.
 *
 * alloc_size - allocation unit size to check
 */
74 virtual uint64_t available_freespace(uint64_t alloc_size
) = 0;
77 class BlueFSVolumeSelector
{
79 typedef std::vector
<std::pair
<std::string
, uint64_t>> paths
;
81 virtual ~BlueFSVolumeSelector() {
83 virtual void* get_hint_for_log() const = 0;
84 virtual void* get_hint_by_dir(const std::string
& dirname
) const = 0;
86 virtual void add_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
87 virtual void sub_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
88 virtual void add_usage(void* file_hint
, uint64_t fsize
) = 0;
89 virtual void sub_usage(void* file_hint
, uint64_t fsize
) = 0;
90 virtual uint8_t select_prefer_bdev(void* hint
) = 0;
91 virtual void get_paths(const std::string
& base
, paths
& res
) const = 0;
92 virtual void dump(ostream
& sout
) = 0;
  // Logical block-device slots; used to index the per-device vectors
  // (bdev/ioc/alloc/alloc_size) and the max_bytes arrays below.
  static constexpr unsigned MAX_BDEV = 5;
  static constexpr unsigned BDEV_WAL = 0;     ///< small, fast device for the WAL (db.wal/)
  static constexpr unsigned BDEV_DB = 1;      ///< the primary db device (db/)
  static constexpr unsigned BDEV_SLOW = 2;    ///< big, slow device to spill over to as BDEV_DB fills (db.slow/)
  static constexpr unsigned BDEV_NEWWAL = 3;  ///< presumably the target WAL slot while migrating to a new device (see device_migrate_to_new) -- TODO confirm
  static constexpr unsigned BDEV_NEWDB = 4;   ///< presumably the target DB slot while migrating to a new device (see device_migrate_to_new) -- TODO confirm
112 struct File
: public RefCountedObject
{
113 MEMPOOL_CLASS_HELPERS();
115 bluefs_fnode_t fnode
;
120 boost::intrusive::list_member_hook
<> dirty_item
;
122 std::atomic_int num_readers
, num_writers
;
123 std::atomic_int num_reading
;
125 void* vselector_hint
= nullptr;
128 FRIEND_MAKE_REF(File
);
138 vselector_hint(nullptr)
141 ceph_assert(num_readers
.load() == 0);
142 ceph_assert(num_writers
.load() == 0);
143 ceph_assert(num_reading
.load() == 0);
144 ceph_assert(!locked
);
147 using FileRef
= ceph::ref_t
<File
>;
149 typedef boost::intrusive::list
<
151 boost::intrusive::member_hook
<
153 boost::intrusive::list_member_hook
<>,
154 &File::dirty_item
> > dirty_file_list_t
;
156 struct Dir
: public RefCountedObject
{
157 MEMPOOL_CLASS_HELPERS();
159 mempool::bluefs::map
<string
,FileRef
> file_map
;
162 FRIEND_MAKE_REF(Dir
);
165 using DirRef
= ceph::ref_t
<Dir
>;
168 MEMPOOL_CLASS_HELPERS();
171 uint64_t pos
= 0; ///< start offset for buffer
172 bufferlist buffer
; ///< new data to write (at end of file)
173 bufferlist tail_block
; ///< existing partial block at end of file, if any
174 bufferlist::page_aligned_appender buffer_appender
; //< for const char* only
175 int writer_type
= 0; ///< WRITER_*
176 int write_hint
= WRITE_LIFE_NOT_SET
;
178 ceph::mutex lock
= ceph::make_mutex("BlueFS::FileWriter::lock");
179 std::array
<IOContext
*,MAX_BDEV
> iocv
; ///< for each bdev
180 std::array
<bool, MAX_BDEV
> dirty_devs
;
182 FileWriter(FileRef f
)
183 : file(std::move(f
)),
184 buffer_appender(buffer
.get_page_aligned_appender(
185 g_conf()->bluefs_alloc_size
/ CEPH_PAGE_SIZE
)) {
188 dirty_devs
.fill(false);
189 if (file
->fnode
.ino
== 1) {
190 write_hint
= WRITE_LIFE_MEDIUM
;
193 // NOTE: caller must call BlueFS::close_writer()
// note: BlueRocksEnv uses this append exclusively, so it's safe
// to use buffer_appender exclusively here (e.g., its notion of
// offset will remain accurate).
201 void append(const char *buf
, size_t len
) {
202 buffer_appender
.append(buf
, len
);
205 // note: used internally only, for ino 1 or 0.
206 void append(bufferlist
& bl
) {
207 buffer
.claim_append(bl
);
210 uint64_t get_effective_write_pos() {
211 buffer_appender
.flush();
212 return pos
+ buffer
.length();
216 struct FileReaderBuffer
{
217 MEMPOOL_CLASS_HELPERS();
219 uint64_t bl_off
= 0; ///< prefetch buffer logical offset
220 bufferlist bl
; ///< prefetch buffer
221 uint64_t pos
= 0; ///< current logical offset
222 uint64_t max_prefetch
; ///< max allowed prefetch
224 explicit FileReaderBuffer(uint64_t mpf
)
225 : max_prefetch(mpf
) {}
227 uint64_t get_buf_end() const {
228 return bl_off
+ bl
.length();
230 uint64_t get_buf_remaining(uint64_t p
) const {
231 if (p
>= bl_off
&& p
< bl_off
+ bl
.length())
232 return bl_off
+ bl
.length() - p
;
236 void skip(size_t n
) {
239 void seek(uint64_t offset
) {
245 MEMPOOL_CLASS_HELPERS();
248 FileReaderBuffer buf
;
250 bool ignore_eof
; ///< used when reading our log file
252 ceph::shared_mutex lock
{
253 ceph::make_shared_mutex(std::string(), false, false, false)
257 FileReader(FileRef f
, uint64_t mpf
, bool rand
, bool ie
)
270 MEMPOOL_CLASS_HELPERS();
273 explicit FileLock(FileRef f
) : file(std::move(f
)) {}
277 ceph::mutex lock
= ceph::make_mutex("BlueFS::lock");
279 PerfCounters
*logger
= nullptr;
281 uint64_t max_bytes
[MAX_BDEV
] = {0};
282 uint64_t max_bytes_pcounters
[MAX_BDEV
] = {
283 l_bluefs_max_bytes_wal
,
284 l_bluefs_max_bytes_db
,
285 l_bluefs_max_bytes_slow
,
289 mempool::bluefs::map
<string
, DirRef
> dir_map
; ///< dirname -> Dir
290 mempool::bluefs::unordered_map
<uint64_t,FileRef
> file_map
; ///< ino -> File
292 // map of dirty files, files of same dirty_seq are grouped into list.
293 map
<uint64_t, dirty_file_list_t
> dirty_files
;
295 bluefs_super_t super
; ///< latest superblock (as last written)
296 uint64_t ino_last
= 0; ///< last assigned ino (this one is in use)
297 uint64_t log_seq
= 0; ///< last used log seq (by current pending log_t)
298 uint64_t log_seq_stable
= 0; ///< last stable/synced log seq
299 FileWriter
*log_writer
= 0; ///< writer for the log
300 bluefs_transaction_t log_t
; ///< pending, unwritten log transaction
301 bool log_flushing
= false; ///< true while flushing the log
302 ceph::condition_variable log_cond
;
304 uint64_t new_log_jump_to
= 0;
305 uint64_t old_log_jump_to
= 0;
306 FileRef new_log
= nullptr;
307 FileWriter
*new_log_writer
= nullptr;
310 * There are up to 3 block devices:
312 * BDEV_DB db/ - the primary db device
313 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
314 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
316 vector
<BlockDevice
*> bdev
; ///< block devices we can use
317 vector
<IOContext
*> ioc
; ///< IOContexts for bdevs
318 vector
<interval_set
<uint64_t> > block_all
; ///< extents in bdev we own
319 vector
<Allocator
*> alloc
; ///< allocators for bdevs
320 vector
<uint64_t> alloc_size
; ///< alloc size for each device
321 vector
<interval_set
<uint64_t>> pending_release
; ///< extents to release
322 vector
<interval_set
<uint64_t>> block_unused_too_granular
;
324 BlockDevice::aio_callback_t discard_cb
[3]; //discard callbacks for each dev
326 BlueFSDeviceExpander
* slow_dev_expander
= nullptr;
327 std::unique_ptr
<BlueFSVolumeSelector
> vselector
;
330 SocketHook
* asok_hook
= nullptr;
333 void _shutdown_logger();
334 void _update_logger_stats();
339 void _pad_bl(bufferlist
& bl
); ///< pad bufferlist to block size w/ zeros
341 FileRef
_get_file(uint64_t ino
);
342 void _drop_link(FileRef f
);
344 unsigned _get_slow_device_id() {
345 return bdev
[BDEV_SLOW
] ? BDEV_SLOW
: BDEV_DB
;
347 const char* get_device_name(unsigned id
);
348 int _expand_slow_device(uint64_t min_size
, PExtentVector
& extents
);
349 int _allocate(uint8_t bdev
, uint64_t len
,
350 bluefs_fnode_t
* node
);
351 int _allocate_without_fallback(uint8_t id
, uint64_t len
,
352 PExtentVector
* extents
);
354 int _flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
);
355 int _flush(FileWriter
*h
, bool focce
, std::unique_lock
<ceph::mutex
>& l
);
356 int _flush(FileWriter
*h
, bool force
, bool *flushed
= nullptr);
357 int _fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
);
360 void _claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
);
361 void wait_for_aio(FileWriter
*h
); // safe to call without a lock
364 int _flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
365 uint64_t want_seq
= 0,
366 uint64_t jump_to
= 0);
367 uint64_t _estimate_log_size();
368 bool _should_compact_log();
376 void _compact_log_dump_metadata(bluefs_transaction_t
*t
,
378 void _compact_log_sync();
379 void _compact_log_async(std::unique_lock
<ceph::mutex
>& l
);
381 void _rewrite_log_and_layout_sync(bool allocate_with_fallback
,
386 std::optional
<bluefs_layout_t
> layout
);
388 //void _aio_finish(void *priv);
390 void _flush_bdev_safely(FileWriter
*h
);
391 void flush_bdev(); // this is safe to call without a lock
392 void flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
); // this is safe to call without a lock
394 int _preallocate(FileRef f
, uint64_t off
, uint64_t len
);
395 int _truncate(FileWriter
*h
, uint64_t off
);
398 FileReader
*h
, ///< [in] read from here
399 FileReaderBuffer
*buf
, ///< [in] reader state
400 uint64_t offset
, ///< [in] offset
401 size_t len
, ///< [in] this many bytes
402 bufferlist
*outbl
, ///< [out] optional: reference the result here
403 char *out
); ///< [out] optional: or copy it here
405 FileReader
*h
, ///< [in] read from here
406 uint64_t offset
, ///< [in] offset
407 uint64_t len
, ///< [in] this many bytes
408 char *out
); ///< [out] optional: or copy it here
410 void _invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
);
413 int _write_super(int dev
);
414 int _check_new_allocations(const bluefs_fnode_t
& fnode
,
416 boost::dynamic_bitset
<uint64_t>* owned_blocks
,
417 boost::dynamic_bitset
<uint64_t>* used_blocks
);
418 int _verify_alloc_granularity(
419 __u8 id
, uint64_t offset
, uint64_t length
,
421 int _adjust_granularity(
422 __u8 id
, uint64_t *offset
, uint64_t *length
, bool alloc
);
423 int _replay(bool noop
, bool to_stdout
= false); ///< replay journal
425 FileWriter
*_create_writer(FileRef f
);
426 void _close_writer(FileWriter
*h
);
428 // always put the super in the second 4k block. FIXME should this be
429 // block size independent?
430 unsigned get_super_offset() {
433 unsigned get_super_length() {
437 void _add_block_extent(unsigned bdev
, uint64_t offset
, uint64_t len
,
441 BlueFS(CephContext
* cct
);
444 // the super is always stored on bdev 0
445 int mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
);
447 int maybe_verify_layout(const bluefs_layout_t
& layout
) const;
448 void umount(bool avoid_compact
= false);
449 int prepare_new_device(int id
, const bluefs_layout_t
& layout
);
453 void collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
);
454 void get_devices(set
<string
> *ls
);
455 uint64_t get_alloc_size(int id
) {
456 return alloc_size
[id
];
460 int device_migrate_to_new(
462 const set
<int>& devs_source
,
464 const bluefs_layout_t
& layout
);
465 int device_migrate_to_existing(
467 const set
<int>& devs_source
,
469 const bluefs_layout_t
& layout
);
472 uint64_t get_total(unsigned id
);
473 uint64_t get_free(unsigned id
);
474 void get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
); // [<free,total> ...]
475 void dump_perf_counters(Formatter
*f
);
477 void dump_block_extents(ostream
& out
);
479 /// get current extents that we own for given block device
480 int get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
);
492 bool random
= false);
494 void close_writer(FileWriter
*h
) {
495 std::lock_guard
l(lock
);
499 int rename(const string
& old_dir
, const string
& old_file
,
500 const string
& new_dir
, const string
& new_file
);
502 int readdir(const string
& dirname
, vector
<string
> *ls
);
504 int unlink(const string
& dirname
, const string
& filename
);
505 int mkdir(const string
& dirname
);
506 int rmdir(const string
& dirname
);
507 bool wal_is_rotational();
509 bool dir_exists(const string
& dirname
);
510 int stat(const string
& dirname
, const string
& filename
,
511 uint64_t *size
, utime_t
*mtime
);
513 int lock_file(const string
& dirname
, const string
& filename
, FileLock
**p
);
514 int unlock_file(FileLock
*l
);
518 /// sync any uncommitted state to disk
519 void sync_metadata(bool avoid_compact
);
520 /// test and compact log, if necessary
521 void _maybe_compact_log(std::unique_lock
<ceph::mutex
>& l
);
523 void set_slow_device_expander(BlueFSDeviceExpander
* a
) {
524 slow_dev_expander
= a
;
526 void set_volume_selector(BlueFSVolumeSelector
* s
) {
529 void dump_volume_selector(ostream
& sout
) {
530 vselector
->dump(sout
);
532 void get_vselector_paths(const std::string
& base
,
533 BlueFSVolumeSelector::paths
& res
) const {
534 return vselector
->get_paths(base
, res
);
537 int add_block_device(unsigned bdev
, const string
& path
, bool trim
,
538 bool shared_with_bluestore
=false);
539 bool bdev_support_label(unsigned id
);
540 uint64_t get_block_device_size(unsigned bdev
);
542 /// gift more block space
543 void add_block_extent(unsigned bdev
, uint64_t offset
, uint64_t len
,
545 std::unique_lock
l(lock
);
546 _add_block_extent(bdev
, offset
, len
, skip
);
547 int r
= _flush_and_sync_log(l
);
551 /// reclaim block space
552 int reclaim_blocks(unsigned bdev
, uint64_t want
,
553 PExtentVector
*extents
);
555 // handler for discard event
556 void handle_discard(unsigned dev
, interval_set
<uint64_t>& to_release
);
558 void flush(FileWriter
*h
, bool force
= false) {
559 std::unique_lock
l(lock
);
560 int r
= _flush(h
, force
, l
);
563 void flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
) {
564 std::lock_guard
l(lock
);
565 _flush_range(h
, offset
, length
);
567 int fsync(FileWriter
*h
) {
568 std::unique_lock
l(lock
);
569 int r
= _fsync(h
, l
);
570 _maybe_compact_log(l
);
573 int read(FileReader
*h
, FileReaderBuffer
*buf
, uint64_t offset
, size_t len
,
574 bufferlist
*outbl
, char *out
) {
575 // no need to hold the global lock here; we only touch h and
576 // h->file, and read vs write or delete is already protected (via
577 // atomics and asserts).
578 return _read(h
, buf
, offset
, len
, outbl
, out
);
580 int read_random(FileReader
*h
, uint64_t offset
, size_t len
,
582 // no need to hold the global lock here; we only touch h and
583 // h->file, and read vs write or delete is already protected (via
584 // atomics and asserts).
585 return _read_random(h
, offset
, len
, out
);
587 void invalidate_cache(FileRef f
, uint64_t offset
, uint64_t len
) {
588 std::lock_guard
l(lock
);
589 _invalidate_cache(f
, offset
, len
);
591 int preallocate(FileRef f
, uint64_t offset
, uint64_t len
) {
592 std::lock_guard
l(lock
);
593 return _preallocate(f
, offset
, len
);
595 int truncate(FileWriter
*h
, uint64_t offset
) {
596 std::lock_guard
l(lock
);
597 return _truncate(h
, offset
);
599 int do_replay_recovery_read(FileReader
*log
,
605 /// test purpose methods
606 void debug_inject_duplicate_gift(unsigned bdev
, uint64_t offset
, uint64_t len
);
607 const PerfCounters
* get_perf_counters() const {
612 class OriginalVolumeSelector
: public BlueFSVolumeSelector
{
618 OriginalVolumeSelector(
621 uint64_t _slow_total
)
622 : wal_total(_wal_total
), db_total(_db_total
), slow_total(_slow_total
) {}
624 void* get_hint_for_log() const override
;
625 void* get_hint_by_dir(const std::string
& dirname
) const override
;
627 void add_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
631 void sub_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
635 void add_usage(void* hint
, uint64_t fsize
) override
{
639 void sub_usage(void* hint
, uint64_t fsize
) override
{
644 uint8_t select_prefer_bdev(void* hint
) override
;
645 void get_paths(const std::string
& base
, paths
& res
) const override
;
646 void dump(ostream
& sout
) override
;