1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4 #define CEPH_OS_BLUESTORE_BLUEFS_H
9 #include "bluefs_types.h"
10 #include "BlockDevice.h"
12 #include "common/RefCountedObj.h"
13 #include "common/ceph_context.h"
14 #include "global/global_context.h"
15 #include "include/common_fwd.h"
17 #include "boost/intrusive/list.hpp"
18 #include "boost/dynamic_bitset.hpp"
23 l_bluefs_first
= 732600,
25 l_bluefs_reclaim_bytes
,
26 l_bluefs_db_total_bytes
,
27 l_bluefs_db_used_bytes
,
28 l_bluefs_wal_total_bytes
,
29 l_bluefs_wal_used_bytes
,
30 l_bluefs_slow_total_bytes
,
31 l_bluefs_slow_used_bytes
,
34 l_bluefs_log_compactions
,
35 l_bluefs_logged_bytes
,
36 l_bluefs_files_written_wal
,
37 l_bluefs_files_written_sst
,
38 l_bluefs_bytes_written_wal
,
39 l_bluefs_bytes_written_sst
,
40 l_bluefs_bytes_written_slow
,
41 l_bluefs_max_bytes_wal
,
42 l_bluefs_max_bytes_db
,
43 l_bluefs_max_bytes_slow
,
44 l_bluefs_read_random_count
,
45 l_bluefs_read_random_bytes
,
46 l_bluefs_read_random_disk_count
,
47 l_bluefs_read_random_disk_bytes
,
48 l_bluefs_read_random_buffer_count
,
49 l_bluefs_read_random_buffer_bytes
,
52 l_bluefs_read_prefetch_count
,
53 l_bluefs_read_prefetch_bytes
,
58 class BlueFSDeviceExpander
{
60 ~BlueFSDeviceExpander() {}
62 virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free
,
63 uint64_t bluefs_total
) = 0;
64 virtual int allocate_freespace(
67 PExtentVector
& extents
) = 0;
68 /** Reports the amount of space that can be transferred to BlueFS.
69 * This gives either the current state, when alloc_size is the allocation
70 * size BlueFS currently uses, or a simulation when alloc_size differs.
72 * alloc_size - allocation unit size to check
74 virtual uint64_t available_freespace(uint64_t alloc_size
) = 0;
77 class BlueFSVolumeSelector
{
79 typedef std::vector
<std::pair
<std::string
, uint64_t>> paths
;
81 virtual ~BlueFSVolumeSelector() {
83 virtual void* get_hint_by_device(uint8_t dev
) const = 0;
84 virtual void* get_hint_by_dir(const string
& dirname
) const = 0;
86 virtual void add_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
87 virtual void sub_usage(void* file_hint
, const bluefs_fnode_t
& fnode
) = 0;
88 virtual void add_usage(void* file_hint
, uint64_t fsize
) = 0;
89 virtual void sub_usage(void* file_hint
, uint64_t fsize
) = 0;
90 virtual uint8_t select_prefer_bdev(void* hint
) = 0;
91 virtual void get_paths(const std::string
& base
, paths
& res
) const = 0;
92 virtual void dump(ostream
& sout
) = 0;
// Block device slot ids. MAX_BDEV is the number of slots (it sizes the
// per-bdev arrays such as FileWriter::iocv and max_bytes); the BDEV_*
// values are the index of each device within those arrays.
99 static constexpr unsigned MAX_BDEV
= 5;
100 static constexpr unsigned BDEV_WAL
= 0; ///< db.wal/ - small, fast device for the WAL
101 static constexpr unsigned BDEV_DB
= 1; ///< db/ - the primary db device
102 static constexpr unsigned BDEV_SLOW
= 2; ///< db.slow/ - big, slow device to spill over to as BDEV_DB fills
// NOTE(review): the two ids below look like slots for newly added devices
// (cf. device_migrate_to_new / prepare_new_device) -- confirm against callers.
103 static constexpr unsigned BDEV_NEWWAL
= 3;
104 static constexpr unsigned BDEV_NEWDB
= 4;
112 struct File
: public RefCountedObject
{
113 MEMPOOL_CLASS_HELPERS();
115 bluefs_fnode_t fnode
;
120 boost::intrusive::list_member_hook
<> dirty_item
;
122 std::atomic_int num_readers
, num_writers
;
123 std::atomic_int num_reading
;
125 void* vselector_hint
= nullptr;
128 FRIEND_MAKE_REF(File
);
138 vselector_hint(nullptr)
141 ceph_assert(num_readers
.load() == 0);
142 ceph_assert(num_writers
.load() == 0);
143 ceph_assert(num_reading
.load() == 0);
144 ceph_assert(!locked
);
147 using FileRef
= ceph::ref_t
<File
>;
149 typedef boost::intrusive::list
<
151 boost::intrusive::member_hook
<
153 boost::intrusive::list_member_hook
<>,
154 &File::dirty_item
> > dirty_file_list_t
;
156 struct Dir
: public RefCountedObject
{
157 MEMPOOL_CLASS_HELPERS();
159 mempool::bluefs::map
<string
,FileRef
> file_map
;
162 FRIEND_MAKE_REF(Dir
);
165 using DirRef
= ceph::ref_t
<Dir
>;
168 MEMPOOL_CLASS_HELPERS();
171 uint64_t pos
= 0; ///< start offset for buffer
172 bufferlist buffer
; ///< new data to write (at end of file)
173 bufferlist tail_block
; ///< existing partial block at end of file, if any
174 bufferlist::page_aligned_appender buffer_appender
; ///< appender into buffer; for const char* appends only
175 int writer_type
= 0; ///< WRITER_*
176 int write_hint
= WRITE_LIFE_NOT_SET
;
178 ceph::mutex lock
= ceph::make_mutex("BlueFS::FileWriter::lock");
179 std::array
<IOContext
*,MAX_BDEV
> iocv
; ///< for each bdev
180 std::array
<bool, MAX_BDEV
> dirty_devs
;
182 FileWriter(FileRef f
)
183 : file(std::move(f
)),
184 buffer_appender(buffer
.get_page_aligned_appender(
185 g_conf()->bluefs_alloc_size
/ CEPH_PAGE_SIZE
)) {
188 dirty_devs
.fill(false);
189 if (file
->fnode
.ino
== 1) {
190 write_hint
= WRITE_LIFE_MEDIUM
;
193 // NOTE: caller must call BlueFS::close_writer()
198 // note: BlueRocksEnv uses this append exclusively, so it's safe
199 // to use buffer_appender exclusively here (e.g., its notion of
200 // offset will remain accurate).
201 void append(const char *buf
, size_t len
) {
202 buffer_appender
.append(buf
, len
);
205 // note: used internally only, for ino 1 or 0.
206 void append(bufferlist
& bl
) {
207 buffer
.claim_append(bl
);
210 uint64_t get_effective_write_pos() {
211 buffer_appender
.flush();
212 return pos
+ buffer
.length();
216 struct FileReaderBuffer
{
217 MEMPOOL_CLASS_HELPERS();
219 uint64_t bl_off
= 0; ///< prefetch buffer logical offset
220 bufferlist bl
; ///< prefetch buffer
221 uint64_t pos
= 0; ///< current logical offset
222 uint64_t max_prefetch
; ///< max allowed prefetch
224 explicit FileReaderBuffer(uint64_t mpf
)
225 : max_prefetch(mpf
) {}
227 uint64_t get_buf_end() const {
228 return bl_off
+ bl
.length();
230 uint64_t get_buf_remaining(uint64_t p
) const {
231 if (p
>= bl_off
&& p
< bl_off
+ bl
.length())
232 return bl_off
+ bl
.length() - p
;
236 void skip(size_t n
) {
239 void seek(uint64_t offset
) {
245 MEMPOOL_CLASS_HELPERS();
248 FileReaderBuffer buf
;
250 bool ignore_eof
; ///< used when reading our log file
252 ceph::shared_mutex lock
{
253 ceph::make_shared_mutex(std::string(), false, false, false)
257 FileReader(FileRef f
, uint64_t mpf
, bool rand
, bool ie
)
270 MEMPOOL_CLASS_HELPERS();
273 explicit FileLock(FileRef f
) : file(std::move(f
)) {}
277 ceph::mutex lock
= ceph::make_mutex("BlueFS::lock");
279 PerfCounters
*logger
= nullptr;
281 uint64_t max_bytes
[MAX_BDEV
] = {0};
282 uint64_t max_bytes_pcounters
[MAX_BDEV
] = {
283 l_bluefs_max_bytes_wal
,
284 l_bluefs_max_bytes_db
,
285 l_bluefs_max_bytes_slow
,
289 mempool::bluefs::map
<string
, DirRef
> dir_map
; ///< dirname -> Dir
290 mempool::bluefs::unordered_map
<uint64_t,FileRef
> file_map
; ///< ino -> File
292 // map of dirty files, files of same dirty_seq are grouped into list.
293 map
<uint64_t, dirty_file_list_t
> dirty_files
;
295 bluefs_super_t super
; ///< latest superblock (as last written)
296 uint64_t ino_last
= 0; ///< last assigned ino (this one is in use)
297 uint64_t log_seq
= 0; ///< last used log seq (by current pending log_t)
298 uint64_t log_seq_stable
= 0; ///< last stable/synced log seq
299 FileWriter
*log_writer
= nullptr; ///< writer for the log (null until a log writer is assigned)
300 bluefs_transaction_t log_t
; ///< pending, unwritten log transaction
301 bool log_flushing
= false; ///< true while flushing the log
302 ceph::condition_variable log_cond
;
304 uint64_t new_log_jump_to
= 0;
305 uint64_t old_log_jump_to
= 0;
306 FileRef new_log
= nullptr;
307 FileWriter
*new_log_writer
= nullptr;
310 * There are up to 3 block devices:
312 * BDEV_DB db/ - the primary db device
313 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
314 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
316 vector
<BlockDevice
*> bdev
; ///< block devices we can use
317 vector
<IOContext
*> ioc
; ///< IOContexts for bdevs
318 vector
<interval_set
<uint64_t> > block_all
; ///< extents in bdev we own
319 vector
<Allocator
*> alloc
; ///< allocators for bdevs
320 vector
<uint64_t> alloc_size
; ///< alloc size for each device
321 vector
<interval_set
<uint64_t>> pending_release
; ///< extents to release
322 vector
<interval_set
<uint64_t>> block_unused_too_granular
;
// NOTE(review): this array has 3 entries although MAX_BDEV is 5; confirm it
// is never indexed with BDEV_NEWWAL (3) or BDEV_NEWDB (4).
324 BlockDevice::aio_callback_t discard_cb
[3]; //discard callbacks for each dev
326 BlueFSDeviceExpander
* slow_dev_expander
= nullptr;
327 std::unique_ptr
<BlueFSVolumeSelector
> vselector
;
330 SocketHook
* asok_hook
= nullptr;
333 void _shutdown_logger();
334 void _update_logger_stats();
339 void _pad_bl(bufferlist
& bl
); ///< pad bufferlist to block size w/ zeros
341 FileRef
_get_file(uint64_t ino
);
342 void _drop_link(FileRef f
);
344 int _get_slow_device_id() {
  // Use the dedicated slow slot when bdev[BDEV_SLOW] is set; otherwise
  // the DB device id is reported as the slow device.
  if (bdev[BDEV_SLOW]) {
    return BDEV_SLOW;
  }
  return BDEV_DB;
}
345 const char* get_device_name(unsigned id
);
346 int _expand_slow_device(uint64_t min_size
, PExtentVector
& extents
);
347 int _allocate(uint8_t bdev
, uint64_t len
,
348 bluefs_fnode_t
* node
);
349 int _allocate_without_fallback(uint8_t id
, uint64_t len
,
350 PExtentVector
* extents
);
352 int _flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
);
353 int _flush(FileWriter
*h
, bool force
);
354 int _fsync(FileWriter
*h
, std::unique_lock
<ceph::mutex
>& l
);
357 void _claim_completed_aios(FileWriter
*h
, list
<aio_t
> *ls
);
358 void wait_for_aio(FileWriter
*h
); // safe to call without a lock
361 int _flush_and_sync_log(std::unique_lock
<ceph::mutex
>& l
,
362 uint64_t want_seq
= 0,
363 uint64_t jump_to
= 0);
364 uint64_t _estimate_log_size();
365 bool _should_compact_log();
373 void _compact_log_dump_metadata(bluefs_transaction_t
*t
,
375 void _compact_log_sync();
376 void _compact_log_async(std::unique_lock
<ceph::mutex
>& l
);
378 void _rewrite_log_and_layout_sync(bool allocate_with_fallback
,
383 std::optional
<bluefs_layout_t
> layout
);
385 //void _aio_finish(void *priv);
387 void _flush_bdev_safely(FileWriter
*h
);
388 void flush_bdev(); // this is safe to call without a lock
389 void flush_bdev(std::array
<bool, MAX_BDEV
>& dirty_bdevs
); // this is safe to call without a lock
391 int _preallocate(FileRef f
, uint64_t off
, uint64_t len
);
392 int _truncate(FileWriter
*h
, uint64_t off
);
395 FileReader
*h
, ///< [in] read from here
396 FileReaderBuffer
*buf
, ///< [in] reader state
397 uint64_t offset
, ///< [in] offset
398 size_t len
, ///< [in] this many bytes
399 bufferlist
*outbl
, ///< [out] optional: reference the result here
400 char *out
); ///< [out] optional: or copy it here
402 FileReader
*h
, ///< [in] read from here
403 uint64_t offset
, ///< [in] offset
404 uint64_t len
, ///< [in] this many bytes
405 char *out
); ///< [out] optional: or copy it here
407 void _invalidate_cache(FileRef f
, uint64_t offset
, uint64_t length
);
410 int _write_super(int dev
);
411 int _check_new_allocations(const bluefs_fnode_t
& fnode
,
413 boost::dynamic_bitset
<uint64_t>* owned_blocks
,
414 boost::dynamic_bitset
<uint64_t>* used_blocks
);
415 int _verify_alloc_granularity(
416 __u8 id
, uint64_t offset
, uint64_t length
,
418 int _adjust_granularity(
419 __u8 id
, uint64_t *offset
, uint64_t *length
, bool alloc
);
420 int _replay(bool noop
, bool to_stdout
= false); ///< replay journal
422 FileWriter
*_create_writer(FileRef f
);
423 void _close_writer(FileWriter
*h
);
425 // always put the super in the second 4k block. FIXME should this be
426 // block size independent?
427 unsigned get_super_offset() {
430 unsigned get_super_length() {
434 void _add_block_extent(unsigned bdev
, uint64_t offset
, uint64_t len
);
437 BlueFS(CephContext
* cct
);
440 // the super is always stored on bdev 0
441 int mkfs(uuid_d osd_uuid
, const bluefs_layout_t
& layout
);
443 int maybe_verify_layout(const bluefs_layout_t
& layout
) const;
445 int prepare_new_device(int id
, const bluefs_layout_t
& layout
);
449 void collect_metadata(map
<string
,string
> *pm
, unsigned skip_bdev_id
);
450 void get_devices(set
<string
> *ls
);
451 uint64_t get_alloc_size(int id
) {
452 return alloc_size
[id
];
456 int device_migrate_to_new(
458 const set
<int>& devs_source
,
460 const bluefs_layout_t
& layout
);
461 int device_migrate_to_existing(
463 const set
<int>& devs_source
,
465 const bluefs_layout_t
& layout
);
468 uint64_t get_total(unsigned id
);
469 uint64_t get_free(unsigned id
);
470 void get_usage(vector
<pair
<uint64_t,uint64_t>> *usage
); // [<free,total> ...]
471 void dump_perf_counters(Formatter
*f
);
473 void dump_block_extents(ostream
& out
);
475 /// get current extents that we own for given block device
476 int get_block_extents(unsigned id
, interval_set
<uint64_t> *extents
);
488 bool random
= false);
490 void close_writer(FileWriter
*h
) {
491 std::lock_guard
l(lock
);
495 int rename(const string
& old_dir
, const string
& old_file
,
496 const string
& new_dir
, const string
& new_file
);
498 int readdir(const string
& dirname
, vector
<string
> *ls
);
500 int unlink(const string
& dirname
, const string
& filename
);
501 int mkdir(const string
& dirname
);
502 int rmdir(const string
& dirname
);
503 bool wal_is_rotational();
505 bool dir_exists(const string
& dirname
);
506 int stat(const string
& dirname
, const string
& filename
,
507 uint64_t *size
, utime_t
*mtime
);
509 int lock_file(const string
& dirname
, const string
& filename
, FileLock
**p
);
510 int unlock_file(FileLock
*l
);
514 /// sync any uncommitted state to disk
515 void sync_metadata();
517 void set_slow_device_expander(BlueFSDeviceExpander
* a
) {
518 slow_dev_expander
= a
;
520 void set_volume_selector(BlueFSVolumeSelector
* s
) {
523 void dump_volume_selector(ostream
& sout
) {
524 vselector
->dump(sout
);
526 void get_vselector_paths(const std::string
& base
,
527 BlueFSVolumeSelector::paths
& res
) const {
528 return vselector
->get_paths(base
, res
);
531 int add_block_device(unsigned bdev
, const string
& path
, bool trim
,
532 bool shared_with_bluestore
=false);
533 bool bdev_support_label(unsigned id
);
534 uint64_t get_block_device_size(unsigned bdev
);
536 /// gift more block space
537 void add_block_extent(unsigned bdev
, uint64_t offset
, uint64_t len
) {
538 std::unique_lock
l(lock
);
539 _add_block_extent(bdev
, offset
, len
);
540 int r
= _flush_and_sync_log(l
);
544 /// reclaim block space
545 int reclaim_blocks(unsigned bdev
, uint64_t want
,
546 PExtentVector
*extents
);
548 // handler for discard event
549 void handle_discard(unsigned dev
, interval_set
<uint64_t>& to_release
);
551 void flush(FileWriter
*h
) {
552 std::lock_guard
l(lock
);
555 void flush_range(FileWriter
*h
, uint64_t offset
, uint64_t length
) {
556 std::lock_guard
l(lock
);
557 _flush_range(h
, offset
, length
);
559 int fsync(FileWriter
*h
) {
560 std::unique_lock
l(lock
);
563 int read(FileReader
*h
, FileReaderBuffer
*buf
, uint64_t offset
, size_t len
,
564 bufferlist
*outbl
, char *out
) {
565 // no need to hold the global lock here; we only touch h and
566 // h->file, and read vs write or delete is already protected (via
567 // atomics and asserts).
568 return _read(h
, buf
, offset
, len
, outbl
, out
);
570 int read_random(FileReader
*h
, uint64_t offset
, size_t len
,
572 // no need to hold the global lock here; we only touch h and
573 // h->file, and read vs write or delete is already protected (via
574 // atomics and asserts).
575 return _read_random(h
, offset
, len
, out
);
577 void invalidate_cache(FileRef f
, uint64_t offset
, uint64_t len
) {
578 std::lock_guard
l(lock
);
579 _invalidate_cache(f
, offset
, len
);
581 int preallocate(FileRef f
, uint64_t offset
, uint64_t len
) {
582 std::lock_guard
l(lock
);
583 return _preallocate(f
, offset
, len
);
585 int truncate(FileWriter
*h
, uint64_t offset
) {
586 std::lock_guard
l(lock
);
587 return _truncate(h
, offset
);
590 /// test purpose methods
591 void debug_inject_duplicate_gift(unsigned bdev
, uint64_t offset
, uint64_t len
);
592 const PerfCounters
* get_perf_counters() const {
597 class OriginalVolumeSelector
: public BlueFSVolumeSelector
{
603 OriginalVolumeSelector(
606 uint64_t _slow_total
)
607 : wal_total(_wal_total
), db_total(_db_total
), slow_total(_slow_total
) {}
609 void* get_hint_by_device(uint8_t dev
) const override
;
610 void* get_hint_by_dir(const string
& dirname
) const override
;
612 void add_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
616 void sub_usage(void* hint
, const bluefs_fnode_t
& fnode
) override
{
620 void add_usage(void* hint
, uint64_t fsize
) override
{
624 void sub_usage(void* hint
, uint64_t fsize
) override
{
629 uint8_t select_prefer_bdev(void* hint
) override
;
630 void get_paths(const std::string
& base
, paths
& res
) const override
;
631 void dump(ostream
& sout
) override
;