1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4 #define CEPH_OS_BLUESTORE_BLUEFS_H
9 #include "bluefs_types.h"
10 #include "common/RefCountedObj.h"
11 #include "BlockDevice.h"
13 #include "boost/intrusive/list.hpp"
14 #include <boost/intrusive_ptr.hpp>
// Perf counter indices for BlueFS statistics (range starts at 732600).
// The enclosing `enum { ... }` lines are outside this excerpt, and some
// enumerators between the visible ones are missing from it.
// NOTE(review): per-counter meanings below are inferred from the names;
// confirm against the PerfCountersBuilder registration in the .cc file.
l_bluefs_first = 732600,
l_bluefs_reclaim_bytes,       // bytes reclaimed (returned to BlueStore?)
l_bluefs_db_total_bytes,      // total bytes on the DB device
l_bluefs_db_used_bytes,       // used bytes on the DB device
l_bluefs_wal_total_bytes,     // total bytes on the WAL device
l_bluefs_wal_used_bytes,      // used bytes on the WAL device
l_bluefs_slow_total_bytes,    // total bytes on the slow device
l_bluefs_slow_used_bytes,     // used bytes on the slow device
l_bluefs_log_compactions,     // metadata log compactions performed
l_bluefs_logged_bytes,        // bytes written to the metadata log
l_bluefs_files_written_wal,   // WAL files written
l_bluefs_files_written_sst,   // SST files written
l_bluefs_bytes_written_wal,   // bytes written to WAL files
l_bluefs_bytes_written_sst,   // bytes written to SST files
// Fixed block-device slots.  BlueFS can use up to MAX_BDEV devices,
// addressed by these indices (see the device comment further down:
// BDEV_DB is the primary device, BDEV_WAL a small fast one, BDEV_SLOW
// a big slow spill-over device).
static constexpr unsigned MAX_BDEV = 3;
static constexpr unsigned BDEV_WAL = 0;
static constexpr unsigned BDEV_DB = 1;
static constexpr unsigned BDEV_SLOW = 2;
// In-memory state for a single BlueFS file.  Reference counted and held
// via FileRef (boost::intrusive_ptr).
// NOTE(review): this excerpt is incomplete -- several member
// declarations, the constructor signature, and the destructor/friend
// bodies fall outside the visible lines.
struct File : public RefCountedObject {
  MEMPOOL_CLASS_HELPERS();

  // Hook for membership in a dirty_file_list_t (see the dirty_files map).
  boost::intrusive::list_member_hook<> dirty_item;

  // Concurrency accounting; asserted zero at destruction below.
  std::atomic_int num_readers, num_writers;
  std::atomic_int num_reading;

  // Constructor initializer-list fragment (signature not in excerpt).
  : RefCountedObject(NULL, 0),

  // Destructor sanity checks: no readers or writers may remain.
  assert(num_readers.load() == 0);
  assert(num_writers.load() == 0);
  assert(num_reading.load() == 0);

  // Intrusive refcount hooks (bodies outside this excerpt).
  friend void intrusive_ptr_add_ref(File *f) {
  friend void intrusive_ptr_release(File *f) {

typedef boost::intrusive_ptr<File> FileRef;
// Intrusive list of dirty Files, linked through File::dirty_item.
// NOTE(review): the element-type template arguments (`File,` lines) are
// missing from this excerpt of the typedef.
typedef boost::intrusive::list<
  boost::intrusive::member_hook<
    boost::intrusive::list_member_hook<>,
    &File::dirty_item> > dirty_file_list_t;
// In-memory directory: a name -> FileRef map.  Reference counted and
// held via DirRef.  (Only the head of the struct is visible here.)
struct Dir : public RefCountedObject {
  MEMPOOL_CLASS_HELPERS();

  mempool::bluefs::map<string,FileRef> file_map;  ///< filename -> File
106 Dir() : RefCountedObject(NULL
, 0) {}
  // Intrusive refcount hooks (bodies outside this excerpt).
  friend void intrusive_ptr_add_ref(Dir *d) {
  friend void intrusive_ptr_release(Dir *d) {

typedef boost::intrusive_ptr<Dir> DirRef;
// --- FileWriter members (the enclosing `struct FileWriter {` line and
// the destructor fall outside this excerpt) ---
MEMPOOL_CLASS_HELPERS();

uint64_t pos;              ///< start offset for buffer
bufferlist buffer;         ///< new data to write (at end of file)
bufferlist tail_block;     ///< existing partial block at end of file, if any
bufferlist::page_aligned_appender buffer_appender;  ///< for const char* only
int writer_type = 0;       ///< WRITER_*

std::array<IOContext*,MAX_BDEV> iocv;  ///< for each bdev

// Constructor; initializer list is partially elided in this excerpt.
// The appender is sized in pages from the configured bluefs allocation
// unit so const char* appends stay page aligned.
FileWriter(FileRef f)
  buffer_appender(buffer.get_page_aligned_appender(
    g_conf->bluefs_alloc_size / CEPH_PAGE_SIZE)) {

// NOTE: caller must call BlueFS::close_writer()

// note: BlueRocksEnv uses this append exclusively, so it's safe
// to use buffer_appender exclusively here (e.g., it's notion of
// offset will remain accurate).
void append(const char *buf, size_t len) {
  buffer_appender.append(buf, len);

// note: used internally only, for ino 1 or 0.
// Takes ownership of bl's buffers (claim_append drains bl).
void append(bufferlist& bl) {
  buffer.claim_append(bl);

// Logical end-of-write position: buffer start offset plus buffered
// bytes.  flush() first so the appender's pending bytes are reflected
// in buffer.length().
uint64_t get_effective_write_pos() {
  buffer_appender.flush();
  return pos + buffer.length();
// Prefetch/read-ahead buffer state used by a FileReader.
// NOTE(review): the constructor initializers, the not-buffered branch
// of get_buf_remaining(), and the skip()/seek() bodies are outside this
// excerpt.
struct FileReaderBuffer {
  MEMPOOL_CLASS_HELPERS();

  uint64_t bl_off;        ///< prefetch buffer logical offset
  bufferlist bl;          ///< prefetch buffer
  uint64_t pos;           ///< current logical offset
  uint64_t max_prefetch;  ///< max allowed prefetch

  // mpf: maximum prefetch size.
  explicit FileReaderBuffer(uint64_t mpf)

  // Logical offset just past the end of the buffered data.
  uint64_t get_buf_end() {
    return bl_off + bl.length();

  // Bytes still available in the buffer at logical offset p, when p
  // falls inside the buffered range [bl_off, bl_off + bl.length()).
  uint64_t get_buf_remaining(uint64_t p) {
    if (p >= bl_off && p < bl_off + bl.length())
      return bl_off + bl.length() - p;

  void skip(size_t n) {
  void seek(uint64_t offset) {
// --- FileReader members (the enclosing struct declaration and several
// members are outside this excerpt) ---
MEMPOOL_CLASS_HELPERS();

FileReaderBuffer buf;  ///< prefetch buffer and position state

bool ignore_eof;       ///< used when reading our log file

// f: file to read; mpf: max prefetch; rand: random-access hint;
// ie: ignore eof.  (Initializer list outside this excerpt.)
FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)

// --- FileLock: a lock handle on a file (see lock_file/unlock_file) ---
MEMPOOL_CLASS_HELPERS();

explicit FileLock(FileRef f) : file(f) {}
PerfCounters *logger = nullptr;  ///< l_bluefs_* counters; see _update_logger_stats()

// Namespace: every directory and file known to this BlueFS instance.
mempool::bluefs::map<string, DirRef> dir_map;               ///< dirname -> Dir
mempool::bluefs::unordered_map<uint64_t,FileRef> file_map;  ///< ino -> File

// map of dirty files, files of same dirty_seq are grouped into list.
map<uint64_t, dirty_file_list_t> dirty_files;

bluefs_super_t super;        ///< latest superblock (as last written)
uint64_t ino_last = 0;       ///< last assigned ino (this one is in use)
uint64_t log_seq = 0;        ///< last used log seq (by current pending log_t)
uint64_t log_seq_stable = 0; ///< last stable/synced log seq
FileWriter *log_writer = 0;  ///< writer for the log
bluefs_transaction_t log_t;  ///< pending, unwritten log transaction
bool log_flushing = false;   ///< true while flushing the log
std::condition_variable log_cond;  // NOTE(review): presumably paired with
                                   // log_flushing; confirm in the .cc file

// State for replacing the metadata log -- presumably used by
// _compact_log_async()/_compact_log_sync(); verify against the .cc file.
uint64_t new_log_jump_to = 0;
uint64_t old_log_jump_to = 0;
FileRef new_log = nullptr;
FileWriter *new_log_writer = nullptr;
/*
 * There are up to 3 block devices:
 *
 *  BDEV_DB   db/      - the primary db device
 *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
 *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
 */
vector<BlockDevice*> bdev;                  ///< block devices we can use
vector<IOContext*> ioc;                     ///< IOContexts for bdevs
vector<interval_set<uint64_t> > block_all;  ///< extents in bdev we own
vector<uint64_t> block_total;               ///< sum of block_all
vector<Allocator*> alloc;                   ///< allocators for bdevs
vector<interval_set<uint64_t>> pending_release;  ///< extents to release
// --- internal helpers (callers hold the global lock unless noted) ---
void _shutdown_logger();
void _update_logger_stats();

void _pad_bl(bufferlist& bl);  ///< pad bufferlist to block size w/ zeros

FileRef _get_file(uint64_t ino);
void _drop_link(FileRef f);

// Allocate len bytes on device bdev, recording extents in *node.
int _allocate(uint8_t bdev, uint64_t len,
	      bluefs_fnode_t* node);
int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
int _flush(FileWriter *h, bool force);
// Takes the global lock (by reference) so it can be dropped while waiting.
int _fsync(FileWriter *h, std::unique_lock<std::mutex>& l);

void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
void wait_for_aio(FileWriter *h);  // safe to call without a lock

int _flush_and_sync_log(std::unique_lock<std::mutex>& l,
			uint64_t want_seq = 0,
			uint64_t jump_to = 0);
uint64_t _estimate_log_size();
bool _should_compact_log();
void _compact_log_dump_metadata(bluefs_transaction_t *t);
void _compact_log_sync();
void _compact_log_async(std::unique_lock<std::mutex>& l);

//void _aio_finish(void *priv);

void _flush_bdev_safely(FileWriter *h);
void flush_bdev();  // this is safe to call without a lock

int _preallocate(FileRef f, uint64_t off, uint64_t len);
int _truncate(FileWriter *h, uint64_t off);
// Parameter list of the internal _read() (the `int _read(` declaration
// line is outside this excerpt):
  FileReader *h,         ///< [in] read from here
  FileReaderBuffer *buf, ///< [in] reader state
  uint64_t offset,       ///< [in] offset
  size_t len,            ///< [in] this many bytes
  bufferlist *outbl,     ///< [out] optional: reference the result here
  char *out);            ///< [out] optional: or copy it here
// Parameter list of the internal _read_random() (declaration line
// outside this excerpt):
  FileReader *h,         ///< [in] read from here
  uint64_t offset,       ///< [in] offset
  size_t len,            ///< [in] this many bytes
  char *out);            ///< [out] optional: or copy it here

void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);

int _replay(bool noop);  ///< replay journal

FileWriter *_create_writer(FileRef f);
void _close_writer(FileWriter *h);

// always put the super in the second 4k block.  FIXME should this be
// block size independent?
// (Both getter bodies are outside this excerpt.)
unsigned get_super_offset() {
unsigned get_super_length() {
BlueFS(CephContext* cct);

// the super is always stored on bdev 0
int mkfs(uuid_d osd_uuid);

void collect_metadata(map<string,string> *pm);

// Usage/statistics queries; `id` is one of the BDEV_* indices.
uint64_t get_fs_usage();
uint64_t get_total(unsigned id);
uint64_t get_free(unsigned id);
void get_usage(vector<pair<uint64_t,uint64_t>> *usage);  // [<free,total> ...]
void dump_perf_counters(Formatter *f);

void dump_block_extents(ostream& out);

/// get current extents that we own for given block device
int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
// Trailing parameter of an open-for-read-style declaration (the leading
// lines of the declaration are outside this excerpt):
  bool random = false);

// Close a writer handle (remainder of the body outside this excerpt).
void close_writer(FileWriter *h) {
  std::lock_guard<std::mutex> l(lock);

int rename(const string& old_dir, const string& old_file,
	   const string& new_dir, const string& new_file);

int readdir(const string& dirname, vector<string> *ls);

int unlink(const string& dirname, const string& filename);
int mkdir(const string& dirname);
int rmdir(const string& dirname);
bool wal_is_rotational();

bool dir_exists(const string& dirname);
int stat(const string& dirname, const string& filename,
	 uint64_t *size, utime_t *mtime);

// Advisory file locking; *p receives the lock handle on success.
int lock_file(const string& dirname, const string& filename, FileLock **p);
int unlock_file(FileLock *l);

/// sync any uncommitted state to disk
void sync_metadata();

// Device management; `bdev`/`id` is one of the BDEV_* indices.
int add_block_device(unsigned bdev, const string& path);
bool bdev_support_label(unsigned id);
uint64_t get_block_device_size(unsigned bdev);

/// gift more block space
void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);

/// reclaim block space
int reclaim_blocks(unsigned bdev, uint64_t want,
		   AllocExtentVector *extents);
// --- public I/O wrappers: take the global lock, then delegate to the
// corresponding _-prefixed implementation.  NOTE(review): several
// closing braces / trailing statements fall outside this excerpt. ---

void flush(FileWriter *h) {
  std::lock_guard<std::mutex> l(lock);

void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
  std::lock_guard<std::mutex> l(lock);
  _flush_range(h, offset, length);

// unique_lock (not lock_guard) because _fsync may drop/reacquire it.
int fsync(FileWriter *h) {
  std::unique_lock<std::mutex> l(lock);

int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
	 bufferlist *outbl, char *out) {
  // no need to hold the global lock here; we only touch h and
  // h->file, and read vs write or delete is already protected (via
  // atomics and asserts).
  return _read(h, buf, offset, len, outbl, out);

// (leading lines of this declaration continue past the excerpt boundary)
int read_random(FileReader *h, uint64_t offset, size_t len,
  // no need to hold the global lock here; we only touch h and
  // h->file, and read vs write or delete is already protected (via
  // atomics and asserts).
  return _read_random(h, offset, len, out);

void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
  std::lock_guard<std::mutex> l(lock);
  _invalidate_cache(f, offset, len);

int preallocate(FileRef f, uint64_t offset, uint64_t len) {
  std::lock_guard<std::mutex> l(lock);
  return _preallocate(f, offset, len);

int truncate(FileWriter *h, uint64_t offset) {
  std::lock_guard<std::mutex> l(lock);
  return _truncate(h, offset);