]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | #ifndef CEPH_OS_BLUESTORE_BLUEFS_H | |
4 | #define CEPH_OS_BLUESTORE_BLUEFS_H | |
5 | ||
6 | #include <atomic> | |
7 | #include <mutex> | |
cd265ab1 | 8 | #include <limits> |
7c673cae FG |
9 | |
10 | #include "bluefs_types.h" | |
f67539c2 | 11 | #include "blk/BlockDevice.h" |
7c673cae | 12 | |
9f95a23c TL |
13 | #include "common/RefCountedObj.h" |
14 | #include "common/ceph_context.h" | |
15 | #include "global/global_context.h" | |
16 | #include "include/common_fwd.h" | |
7c673cae | 17 | |
9f95a23c TL |
18 | #include "boost/intrusive/list.hpp" |
19 | #include "boost/dynamic_bitset.hpp" | |
7c673cae FG |
20 | |
21 | class Allocator; | |
22 | ||
/// Counter identifiers for BlueFS (see max_bytes_pcounters / logger in class
/// BlueFS).  Values are consecutive starting at l_bluefs_first, so the order
/// of enumerators is significant: inserting or reordering entries changes
/// the numeric value of everything that follows.  l_bluefs_last is a
/// sentinel one past the final real counter.
enum {
  l_bluefs_first = 732600,
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,
  l_bluefs_main_alloc_unit,
  l_bluefs_db_alloc_unit,
  l_bluefs_wal_alloc_unit,
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_disk_bytes_wal,
  l_bluefs_read_random_disk_bytes_db,
  l_bluefs_read_random_disk_bytes_slow,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_disk_count,
  l_bluefs_read_disk_bytes,
  l_bluefs_read_disk_bytes_wal,
  l_bluefs_read_disk_bytes_db,
  l_bluefs_read_disk_bytes_slow,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,
  l_bluefs_read_zeros_candidate,
  l_bluefs_read_zeros_errors,

  l_bluefs_last,
};
69 | ||
9f95a23c TL |
70 | class BlueFSVolumeSelector { |
71 | public: | |
72 | typedef std::vector<std::pair<std::string, uint64_t>> paths; | |
73 | ||
74 | virtual ~BlueFSVolumeSelector() { | |
75 | } | |
f6b5b4d7 | 76 | virtual void* get_hint_for_log() const = 0; |
b3b6e05e | 77 | virtual void* get_hint_by_dir(std::string_view dirname) const = 0; |
9f95a23c TL |
78 | |
79 | virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
80 | virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
81 | virtual void add_usage(void* file_hint, uint64_t fsize) = 0; | |
82 | virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; | |
83 | virtual uint8_t select_prefer_bdev(void* hint) = 0; | |
84 | virtual void get_paths(const std::string& base, paths& res) const = 0; | |
f67539c2 | 85 | virtual void dump(std::ostream& sout) = 0; |
20effc67 TL |
86 | |
87 | /* used for sanity checking of vselector */ | |
88 | virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; } | |
89 | virtual bool compare(BlueFSVolumeSelector* other) { return true; }; | |
f67539c2 TL |
90 | }; |
91 | ||
92 | struct bluefs_shared_alloc_context_t { | |
93 | bool need_init = false; | |
94 | Allocator* a = nullptr; | |
95 | ||
96 | std::atomic<uint64_t> bluefs_used = 0; | |
97 | ||
98 | void set(Allocator* _a) { | |
99 | a = _a; | |
100 | need_init = true; | |
101 | bluefs_used = 0; | |
102 | } | |
103 | void reset() { | |
104 | a = nullptr; | |
105 | } | |
11fdf7f2 TL |
106 | }; |
107 | ||
7c673cae FG |
108 | class BlueFS { |
109 | public: | |
110 | CephContext* cct; | |
11fdf7f2 | 111 | static constexpr unsigned MAX_BDEV = 5; |
7c673cae FG |
112 | static constexpr unsigned BDEV_WAL = 0; |
113 | static constexpr unsigned BDEV_DB = 1; | |
114 | static constexpr unsigned BDEV_SLOW = 2; | |
11fdf7f2 TL |
115 | static constexpr unsigned BDEV_NEWWAL = 3; |
116 | static constexpr unsigned BDEV_NEWDB = 4; | |
7c673cae FG |
117 | |
118 | enum { | |
119 | WRITER_UNKNOWN, | |
120 | WRITER_WAL, | |
121 | WRITER_SST, | |
122 | }; | |
123 | ||
124 | struct File : public RefCountedObject { | |
125 | MEMPOOL_CLASS_HELPERS(); | |
126 | ||
127 | bluefs_fnode_t fnode; | |
128 | int refs; | |
129 | uint64_t dirty_seq; | |
130 | bool locked; | |
131 | bool deleted; | |
522d829b | 132 | bool is_dirty; |
7c673cae FG |
133 | boost::intrusive::list_member_hook<> dirty_item; |
134 | ||
135 | std::atomic_int num_readers, num_writers; | |
136 | std::atomic_int num_reading; | |
137 | ||
9f95a23c | 138 | void* vselector_hint = nullptr; |
20effc67 TL |
139 | /* lock protects fnode and other the parts that can be modified during read & write operations. |
140 | Does not protect values that are fixed | |
141 | Does not need to be taken when doing one-time operations: | |
142 | _replay, device_migrate_to_existing, device_migrate_to_new */ | |
143 | ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock"); | |
9f95a23c TL |
144 | |
145 | private: | |
146 | FRIEND_MAKE_REF(File); | |
7c673cae | 147 | File() |
9f95a23c | 148 | : |
7c673cae FG |
149 | refs(0), |
150 | dirty_seq(0), | |
151 | locked(false), | |
152 | deleted(false), | |
522d829b | 153 | is_dirty(false), |
7c673cae FG |
154 | num_readers(0), |
155 | num_writers(0), | |
9f95a23c TL |
156 | num_reading(0), |
157 | vselector_hint(nullptr) | |
7c673cae FG |
158 | {} |
159 | ~File() override { | |
11fdf7f2 TL |
160 | ceph_assert(num_readers.load() == 0); |
161 | ceph_assert(num_writers.load() == 0); | |
162 | ceph_assert(num_reading.load() == 0); | |
163 | ceph_assert(!locked); | |
7c673cae | 164 | } |
7c673cae | 165 | }; |
9f95a23c | 166 | using FileRef = ceph::ref_t<File>; |
7c673cae FG |
167 | |
168 | typedef boost::intrusive::list< | |
169 | File, | |
170 | boost::intrusive::member_hook< | |
171 | File, | |
172 | boost::intrusive::list_member_hook<>, | |
173 | &File::dirty_item> > dirty_file_list_t; | |
174 | ||
175 | struct Dir : public RefCountedObject { | |
176 | MEMPOOL_CLASS_HELPERS(); | |
177 | ||
b3b6e05e | 178 | mempool::bluefs::map<std::string, FileRef, std::less<>> file_map; |
7c673cae | 179 | |
9f95a23c TL |
180 | private: |
181 | FRIEND_MAKE_REF(Dir); | |
182 | Dir() = default; | |
7c673cae | 183 | }; |
9f95a23c | 184 | using DirRef = ceph::ref_t<Dir>; |
7c673cae FG |
185 | |
186 | struct FileWriter { | |
187 | MEMPOOL_CLASS_HELPERS(); | |
188 | ||
189 | FileRef file; | |
9f95a23c | 190 | uint64_t pos = 0; ///< start offset for buffer |
f67539c2 TL |
191 | private: |
192 | ceph::buffer::list buffer; ///< new data to write (at end of file) | |
193 | ceph::buffer::list tail_block; ///< existing partial block at end of file, if any | |
194 | public: | |
195 | unsigned get_buffer_length() const { | |
196 | return buffer.length(); | |
197 | } | |
198 | ceph::bufferlist flush_buffer( | |
199 | CephContext* cct, | |
200 | const bool partial, | |
201 | const unsigned length, | |
202 | const bluefs_super_t& super); | |
203 | ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only | |
204 | public: | |
7c673cae | 205 | int writer_type = 0; ///< WRITER_* |
11fdf7f2 | 206 | int write_hint = WRITE_LIFE_NOT_SET; |
7c673cae | 207 | |
11fdf7f2 | 208 | ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock"); |
7c673cae | 209 | std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev |
11fdf7f2 | 210 | std::array<bool, MAX_BDEV> dirty_devs; |
7c673cae FG |
211 | |
212 | FileWriter(FileRef f) | |
9f95a23c | 213 | : file(std::move(f)), |
f67539c2 TL |
214 | buffer_appender(buffer.get_page_aligned_appender( |
215 | g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { | |
7c673cae FG |
216 | ++file->num_writers; |
217 | iocv.fill(nullptr); | |
11fdf7f2 | 218 | dirty_devs.fill(false); |
9f95a23c | 219 | if (file->fnode.ino == 1) { |
11fdf7f2 TL |
220 | write_hint = WRITE_LIFE_MEDIUM; |
221 | } | |
7c673cae FG |
222 | } |
223 | // NOTE: caller must call BlueFS::close_writer() | |
224 | ~FileWriter() { | |
225 | --file->num_writers; | |
226 | } | |
227 | ||
228 | // note: BlueRocksEnv uses this append exclusively, so it's safe | |
20effc67 | 229 | // to use buffer_appender exclusively here (e.g., its notion of |
7c673cae FG |
230 | // offset will remain accurate). |
231 | void append(const char *buf, size_t len) { | |
f67539c2 | 232 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 233 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
234 | buffer_appender.append(buf, len); |
235 | } | |
236 | ||
20effc67 TL |
237 | void append(const std::byte *buf, size_t len) { |
238 | // allow callers to use byte type instead of char* as we simply pass byte array | |
239 | append((const char*)buf, len); | |
240 | } | |
241 | ||
7c673cae | 242 | // note: used internally only, for ino 1 or 0. |
cd265ab1 | 243 | void append(ceph::buffer::list& bl) { |
f67539c2 | 244 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 245 | ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
246 | buffer.claim_append(bl); |
247 | } | |
248 | ||
f67539c2 TL |
249 | void append_zero(size_t len) { |
250 | uint64_t l0 = get_buffer_length(); | |
251 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); | |
252 | buffer_appender.append_zero(len); | |
253 | } | |
254 | ||
7c673cae | 255 | uint64_t get_effective_write_pos() { |
7c673cae FG |
256 | return pos + buffer.length(); |
257 | } | |
258 | }; | |
259 | ||
260 | struct FileReaderBuffer { | |
261 | MEMPOOL_CLASS_HELPERS(); | |
262 | ||
9f95a23c | 263 | uint64_t bl_off = 0; ///< prefetch buffer logical offset |
f67539c2 | 264 | ceph::buffer::list bl; ///< prefetch buffer |
9f95a23c | 265 | uint64_t pos = 0; ///< current logical offset |
7c673cae FG |
266 | uint64_t max_prefetch; ///< max allowed prefetch |
267 | ||
268 | explicit FileReaderBuffer(uint64_t mpf) | |
9f95a23c | 269 | : max_prefetch(mpf) {} |
7c673cae | 270 | |
9f95a23c | 271 | uint64_t get_buf_end() const { |
7c673cae FG |
272 | return bl_off + bl.length(); |
273 | } | |
9f95a23c | 274 | uint64_t get_buf_remaining(uint64_t p) const { |
7c673cae FG |
275 | if (p >= bl_off && p < bl_off + bl.length()) |
276 | return bl_off + bl.length() - p; | |
277 | return 0; | |
278 | } | |
279 | ||
280 | void skip(size_t n) { | |
281 | pos += n; | |
282 | } | |
f67539c2 TL |
283 | |
284 | // For the sake of simplicity, we invalidate completed rather than | |
285 | // for the provided extent | |
286 | void invalidate_cache(uint64_t offset, uint64_t length) { | |
287 | if (offset >= bl_off && offset < get_buf_end()) { | |
288 | bl.clear(); | |
289 | bl_off = 0; | |
290 | } | |
7c673cae FG |
291 | } |
292 | }; | |
293 | ||
294 | struct FileReader { | |
295 | MEMPOOL_CLASS_HELPERS(); | |
296 | ||
297 | FileRef file; | |
298 | FileReaderBuffer buf; | |
299 | bool random; | |
300 | bool ignore_eof; ///< used when reading our log file | |
301 | ||
494da23a TL |
302 | ceph::shared_mutex lock { |
303 | ceph::make_shared_mutex(std::string(), false, false, false) | |
304 | }; | |
305 | ||
306 | ||
7c673cae FG |
307 | FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) |
308 | : file(f), | |
309 | buf(mpf), | |
310 | random(rand), | |
311 | ignore_eof(ie) { | |
312 | ++file->num_readers; | |
313 | } | |
314 | ~FileReader() { | |
315 | --file->num_readers; | |
316 | } | |
317 | }; | |
318 | ||
319 | struct FileLock { | |
320 | MEMPOOL_CLASS_HELPERS(); | |
321 | ||
322 | FileRef file; | |
9f95a23c | 323 | explicit FileLock(FileRef f) : file(std::move(f)) {} |
7c673cae FG |
324 | }; |
325 | ||
326 | private: | |
7c673cae FG |
327 | PerfCounters *logger = nullptr; |
328 | ||
11fdf7f2 TL |
329 | uint64_t max_bytes[MAX_BDEV] = {0}; |
330 | uint64_t max_bytes_pcounters[MAX_BDEV] = { | |
331 | l_bluefs_max_bytes_wal, | |
332 | l_bluefs_max_bytes_db, | |
333 | l_bluefs_max_bytes_slow, | |
334 | }; | |
335 | ||
7c673cae | 336 | // cache |
20effc67 TL |
337 | struct { |
338 | ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock"); | |
339 | mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map; ///< dirname -> Dir | |
340 | mempool::bluefs::unordered_map<uint64_t, FileRef> file_map; ///< ino -> File | |
341 | } nodes; | |
7c673cae FG |
342 | |
343 | bluefs_super_t super; ///< latest superblock (as last written) | |
344 | uint64_t ino_last = 0; ///< last assigned ino (this one is in use) | |
7c673cae | 345 | |
20effc67 TL |
346 | struct { |
347 | ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock"); | |
348 | uint64_t seq_live = 1; //seq that log is currently writing to; mirrors dirty.seq_live | |
349 | FileWriter *writer = 0; | |
350 | bluefs_transaction_t t; | |
351 | } log; | |
352 | ||
353 | struct { | |
354 | ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock"); | |
355 | uint64_t seq_stable = 0; //seq that is now stable on disk | |
356 | uint64_t seq_live = 1; //seq that is ongoing and dirty files will be written to | |
357 | // map of dirty files, files of same dirty_seq are grouped into list. | |
358 | std::map<uint64_t, dirty_file_list_t> files; | |
359 | std::vector<interval_set<uint64_t>> pending_release; ///< extents to release | |
360 | // TODO: it should be examined what makes pending_release immune to | |
361 | // eras in a way similar to dirty_files. Hints: | |
362 | // 1) we have actually only 2 eras: log_seq and log_seq+1 | |
363 | // 2) we usually not remove extents from files. And when we do, we force log-syncing. | |
364 | } dirty; | |
365 | ||
366 | ceph::condition_variable log_cond; ///< used for state control between log flush / log compaction | |
367 | std::atomic<bool> log_is_compacting{false}; ///< signals that bluefs log is already ongoing compaction | |
368 | std::atomic<bool> log_forbidden_to_expand{false}; ///< used to signal that async compaction is in state | |
369 | /// that prohibits expansion of bluefs log | |
7c673cae FG |
370 | /* |
371 | * There are up to 3 block devices: | |
372 | * | |
373 | * BDEV_DB db/ - the primary db device | |
374 | * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL | |
375 | * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills | |
376 | */ | |
f67539c2 TL |
377 | std::vector<BlockDevice*> bdev; ///< block devices we can use |
378 | std::vector<IOContext*> ioc; ///< IOContexts for bdevs | |
379 | std::vector<uint64_t> block_reserved; ///< starting reserve extent per device | |
380 | std::vector<Allocator*> alloc; ///< allocators for bdevs | |
381 | std::vector<uint64_t> alloc_size; ///< alloc size for each device | |
20effc67 | 382 | |
f67539c2 | 383 | //std::vector<interval_set<uint64_t>> block_unused_too_granular; |
7c673cae | 384 | |
11fdf7f2 TL |
385 | BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev |
386 | ||
9f95a23c | 387 | std::unique_ptr<BlueFSVolumeSelector> vselector; |
11fdf7f2 | 388 | |
f67539c2 TL |
389 | bluefs_shared_alloc_context_t* shared_alloc = nullptr; |
390 | unsigned shared_alloc_id = unsigned(-1); | |
391 | inline bool is_shared_alloc(unsigned id) const { | |
392 | return id == shared_alloc_id; | |
393 | } | |
394 | ||
eafe8130 TL |
395 | class SocketHook; |
396 | SocketHook* asok_hook = nullptr; | |
cd265ab1 TL |
397 | // used to trigger zeros into read (debug / verify) |
398 | std::atomic<uint64_t> inject_read_zeros{0}; | |
eafe8130 | 399 | |
7c673cae FG |
400 | void _init_logger(); |
401 | void _shutdown_logger(); | |
402 | void _update_logger_stats(); | |
403 | ||
404 | void _init_alloc(); | |
405 | void _stop_alloc(); | |
406 | ||
f67539c2 TL |
407 | void _pad_bl(ceph::buffer::list& bl); ///< pad ceph::buffer::list to block size w/ zeros |
408 | ||
409 | uint64_t _get_used(unsigned id) const; | |
410 | uint64_t _get_total(unsigned id) const; | |
411 | ||
7c673cae FG |
412 | |
413 | FileRef _get_file(uint64_t ino); | |
20effc67 | 414 | void _drop_link_D(FileRef f); |
7c673cae | 415 | |
1911f103 TL |
416 | unsigned _get_slow_device_id() { |
417 | return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; | |
418 | } | |
eafe8130 | 419 | const char* get_device_name(unsigned id); |
7c673cae | 420 | int _allocate(uint8_t bdev, uint64_t len, |
94b18763 | 421 | bluefs_fnode_t* node); |
11fdf7f2 TL |
422 | int _allocate_without_fallback(uint8_t id, uint64_t len, |
423 | PExtentVector* extents); | |
424 | ||
522d829b | 425 | /* signal replay log to include h->file in nearest log flush */ |
20effc67 TL |
426 | int _signal_dirty_to_log_D(FileWriter *h); |
427 | int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length); | |
428 | int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered); | |
429 | int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr); | |
430 | uint64_t _flush_special(FileWriter *h); | |
431 | int _fsync(FileWriter *h); | |
7c673cae | 432 | |
11fdf7f2 | 433 | #ifdef HAVE_LIBAIO |
f67539c2 | 434 | void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls); |
20effc67 | 435 | void _wait_for_aio(FileWriter *h); // safe to call without a lock |
11fdf7f2 | 436 | #endif |
7c673cae | 437 | |
20effc67 TL |
438 | int64_t _maybe_extend_log(); |
439 | void _extend_log(); | |
440 | uint64_t _log_advance_seq(); | |
441 | void _consume_dirty(uint64_t seq); | |
442 | void _clear_dirty_set_stable_D(uint64_t seq_stable); | |
443 | void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release); | |
444 | ||
445 | void _flush_and_sync_log_core(int64_t available_runway); | |
446 | int _flush_and_sync_log_jump_D(uint64_t jump_to, | |
447 | int64_t available_runway); | |
448 | int _flush_and_sync_log_LD(uint64_t want_seq = 0); | |
449 | ||
450 | uint64_t _estimate_log_size_N(); | |
451 | bool _should_start_compact_log_L_N(); | |
11fdf7f2 TL |
452 | |
453 | enum { | |
454 | REMOVE_DB = 1, | |
455 | REMOVE_WAL = 2, | |
456 | RENAME_SLOW2DB = 4, | |
457 | RENAME_DB2SLOW = 8, | |
458 | }; | |
20effc67 TL |
459 | void _compact_log_dump_metadata_NF(bluefs_transaction_t *t, |
460 | int flags); | |
461 | void _compact_log_async_dump_metadata_NF(bluefs_transaction_t *t, | |
462 | uint64_t capture_before_seq); | |
11fdf7f2 | 463 | |
20effc67 TL |
464 | void _compact_log_sync_LNF_LD(); |
465 | void _compact_log_async_LD_LNF_D(); | |
466 | ||
467 | void _rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback, | |
9f95a23c TL |
468 | int super_dev, |
469 | int log_dev, | |
470 | int new_log_dev, | |
471 | int flags, | |
472 | std::optional<bluefs_layout_t> layout); | |
7c673cae FG |
473 | |
474 | //void _aio_finish(void *priv); | |
475 | ||
20effc67 TL |
476 | void _flush_bdev(FileWriter *h); |
477 | void _flush_bdev(); // this is safe to call without a lock | |
478 | void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock | |
7c673cae FG |
479 | |
480 | int _preallocate(FileRef f, uint64_t off, uint64_t len); | |
481 | int _truncate(FileWriter *h, uint64_t off); | |
482 | ||
adb31ebb | 483 | int64_t _read( |
7c673cae | 484 | FileReader *h, ///< [in] read from here |
7c673cae FG |
485 | uint64_t offset, ///< [in] offset |
486 | size_t len, ///< [in] this many bytes | |
f67539c2 | 487 | ceph::buffer::list *outbl, ///< [out] optional: reference the result here |
7c673cae | 488 | char *out); ///< [out] optional: or copy it here |
adb31ebb | 489 | int64_t _read_random( |
7c673cae FG |
490 | FileReader *h, ///< [in] read from here |
491 | uint64_t offset, ///< [in] offset | |
9f95a23c | 492 | uint64_t len, ///< [in] this many bytes |
7c673cae FG |
493 | char *out); ///< [out] optional: or copy it here |
494 | ||
7c673cae | 495 | int _open_super(); |
11fdf7f2 | 496 | int _write_super(int dev); |
20effc67 TL |
497 | int _check_allocations(const bluefs_fnode_t& fnode, |
498 | boost::dynamic_bitset<uint64_t>* used_blocks, | |
499 | bool is_alloc, //true when allocating, false when deallocating | |
500 | const char* op_name); | |
9f95a23c TL |
501 | int _verify_alloc_granularity( |
502 | __u8 id, uint64_t offset, uint64_t length, | |
503 | const char *op); | |
11fdf7f2 | 504 | int _replay(bool noop, bool to_stdout = false); ///< replay journal |
7c673cae FG |
505 | |
506 | FileWriter *_create_writer(FileRef f); | |
20effc67 | 507 | void _drain_writer(FileWriter *h); |
7c673cae FG |
508 | void _close_writer(FileWriter *h); |
509 | ||
510 | // always put the super in the second 4k block. FIXME should this be | |
511 | // block size independent? | |
512 | unsigned get_super_offset() { | |
513 | return 4096; | |
514 | } | |
515 | unsigned get_super_length() { | |
516 | return 4096; | |
517 | } | |
20effc67 TL |
518 | void _maybe_check_vselector_LNF() { |
519 | if (cct->_conf->bluefs_check_volume_selector_often) { | |
520 | _check_vselector_LNF(); | |
521 | } | |
522 | } | |
7c673cae FG |
523 | public: |
524 | BlueFS(CephContext* cct); | |
525 | ~BlueFS(); | |
526 | ||
527 | // the super is always stored on bdev 0 | |
9f95a23c | 528 | int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout); |
7c673cae | 529 | int mount(); |
9f95a23c | 530 | int maybe_verify_layout(const bluefs_layout_t& layout) const; |
1911f103 | 531 | void umount(bool avoid_compact = false); |
9f95a23c | 532 | int prepare_new_device(int id, const bluefs_layout_t& layout); |
11fdf7f2 TL |
533 | |
534 | int log_dump(); | |
7c673cae | 535 | |
f67539c2 TL |
536 | void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id); |
537 | void get_devices(std::set<std::string> *ls); | |
eafe8130 TL |
538 | uint64_t get_alloc_size(int id) { |
539 | return alloc_size[id]; | |
540 | } | |
7c673cae FG |
541 | int fsck(); |
542 | ||
11fdf7f2 TL |
543 | int device_migrate_to_new( |
544 | CephContext *cct, | |
f67539c2 | 545 | const std::set<int>& devs_source, |
9f95a23c TL |
546 | int dev_target, |
547 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
548 | int device_migrate_to_existing( |
549 | CephContext *cct, | |
f67539c2 | 550 | const std::set<int>& devs_source, |
9f95a23c TL |
551 | int dev_target, |
552 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
553 | |
554 | uint64_t get_used(); | |
7c673cae FG |
555 | uint64_t get_total(unsigned id); |
556 | uint64_t get_free(unsigned id); | |
f67539c2 TL |
557 | uint64_t get_used(unsigned id); |
558 | void dump_perf_counters(ceph::Formatter *f); | |
7c673cae | 559 | |
f67539c2 | 560 | void dump_block_extents(std::ostream& out); |
3efd9988 | 561 | |
7c673cae FG |
562 | /// get current extents that we own for given block device |
563 | int get_block_extents(unsigned id, interval_set<uint64_t> *extents); | |
564 | ||
565 | int open_for_write( | |
b3b6e05e TL |
566 | std::string_view dir, |
567 | std::string_view file, | |
7c673cae FG |
568 | FileWriter **h, |
569 | bool overwrite); | |
570 | ||
571 | int open_for_read( | |
b3b6e05e TL |
572 | std::string_view dir, |
573 | std::string_view file, | |
7c673cae FG |
574 | FileReader **h, |
575 | bool random = false); | |
576 | ||
20effc67 TL |
577 | // data added after last fsync() is lost |
578 | void close_writer(FileWriter *h); | |
7c673cae | 579 | |
b3b6e05e TL |
580 | int rename(std::string_view old_dir, std::string_view old_file, |
581 | std::string_view new_dir, std::string_view new_file); | |
7c673cae | 582 | |
b3b6e05e | 583 | int readdir(std::string_view dirname, std::vector<std::string> *ls); |
7c673cae | 584 | |
b3b6e05e TL |
585 | int unlink(std::string_view dirname, std::string_view filename); |
586 | int mkdir(std::string_view dirname); | |
587 | int rmdir(std::string_view dirname); | |
d2e6a577 | 588 | bool wal_is_rotational(); |
1d09f67e | 589 | bool db_is_rotational(); |
7c673cae | 590 | |
b3b6e05e TL |
591 | bool dir_exists(std::string_view dirname); |
592 | int stat(std::string_view dirname, std::string_view filename, | |
7c673cae FG |
593 | uint64_t *size, utime_t *mtime); |
594 | ||
b3b6e05e | 595 | int lock_file(std::string_view dirname, std::string_view filename, FileLock **p); |
7c673cae FG |
596 | int unlock_file(FileLock *l); |
597 | ||
7c673cae FG |
598 | void compact_log(); |
599 | ||
600 | /// sync any uncommitted state to disk | |
1911f103 | 601 | void sync_metadata(bool avoid_compact); |
7c673cae | 602 | |
9f95a23c TL |
603 | void set_volume_selector(BlueFSVolumeSelector* s) { |
604 | vselector.reset(s); | |
605 | } | |
f67539c2 | 606 | void dump_volume_selector(std::ostream& sout) { |
9f95a23c TL |
607 | vselector->dump(sout); |
608 | } | |
609 | void get_vselector_paths(const std::string& base, | |
610 | BlueFSVolumeSelector::paths& res) const { | |
611 | return vselector->get_paths(base, res); | |
612 | } | |
613 | ||
f67539c2 TL |
614 | int add_block_device(unsigned bdev, const std::string& path, bool trim, |
615 | uint64_t reserved, | |
616 | bluefs_shared_alloc_context_t* _shared_alloc = nullptr); | |
7c673cae | 617 | bool bdev_support_label(unsigned id); |
f67539c2 | 618 | uint64_t get_block_device_size(unsigned bdev) const; |
7c673cae | 619 | |
11fdf7f2 TL |
620 | // handler for discard event |
621 | void handle_discard(unsigned dev, interval_set<uint64_t>& to_release); | |
622 | ||
20effc67 | 623 | void flush(FileWriter *h, bool force = false); |
cd265ab1 | 624 | |
20effc67 TL |
625 | void append_try_flush(FileWriter *h, const char* buf, size_t len); |
626 | void flush_range(FileWriter *h, uint64_t offset, uint64_t length); | |
627 | int fsync(FileWriter *h); | |
f67539c2 TL |
628 | int64_t read(FileReader *h, uint64_t offset, size_t len, |
629 | ceph::buffer::list *outbl, char *out) { | |
7c673cae FG |
630 | // no need to hold the global lock here; we only touch h and |
631 | // h->file, and read vs write or delete is already protected (via | |
632 | // atomics and asserts). | |
f67539c2 | 633 | return _read(h, offset, len, outbl, out); |
7c673cae | 634 | } |
adb31ebb | 635 | int64_t read_random(FileReader *h, uint64_t offset, size_t len, |
7c673cae FG |
636 | char *out) { |
637 | // no need to hold the global lock here; we only touch h and | |
638 | // h->file, and read vs write or delete is already protected (via | |
639 | // atomics and asserts). | |
640 | return _read_random(h, offset, len, out); | |
641 | } | |
20effc67 TL |
642 | void invalidate_cache(FileRef f, uint64_t offset, uint64_t len); |
643 | int preallocate(FileRef f, uint64_t offset, uint64_t len); | |
644 | int truncate(FileWriter *h, uint64_t offset); | |
7c673cae | 645 | |
f67539c2 TL |
646 | size_t probe_alloc_avail(int dev, uint64_t alloc_size); |
647 | ||
9f95a23c | 648 | /// test purpose methods |
9f95a23c TL |
649 | const PerfCounters* get_perf_counters() const { |
650 | return logger; | |
651 | } | |
522d829b TL |
652 | uint64_t debug_get_dirty_seq(FileWriter *h); |
653 | bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev); | |
cd265ab1 TL |
654 | |
655 | private: | |
656 | // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...) | |
657 | // They are used for checking if read values are all 0, and reread if so. | |
20effc67 | 658 | int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len, |
cd265ab1 | 659 | ceph::buffer::list *pbl, IOContext *ioc, bool buffered); |
20effc67 TL |
660 | int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered); |
661 | ||
662 | int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len, | |
663 | ceph::buffer::list* pbl, IOContext* ioc, bool buffered); | |
664 | int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered); | |
665 | ||
666 | /// test and compact log, if necessary | |
667 | void _maybe_compact_log_LNF_NF_LD_D(); | |
668 | int _do_replay_recovery_read(FileReader *log, | |
669 | size_t log_pos, | |
670 | size_t read_offset, | |
671 | size_t read_len, | |
672 | bufferlist* bl); | |
673 | void _check_vselector_LNF(); | |
9f95a23c TL |
674 | }; |
675 | ||
676 | class OriginalVolumeSelector : public BlueFSVolumeSelector { | |
677 | uint64_t wal_total; | |
678 | uint64_t db_total; | |
679 | uint64_t slow_total; | |
680 | ||
681 | public: | |
682 | OriginalVolumeSelector( | |
683 | uint64_t _wal_total, | |
684 | uint64_t _db_total, | |
685 | uint64_t _slow_total) | |
686 | : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {} | |
687 | ||
f6b5b4d7 | 688 | void* get_hint_for_log() const override; |
b3b6e05e | 689 | void* get_hint_by_dir(std::string_view dirname) const override; |
9f95a23c TL |
690 | |
691 | void add_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
692 | // do nothing | |
693 | return; | |
694 | } | |
695 | void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
696 | // do nothing | |
697 | return; | |
698 | } | |
699 | void add_usage(void* hint, uint64_t fsize) override { | |
700 | // do nothing | |
701 | return; | |
702 | } | |
703 | void sub_usage(void* hint, uint64_t fsize) override { | |
704 | // do nothing | |
705 | return; | |
706 | } | |
707 | ||
708 | uint8_t select_prefer_bdev(void* hint) override; | |
709 | void get_paths(const std::string& base, paths& res) const override; | |
f67539c2 TL |
710 | void dump(std::ostream& sout) override; |
711 | }; | |
712 | ||
713 | class FitToFastVolumeSelector : public OriginalVolumeSelector { | |
714 | public: | |
715 | FitToFastVolumeSelector( | |
716 | uint64_t _wal_total, | |
717 | uint64_t _db_total, | |
718 | uint64_t _slow_total) | |
719 | : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {} | |
720 | ||
721 | void get_paths(const std::string& base, paths& res) const override; | |
7c673cae | 722 | }; |
20effc67 TL |
/**
 * Directed graph of locks.
 * Vertices - locks. Edges (directed) - locking progression.
 * Edge A->B exists if the last lock taken was A and the next lock taken is B.
 *
 * Row represents last lock taken.
 * Column represents next lock taken.
 *
 *     >        | W | L | N | D | F
 * -------------|---|---|---|---|---
 * FileWriter W |   | > | > | > | >
 * log        L |   |   | > | > | >
 * nodes      N |   |   |   | > | >
 * dirty      D |   |   |   |   | >
 * File       F |   |   |   |   |
 *
 * Claim: Deadlock is possible IFF graph contains cycles.
 */
7c673cae | 741 | #endif |