]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | #ifndef CEPH_OS_BLUESTORE_BLUEFS_H | |
4 | #define CEPH_OS_BLUESTORE_BLUEFS_H | |
5 | ||
6 | #include <atomic> | |
7 | #include <mutex> | |
cd265ab1 | 8 | #include <limits> |
7c673cae FG |
9 | |
10 | #include "bluefs_types.h" | |
f67539c2 | 11 | #include "blk/BlockDevice.h" |
7c673cae | 12 | |
9f95a23c TL |
13 | #include "common/RefCountedObj.h" |
14 | #include "common/ceph_context.h" | |
15 | #include "global/global_context.h" | |
16 | #include "include/common_fwd.h" | |
7c673cae | 17 | |
9f95a23c TL |
18 | #include "boost/intrusive/list.hpp" |
19 | #include "boost/dynamic_bitset.hpp" | |
7c673cae FG |
20 | |
21 | class Allocator; | |
22 | ||
/// Perf counter ids for BlueFS. Values are consecutive, starting at
/// l_bluefs_first; l_bluefs_last is the end marker. Order is significant.
enum {
  l_bluefs_first = 732600,

  // per-device capacity / usage
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,

  // files and log
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,

  // writes
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,

  // high-water marks
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,

  // allocation units
  l_bluefs_main_alloc_unit,
  l_bluefs_db_alloc_unit,
  l_bluefs_wal_alloc_unit,

  // random reads
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_disk_bytes_wal,
  l_bluefs_read_random_disk_bytes_db,
  l_bluefs_read_random_disk_bytes_slow,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,

  // sequential reads
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_disk_count,
  l_bluefs_read_disk_bytes,
  l_bluefs_read_disk_bytes_wal,
  l_bluefs_read_disk_bytes_db,
  l_bluefs_read_disk_bytes_slow,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,

  // compaction and allocation fallbacks
  l_bluefs_compaction_lat,
  l_bluefs_compaction_lock_lat,
  l_bluefs_alloc_shared_dev_fallbacks,
  l_bluefs_alloc_shared_size_fallbacks,

  // zero-read detection
  l_bluefs_read_zeros_candidate,
  l_bluefs_read_zeros_errors,

  l_bluefs_last,
};
72 | ||
9f95a23c TL |
73 | class BlueFSVolumeSelector { |
74 | public: | |
75 | typedef std::vector<std::pair<std::string, uint64_t>> paths; | |
76 | ||
77 | virtual ~BlueFSVolumeSelector() { | |
78 | } | |
f6b5b4d7 | 79 | virtual void* get_hint_for_log() const = 0; |
b3b6e05e | 80 | virtual void* get_hint_by_dir(std::string_view dirname) const = 0; |
9f95a23c TL |
81 | |
82 | virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
83 | virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
84 | virtual void add_usage(void* file_hint, uint64_t fsize) = 0; | |
85 | virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; | |
86 | virtual uint8_t select_prefer_bdev(void* hint) = 0; | |
87 | virtual void get_paths(const std::string& base, paths& res) const = 0; | |
f67539c2 | 88 | virtual void dump(std::ostream& sout) = 0; |
20effc67 TL |
89 | |
90 | /* used for sanity checking of vselector */ | |
91 | virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; } | |
92 | virtual bool compare(BlueFSVolumeSelector* other) { return true; }; | |
f67539c2 TL |
93 | }; |
94 | ||
95 | struct bluefs_shared_alloc_context_t { | |
96 | bool need_init = false; | |
97 | Allocator* a = nullptr; | |
39ae355f | 98 | uint64_t alloc_unit = 0; |
f67539c2 TL |
99 | |
100 | std::atomic<uint64_t> bluefs_used = 0; | |
101 | ||
39ae355f | 102 | void set(Allocator* _a, uint64_t _au) { |
f67539c2 | 103 | a = _a; |
39ae355f | 104 | alloc_unit = _au; |
f67539c2 TL |
105 | need_init = true; |
106 | bluefs_used = 0; | |
107 | } | |
108 | void reset() { | |
109 | a = nullptr; | |
39ae355f | 110 | alloc_unit = 0; |
f67539c2 | 111 | } |
11fdf7f2 TL |
112 | }; |
113 | ||
7c673cae FG |
114 | class BlueFS { |
115 | public: | |
116 | CephContext* cct; | |
11fdf7f2 | 117 | static constexpr unsigned MAX_BDEV = 5; |
7c673cae FG |
118 | static constexpr unsigned BDEV_WAL = 0; |
119 | static constexpr unsigned BDEV_DB = 1; | |
120 | static constexpr unsigned BDEV_SLOW = 2; | |
11fdf7f2 TL |
121 | static constexpr unsigned BDEV_NEWWAL = 3; |
122 | static constexpr unsigned BDEV_NEWDB = 4; | |
7c673cae FG |
123 | |
124 | enum { | |
125 | WRITER_UNKNOWN, | |
126 | WRITER_WAL, | |
127 | WRITER_SST, | |
128 | }; | |
129 | ||
130 | struct File : public RefCountedObject { | |
131 | MEMPOOL_CLASS_HELPERS(); | |
132 | ||
133 | bluefs_fnode_t fnode; | |
134 | int refs; | |
135 | uint64_t dirty_seq; | |
136 | bool locked; | |
137 | bool deleted; | |
522d829b | 138 | bool is_dirty; |
7c673cae FG |
139 | boost::intrusive::list_member_hook<> dirty_item; |
140 | ||
141 | std::atomic_int num_readers, num_writers; | |
142 | std::atomic_int num_reading; | |
143 | ||
9f95a23c | 144 | void* vselector_hint = nullptr; |
20effc67 TL |
145 | /* lock protects fnode and other the parts that can be modified during read & write operations. |
146 | Does not protect values that are fixed | |
147 | Does not need to be taken when doing one-time operations: | |
148 | _replay, device_migrate_to_existing, device_migrate_to_new */ | |
149 | ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock"); | |
9f95a23c TL |
150 | |
151 | private: | |
152 | FRIEND_MAKE_REF(File); | |
7c673cae | 153 | File() |
9f95a23c | 154 | : |
7c673cae FG |
155 | refs(0), |
156 | dirty_seq(0), | |
157 | locked(false), | |
158 | deleted(false), | |
522d829b | 159 | is_dirty(false), |
7c673cae FG |
160 | num_readers(0), |
161 | num_writers(0), | |
9f95a23c TL |
162 | num_reading(0), |
163 | vselector_hint(nullptr) | |
7c673cae FG |
164 | {} |
165 | ~File() override { | |
11fdf7f2 TL |
166 | ceph_assert(num_readers.load() == 0); |
167 | ceph_assert(num_writers.load() == 0); | |
168 | ceph_assert(num_reading.load() == 0); | |
169 | ceph_assert(!locked); | |
7c673cae | 170 | } |
7c673cae | 171 | }; |
9f95a23c | 172 | using FileRef = ceph::ref_t<File>; |
7c673cae FG |
173 | |
174 | typedef boost::intrusive::list< | |
175 | File, | |
176 | boost::intrusive::member_hook< | |
177 | File, | |
178 | boost::intrusive::list_member_hook<>, | |
179 | &File::dirty_item> > dirty_file_list_t; | |
180 | ||
181 | struct Dir : public RefCountedObject { | |
182 | MEMPOOL_CLASS_HELPERS(); | |
183 | ||
b3b6e05e | 184 | mempool::bluefs::map<std::string, FileRef, std::less<>> file_map; |
7c673cae | 185 | |
9f95a23c TL |
186 | private: |
187 | FRIEND_MAKE_REF(Dir); | |
188 | Dir() = default; | |
7c673cae | 189 | }; |
9f95a23c | 190 | using DirRef = ceph::ref_t<Dir>; |
7c673cae FG |
191 | |
192 | struct FileWriter { | |
193 | MEMPOOL_CLASS_HELPERS(); | |
194 | ||
195 | FileRef file; | |
9f95a23c | 196 | uint64_t pos = 0; ///< start offset for buffer |
f67539c2 TL |
197 | private: |
198 | ceph::buffer::list buffer; ///< new data to write (at end of file) | |
199 | ceph::buffer::list tail_block; ///< existing partial block at end of file, if any | |
200 | public: | |
201 | unsigned get_buffer_length() const { | |
202 | return buffer.length(); | |
203 | } | |
204 | ceph::bufferlist flush_buffer( | |
205 | CephContext* cct, | |
206 | const bool partial, | |
207 | const unsigned length, | |
208 | const bluefs_super_t& super); | |
209 | ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only | |
210 | public: | |
7c673cae | 211 | int writer_type = 0; ///< WRITER_* |
11fdf7f2 | 212 | int write_hint = WRITE_LIFE_NOT_SET; |
7c673cae | 213 | |
11fdf7f2 | 214 | ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock"); |
7c673cae | 215 | std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev |
11fdf7f2 | 216 | std::array<bool, MAX_BDEV> dirty_devs; |
7c673cae FG |
217 | |
218 | FileWriter(FileRef f) | |
9f95a23c | 219 | : file(std::move(f)), |
f67539c2 TL |
220 | buffer_appender(buffer.get_page_aligned_appender( |
221 | g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { | |
7c673cae FG |
222 | ++file->num_writers; |
223 | iocv.fill(nullptr); | |
11fdf7f2 | 224 | dirty_devs.fill(false); |
9f95a23c | 225 | if (file->fnode.ino == 1) { |
11fdf7f2 TL |
226 | write_hint = WRITE_LIFE_MEDIUM; |
227 | } | |
7c673cae FG |
228 | } |
229 | // NOTE: caller must call BlueFS::close_writer() | |
230 | ~FileWriter() { | |
231 | --file->num_writers; | |
232 | } | |
233 | ||
234 | // note: BlueRocksEnv uses this append exclusively, so it's safe | |
20effc67 | 235 | // to use buffer_appender exclusively here (e.g., its notion of |
7c673cae FG |
236 | // offset will remain accurate). |
237 | void append(const char *buf, size_t len) { | |
f67539c2 | 238 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 239 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
240 | buffer_appender.append(buf, len); |
241 | } | |
242 | ||
20effc67 TL |
243 | void append(const std::byte *buf, size_t len) { |
244 | // allow callers to use byte type instead of char* as we simply pass byte array | |
245 | append((const char*)buf, len); | |
246 | } | |
247 | ||
7c673cae | 248 | // note: used internally only, for ino 1 or 0. |
cd265ab1 | 249 | void append(ceph::buffer::list& bl) { |
f67539c2 | 250 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 251 | ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
252 | buffer.claim_append(bl); |
253 | } | |
254 | ||
f67539c2 TL |
255 | void append_zero(size_t len) { |
256 | uint64_t l0 = get_buffer_length(); | |
257 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); | |
258 | buffer_appender.append_zero(len); | |
259 | } | |
260 | ||
7c673cae | 261 | uint64_t get_effective_write_pos() { |
7c673cae FG |
262 | return pos + buffer.length(); |
263 | } | |
264 | }; | |
265 | ||
266 | struct FileReaderBuffer { | |
267 | MEMPOOL_CLASS_HELPERS(); | |
268 | ||
9f95a23c | 269 | uint64_t bl_off = 0; ///< prefetch buffer logical offset |
f67539c2 | 270 | ceph::buffer::list bl; ///< prefetch buffer |
9f95a23c | 271 | uint64_t pos = 0; ///< current logical offset |
7c673cae FG |
272 | uint64_t max_prefetch; ///< max allowed prefetch |
273 | ||
274 | explicit FileReaderBuffer(uint64_t mpf) | |
9f95a23c | 275 | : max_prefetch(mpf) {} |
7c673cae | 276 | |
9f95a23c | 277 | uint64_t get_buf_end() const { |
7c673cae FG |
278 | return bl_off + bl.length(); |
279 | } | |
9f95a23c | 280 | uint64_t get_buf_remaining(uint64_t p) const { |
7c673cae FG |
281 | if (p >= bl_off && p < bl_off + bl.length()) |
282 | return bl_off + bl.length() - p; | |
283 | return 0; | |
284 | } | |
285 | ||
286 | void skip(size_t n) { | |
287 | pos += n; | |
288 | } | |
f67539c2 TL |
289 | |
290 | // For the sake of simplicity, we invalidate completed rather than | |
291 | // for the provided extent | |
292 | void invalidate_cache(uint64_t offset, uint64_t length) { | |
293 | if (offset >= bl_off && offset < get_buf_end()) { | |
294 | bl.clear(); | |
295 | bl_off = 0; | |
296 | } | |
7c673cae FG |
297 | } |
298 | }; | |
299 | ||
300 | struct FileReader { | |
301 | MEMPOOL_CLASS_HELPERS(); | |
302 | ||
303 | FileRef file; | |
304 | FileReaderBuffer buf; | |
305 | bool random; | |
306 | bool ignore_eof; ///< used when reading our log file | |
307 | ||
494da23a TL |
308 | ceph::shared_mutex lock { |
309 | ceph::make_shared_mutex(std::string(), false, false, false) | |
310 | }; | |
311 | ||
312 | ||
7c673cae FG |
313 | FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) |
314 | : file(f), | |
315 | buf(mpf), | |
316 | random(rand), | |
317 | ignore_eof(ie) { | |
318 | ++file->num_readers; | |
319 | } | |
320 | ~FileReader() { | |
321 | --file->num_readers; | |
322 | } | |
323 | }; | |
324 | ||
325 | struct FileLock { | |
326 | MEMPOOL_CLASS_HELPERS(); | |
327 | ||
328 | FileRef file; | |
9f95a23c | 329 | explicit FileLock(FileRef f) : file(std::move(f)) {} |
7c673cae FG |
330 | }; |
331 | ||
332 | private: | |
7c673cae FG |
333 | PerfCounters *logger = nullptr; |
334 | ||
11fdf7f2 TL |
335 | uint64_t max_bytes[MAX_BDEV] = {0}; |
336 | uint64_t max_bytes_pcounters[MAX_BDEV] = { | |
337 | l_bluefs_max_bytes_wal, | |
338 | l_bluefs_max_bytes_db, | |
339 | l_bluefs_max_bytes_slow, | |
39ae355f TL |
340 | l_bluefs_max_bytes_wal, |
341 | l_bluefs_max_bytes_db, | |
11fdf7f2 TL |
342 | }; |
343 | ||
7c673cae | 344 | // cache |
20effc67 TL |
345 | struct { |
346 | ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock"); | |
347 | mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map; ///< dirname -> Dir | |
348 | mempool::bluefs::unordered_map<uint64_t, FileRef> file_map; ///< ino -> File | |
349 | } nodes; | |
7c673cae FG |
350 | |
351 | bluefs_super_t super; ///< latest superblock (as last written) | |
352 | uint64_t ino_last = 0; ///< last assigned ino (this one is in use) | |
7c673cae | 353 | |
20effc67 TL |
354 | struct { |
355 | ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock"); | |
356 | uint64_t seq_live = 1; //seq that log is currently writing to; mirrors dirty.seq_live | |
357 | FileWriter *writer = 0; | |
358 | bluefs_transaction_t t; | |
359 | } log; | |
360 | ||
361 | struct { | |
362 | ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock"); | |
363 | uint64_t seq_stable = 0; //seq that is now stable on disk | |
364 | uint64_t seq_live = 1; //seq that is ongoing and dirty files will be written to | |
365 | // map of dirty files, files of same dirty_seq are grouped into list. | |
366 | std::map<uint64_t, dirty_file_list_t> files; | |
367 | std::vector<interval_set<uint64_t>> pending_release; ///< extents to release | |
368 | // TODO: it should be examined what makes pending_release immune to | |
369 | // eras in a way similar to dirty_files. Hints: | |
370 | // 1) we have actually only 2 eras: log_seq and log_seq+1 | |
371 | // 2) we usually not remove extents from files. And when we do, we force log-syncing. | |
372 | } dirty; | |
373 | ||
374 | ceph::condition_variable log_cond; ///< used for state control between log flush / log compaction | |
375 | std::atomic<bool> log_is_compacting{false}; ///< signals that bluefs log is already ongoing compaction | |
376 | std::atomic<bool> log_forbidden_to_expand{false}; ///< used to signal that async compaction is in state | |
377 | /// that prohibits expansion of bluefs log | |
7c673cae FG |
378 | /* |
379 | * There are up to 3 block devices: | |
380 | * | |
381 | * BDEV_DB db/ - the primary db device | |
382 | * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL | |
383 | * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills | |
384 | */ | |
f67539c2 TL |
385 | std::vector<BlockDevice*> bdev; ///< block devices we can use |
386 | std::vector<IOContext*> ioc; ///< IOContexts for bdevs | |
387 | std::vector<uint64_t> block_reserved; ///< starting reserve extent per device | |
388 | std::vector<Allocator*> alloc; ///< allocators for bdevs | |
389 | std::vector<uint64_t> alloc_size; ///< alloc size for each device | |
20effc67 | 390 | |
f67539c2 | 391 | //std::vector<interval_set<uint64_t>> block_unused_too_granular; |
7c673cae | 392 | |
11fdf7f2 TL |
393 | BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev |
394 | ||
9f95a23c | 395 | std::unique_ptr<BlueFSVolumeSelector> vselector; |
11fdf7f2 | 396 | |
f67539c2 TL |
397 | bluefs_shared_alloc_context_t* shared_alloc = nullptr; |
398 | unsigned shared_alloc_id = unsigned(-1); | |
399 | inline bool is_shared_alloc(unsigned id) const { | |
400 | return id == shared_alloc_id; | |
401 | } | |
39ae355f | 402 | std::atomic<int64_t> cooldown_deadline = 0; |
f67539c2 | 403 | |
eafe8130 TL |
404 | class SocketHook; |
405 | SocketHook* asok_hook = nullptr; | |
cd265ab1 TL |
406 | // used to trigger zeros into read (debug / verify) |
407 | std::atomic<uint64_t> inject_read_zeros{0}; | |
eafe8130 | 408 | |
7c673cae FG |
409 | void _init_logger(); |
410 | void _shutdown_logger(); | |
411 | void _update_logger_stats(); | |
412 | ||
413 | void _init_alloc(); | |
414 | void _stop_alloc(); | |
415 | ||
39ae355f TL |
416 | ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros |
417 | void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0); | |
f67539c2 TL |
418 | |
419 | uint64_t _get_used(unsigned id) const; | |
420 | uint64_t _get_total(unsigned id) const; | |
421 | ||
7c673cae FG |
422 | |
423 | FileRef _get_file(uint64_t ino); | |
20effc67 | 424 | void _drop_link_D(FileRef f); |
7c673cae | 425 | |
1911f103 TL |
426 | unsigned _get_slow_device_id() { |
427 | return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; | |
428 | } | |
eafe8130 | 429 | const char* get_device_name(unsigned id); |
7c673cae | 430 | int _allocate(uint8_t bdev, uint64_t len, |
39ae355f TL |
431 | uint64_t alloc_unit, |
432 | bluefs_fnode_t* node, | |
433 | size_t alloc_attempts = 0, | |
434 | bool permit_dev_fallback = true); | |
11fdf7f2 | 435 | |
522d829b | 436 | /* signal replay log to include h->file in nearest log flush */ |
20effc67 TL |
437 | int _signal_dirty_to_log_D(FileWriter *h); |
438 | int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length); | |
439 | int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered); | |
440 | int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr); | |
441 | uint64_t _flush_special(FileWriter *h); | |
442 | int _fsync(FileWriter *h); | |
7c673cae | 443 | |
11fdf7f2 | 444 | #ifdef HAVE_LIBAIO |
f67539c2 | 445 | void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls); |
20effc67 | 446 | void _wait_for_aio(FileWriter *h); // safe to call without a lock |
11fdf7f2 | 447 | #endif |
7c673cae | 448 | |
20effc67 TL |
449 | int64_t _maybe_extend_log(); |
450 | void _extend_log(); | |
451 | uint64_t _log_advance_seq(); | |
452 | void _consume_dirty(uint64_t seq); | |
453 | void _clear_dirty_set_stable_D(uint64_t seq_stable); | |
454 | void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release); | |
455 | ||
456 | void _flush_and_sync_log_core(int64_t available_runway); | |
457 | int _flush_and_sync_log_jump_D(uint64_t jump_to, | |
458 | int64_t available_runway); | |
459 | int _flush_and_sync_log_LD(uint64_t want_seq = 0); | |
460 | ||
39ae355f TL |
461 | uint64_t _estimate_transaction_size(bluefs_transaction_t* t); |
462 | uint64_t _make_initial_transaction(uint64_t start_seq, | |
463 | bluefs_fnode_t& fnode, | |
464 | uint64_t expected_final_size, | |
465 | bufferlist* out); | |
20effc67 TL |
466 | uint64_t _estimate_log_size_N(); |
467 | bool _should_start_compact_log_L_N(); | |
11fdf7f2 TL |
468 | |
469 | enum { | |
470 | REMOVE_DB = 1, | |
471 | REMOVE_WAL = 2, | |
472 | RENAME_SLOW2DB = 4, | |
473 | RENAME_DB2SLOW = 8, | |
474 | }; | |
39ae355f TL |
475 | void _compact_log_dump_metadata_NF(uint64_t start_seq, |
476 | bluefs_transaction_t *t, | |
477 | int flags, | |
478 | uint64_t capture_before_seq); | |
11fdf7f2 | 479 | |
20effc67 TL |
480 | void _compact_log_sync_LNF_LD(); |
481 | void _compact_log_async_LD_LNF_D(); | |
482 | ||
39ae355f | 483 | void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, |
9f95a23c TL |
484 | int super_dev, |
485 | int log_dev, | |
486 | int new_log_dev, | |
487 | int flags, | |
488 | std::optional<bluefs_layout_t> layout); | |
7c673cae FG |
489 | |
490 | //void _aio_finish(void *priv); | |
491 | ||
39ae355f | 492 | void _flush_bdev(FileWriter *h, bool check_mutex_locked = true); |
20effc67 TL |
493 | void _flush_bdev(); // this is safe to call without a lock |
494 | void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock | |
7c673cae FG |
495 | |
496 | int _preallocate(FileRef f, uint64_t off, uint64_t len); | |
497 | int _truncate(FileWriter *h, uint64_t off); | |
498 | ||
adb31ebb | 499 | int64_t _read( |
7c673cae | 500 | FileReader *h, ///< [in] read from here |
7c673cae FG |
501 | uint64_t offset, ///< [in] offset |
502 | size_t len, ///< [in] this many bytes | |
f67539c2 | 503 | ceph::buffer::list *outbl, ///< [out] optional: reference the result here |
7c673cae | 504 | char *out); ///< [out] optional: or copy it here |
adb31ebb | 505 | int64_t _read_random( |
7c673cae FG |
506 | FileReader *h, ///< [in] read from here |
507 | uint64_t offset, ///< [in] offset | |
9f95a23c | 508 | uint64_t len, ///< [in] this many bytes |
7c673cae FG |
509 | char *out); ///< [out] optional: or copy it here |
510 | ||
7c673cae | 511 | int _open_super(); |
11fdf7f2 | 512 | int _write_super(int dev); |
20effc67 TL |
513 | int _check_allocations(const bluefs_fnode_t& fnode, |
514 | boost::dynamic_bitset<uint64_t>* used_blocks, | |
515 | bool is_alloc, //true when allocating, false when deallocating | |
516 | const char* op_name); | |
9f95a23c TL |
517 | int _verify_alloc_granularity( |
518 | __u8 id, uint64_t offset, uint64_t length, | |
39ae355f | 519 | uint64_t alloc_unit, |
9f95a23c | 520 | const char *op); |
11fdf7f2 | 521 | int _replay(bool noop, bool to_stdout = false); ///< replay journal |
7c673cae FG |
522 | |
523 | FileWriter *_create_writer(FileRef f); | |
20effc67 | 524 | void _drain_writer(FileWriter *h); |
7c673cae FG |
525 | void _close_writer(FileWriter *h); |
526 | ||
527 | // always put the super in the second 4k block. FIXME should this be | |
528 | // block size independent? | |
/// Byte offset of the superblock: fixed at 4096 (the second 4k block).
unsigned get_super_offset() {
  const unsigned second_4k_block = 4096;
  return second_4k_block;
}
/// Byte length of the superblock: fixed at 4096.
unsigned get_super_length() {
  const unsigned one_4k_block = 4096;
  return one_4k_block;
}
20effc67 TL |
535 | void _maybe_check_vselector_LNF() { |
536 | if (cct->_conf->bluefs_check_volume_selector_often) { | |
537 | _check_vselector_LNF(); | |
538 | } | |
539 | } | |
7c673cae FG |
540 | public: |
541 | BlueFS(CephContext* cct); | |
542 | ~BlueFS(); | |
543 | ||
544 | // the super is always stored on bdev 0 | |
9f95a23c | 545 | int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout); |
7c673cae | 546 | int mount(); |
9f95a23c | 547 | int maybe_verify_layout(const bluefs_layout_t& layout) const; |
1911f103 | 548 | void umount(bool avoid_compact = false); |
9f95a23c | 549 | int prepare_new_device(int id, const bluefs_layout_t& layout); |
11fdf7f2 TL |
550 | |
551 | int log_dump(); | |
7c673cae | 552 | |
f67539c2 TL |
553 | void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id); |
554 | void get_devices(std::set<std::string> *ls); | |
eafe8130 TL |
555 | uint64_t get_alloc_size(int id) { |
556 | return alloc_size[id]; | |
557 | } | |
7c673cae FG |
558 | int fsck(); |
559 | ||
11fdf7f2 TL |
560 | int device_migrate_to_new( |
561 | CephContext *cct, | |
f67539c2 | 562 | const std::set<int>& devs_source, |
9f95a23c TL |
563 | int dev_target, |
564 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
565 | int device_migrate_to_existing( |
566 | CephContext *cct, | |
f67539c2 | 567 | const std::set<int>& devs_source, |
9f95a23c TL |
568 | int dev_target, |
569 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
570 | |
571 | uint64_t get_used(); | |
7c673cae FG |
572 | uint64_t get_total(unsigned id); |
573 | uint64_t get_free(unsigned id); | |
f67539c2 TL |
574 | uint64_t get_used(unsigned id); |
575 | void dump_perf_counters(ceph::Formatter *f); | |
7c673cae | 576 | |
f67539c2 | 577 | void dump_block_extents(std::ostream& out); |
3efd9988 | 578 | |
7c673cae FG |
579 | /// get current extents that we own for given block device |
580 | int get_block_extents(unsigned id, interval_set<uint64_t> *extents); | |
581 | ||
582 | int open_for_write( | |
b3b6e05e TL |
583 | std::string_view dir, |
584 | std::string_view file, | |
7c673cae FG |
585 | FileWriter **h, |
586 | bool overwrite); | |
587 | ||
588 | int open_for_read( | |
b3b6e05e TL |
589 | std::string_view dir, |
590 | std::string_view file, | |
7c673cae FG |
591 | FileReader **h, |
592 | bool random = false); | |
593 | ||
20effc67 TL |
594 | // data added after last fsync() is lost |
595 | void close_writer(FileWriter *h); | |
7c673cae | 596 | |
b3b6e05e TL |
597 | int rename(std::string_view old_dir, std::string_view old_file, |
598 | std::string_view new_dir, std::string_view new_file); | |
7c673cae | 599 | |
b3b6e05e | 600 | int readdir(std::string_view dirname, std::vector<std::string> *ls); |
7c673cae | 601 | |
b3b6e05e TL |
602 | int unlink(std::string_view dirname, std::string_view filename); |
603 | int mkdir(std::string_view dirname); | |
604 | int rmdir(std::string_view dirname); | |
d2e6a577 | 605 | bool wal_is_rotational(); |
1d09f67e | 606 | bool db_is_rotational(); |
7c673cae | 607 | |
b3b6e05e TL |
608 | bool dir_exists(std::string_view dirname); |
609 | int stat(std::string_view dirname, std::string_view filename, | |
7c673cae FG |
610 | uint64_t *size, utime_t *mtime); |
611 | ||
b3b6e05e | 612 | int lock_file(std::string_view dirname, std::string_view filename, FileLock **p); |
7c673cae FG |
613 | int unlock_file(FileLock *l); |
614 | ||
7c673cae FG |
615 | void compact_log(); |
616 | ||
617 | /// sync any uncommitted state to disk | |
1911f103 | 618 | void sync_metadata(bool avoid_compact); |
7c673cae | 619 | |
9f95a23c TL |
620 | void set_volume_selector(BlueFSVolumeSelector* s) { |
621 | vselector.reset(s); | |
622 | } | |
f67539c2 | 623 | void dump_volume_selector(std::ostream& sout) { |
9f95a23c TL |
624 | vselector->dump(sout); |
625 | } | |
626 | void get_vselector_paths(const std::string& base, | |
627 | BlueFSVolumeSelector::paths& res) const { | |
628 | return vselector->get_paths(base, res); | |
629 | } | |
630 | ||
f67539c2 TL |
631 | int add_block_device(unsigned bdev, const std::string& path, bool trim, |
632 | uint64_t reserved, | |
633 | bluefs_shared_alloc_context_t* _shared_alloc = nullptr); | |
7c673cae | 634 | bool bdev_support_label(unsigned id); |
f67539c2 | 635 | uint64_t get_block_device_size(unsigned bdev) const; |
7c673cae | 636 | |
11fdf7f2 TL |
637 | // handler for discard event |
638 | void handle_discard(unsigned dev, interval_set<uint64_t>& to_release); | |
639 | ||
20effc67 | 640 | void flush(FileWriter *h, bool force = false); |
cd265ab1 | 641 | |
20effc67 TL |
642 | void append_try_flush(FileWriter *h, const char* buf, size_t len); |
643 | void flush_range(FileWriter *h, uint64_t offset, uint64_t length); | |
644 | int fsync(FileWriter *h); | |
f67539c2 TL |
645 | int64_t read(FileReader *h, uint64_t offset, size_t len, |
646 | ceph::buffer::list *outbl, char *out) { | |
7c673cae FG |
647 | // no need to hold the global lock here; we only touch h and |
648 | // h->file, and read vs write or delete is already protected (via | |
649 | // atomics and asserts). | |
f67539c2 | 650 | return _read(h, offset, len, outbl, out); |
7c673cae | 651 | } |
adb31ebb | 652 | int64_t read_random(FileReader *h, uint64_t offset, size_t len, |
7c673cae FG |
653 | char *out) { |
654 | // no need to hold the global lock here; we only touch h and | |
655 | // h->file, and read vs write or delete is already protected (via | |
656 | // atomics and asserts). | |
657 | return _read_random(h, offset, len, out); | |
658 | } | |
20effc67 TL |
659 | void invalidate_cache(FileRef f, uint64_t offset, uint64_t len); |
660 | int preallocate(FileRef f, uint64_t offset, uint64_t len); | |
661 | int truncate(FileWriter *h, uint64_t offset); | |
7c673cae | 662 | |
f67539c2 TL |
663 | size_t probe_alloc_avail(int dev, uint64_t alloc_size); |
664 | ||
9f95a23c | 665 | /// test purpose methods |
9f95a23c TL |
666 | const PerfCounters* get_perf_counters() const { |
667 | return logger; | |
668 | } | |
522d829b TL |
669 | uint64_t debug_get_dirty_seq(FileWriter *h); |
670 | bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev); | |
cd265ab1 TL |
671 | |
672 | private: | |
673 | // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...) | |
674 | // They are used for checking if read values are all 0, and reread if so. | |
20effc67 | 675 | int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len, |
cd265ab1 | 676 | ceph::buffer::list *pbl, IOContext *ioc, bool buffered); |
20effc67 TL |
677 | int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered); |
678 | ||
679 | int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len, | |
680 | ceph::buffer::list* pbl, IOContext* ioc, bool buffered); | |
681 | int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered); | |
682 | ||
683 | /// test and compact log, if necessary | |
684 | void _maybe_compact_log_LNF_NF_LD_D(); | |
685 | int _do_replay_recovery_read(FileReader *log, | |
686 | size_t log_pos, | |
687 | size_t read_offset, | |
688 | size_t read_len, | |
689 | bufferlist* bl); | |
690 | void _check_vselector_LNF(); | |
9f95a23c TL |
691 | }; |
692 | ||
693 | class OriginalVolumeSelector : public BlueFSVolumeSelector { | |
694 | uint64_t wal_total; | |
695 | uint64_t db_total; | |
696 | uint64_t slow_total; | |
697 | ||
698 | public: | |
699 | OriginalVolumeSelector( | |
700 | uint64_t _wal_total, | |
701 | uint64_t _db_total, | |
702 | uint64_t _slow_total) | |
703 | : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {} | |
704 | ||
f6b5b4d7 | 705 | void* get_hint_for_log() const override; |
b3b6e05e | 706 | void* get_hint_by_dir(std::string_view dirname) const override; |
9f95a23c TL |
707 | |
708 | void add_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
709 | // do nothing | |
710 | return; | |
711 | } | |
712 | void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
713 | // do nothing | |
714 | return; | |
715 | } | |
716 | void add_usage(void* hint, uint64_t fsize) override { | |
717 | // do nothing | |
718 | return; | |
719 | } | |
720 | void sub_usage(void* hint, uint64_t fsize) override { | |
721 | // do nothing | |
722 | return; | |
723 | } | |
724 | ||
725 | uint8_t select_prefer_bdev(void* hint) override; | |
726 | void get_paths(const std::string& base, paths& res) const override; | |
f67539c2 TL |
727 | void dump(std::ostream& sout) override; |
728 | }; | |
729 | ||
730 | class FitToFastVolumeSelector : public OriginalVolumeSelector { | |
731 | public: | |
732 | FitToFastVolumeSelector( | |
733 | uint64_t _wal_total, | |
734 | uint64_t _db_total, | |
735 | uint64_t _slow_total) | |
736 | : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {} | |
737 | ||
738 | void get_paths(const std::string& base, paths& res) const override; | |
7c673cae | 739 | }; |
20effc67 TL |
/**
 * Directed graph of locks.
 * Vertices - locks. Edges (directed) - locking progression.
 * Edge A->B exists if the last taken lock was A and the next taken lock is B.
 *
 * Row represents last lock taken.
 * Column represents next lock taken.
 *
 *     >        | W | L | N | D | F
 * -------------|---|---|---|---|---
 * FileWriter W |   | > | > | > | >
 * log        L |       | > | > | >
 * nodes      N |           | > | >
 * dirty      D |           |   | >
 * File       F |
 *
 * Claim: Deadlock is possible IFF graph contains cycles.
 */
7c673cae | 758 | #endif |