]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | #ifndef CEPH_OS_BLUESTORE_BLUEFS_H | |
4 | #define CEPH_OS_BLUESTORE_BLUEFS_H | |
5 | ||
6 | #include <atomic> | |
7 | #include <mutex> | |
cd265ab1 | 8 | #include <limits> |
7c673cae FG |
9 | |
10 | #include "bluefs_types.h" | |
f67539c2 | 11 | #include "blk/BlockDevice.h" |
7c673cae | 12 | |
9f95a23c TL |
13 | #include "common/RefCountedObj.h" |
14 | #include "common/ceph_context.h" | |
15 | #include "global/global_context.h" | |
16 | #include "include/common_fwd.h" | |
7c673cae | 17 | |
9f95a23c TL |
18 | #include "boost/intrusive/list.hpp" |
19 | #include "boost/dynamic_bitset.hpp" | |
7c673cae FG |
20 | |
21 | class Allocator; | |
22 | ||
// Perf counter identifiers for BlueFS.
// Values are assigned sequentially starting at l_bluefs_first, so the
// declaration order below defines each counter's numeric id;
// l_bluefs_last closes the range.
enum {
  l_bluefs_first = 732600,

  // per-device totals / usage
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,

  // files and log
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_log_write_count,
  l_bluefs_logged_bytes,

  // writes, split by WAL / SST / slow
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_write_count_wal,
  l_bluefs_write_count_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,

  // max bytes per device
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,

  // allocation units
  l_bluefs_main_alloc_unit,
  l_bluefs_db_alloc_unit,
  l_bluefs_wal_alloc_unit,

  // random reads
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_disk_bytes_wal,
  l_bluefs_read_random_disk_bytes_db,
  l_bluefs_read_random_disk_bytes_slow,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,

  // reads
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_disk_count,
  l_bluefs_read_disk_bytes,
  l_bluefs_read_disk_bytes_wal,
  l_bluefs_read_disk_bytes_db,
  l_bluefs_read_disk_bytes_slow,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,

  // writes (overall)
  l_bluefs_write_count,
  l_bluefs_write_disk_count,
  l_bluefs_write_bytes,

  // compaction latency
  l_bluefs_compaction_lat,
  l_bluefs_compaction_lock_lat,

  // shared-allocator fallbacks
  l_bluefs_alloc_shared_dev_fallbacks,
  l_bluefs_alloc_shared_size_fallbacks,

  // zero-read detection
  l_bluefs_read_zeros_candidate,
  l_bluefs_read_zeros_errors,

  l_bluefs_last,
};
78 | ||
9f95a23c TL |
79 | class BlueFSVolumeSelector { |
80 | public: | |
81 | typedef std::vector<std::pair<std::string, uint64_t>> paths; | |
82 | ||
83 | virtual ~BlueFSVolumeSelector() { | |
84 | } | |
f6b5b4d7 | 85 | virtual void* get_hint_for_log() const = 0; |
b3b6e05e | 86 | virtual void* get_hint_by_dir(std::string_view dirname) const = 0; |
9f95a23c TL |
87 | |
88 | virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
89 | virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
90 | virtual void add_usage(void* file_hint, uint64_t fsize) = 0; | |
91 | virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; | |
92 | virtual uint8_t select_prefer_bdev(void* hint) = 0; | |
93 | virtual void get_paths(const std::string& base, paths& res) const = 0; | |
f67539c2 | 94 | virtual void dump(std::ostream& sout) = 0; |
20effc67 TL |
95 | |
96 | /* used for sanity checking of vselector */ | |
97 | virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; } | |
98 | virtual bool compare(BlueFSVolumeSelector* other) { return true; }; | |
f67539c2 TL |
99 | }; |
100 | ||
101 | struct bluefs_shared_alloc_context_t { | |
102 | bool need_init = false; | |
103 | Allocator* a = nullptr; | |
39ae355f | 104 | uint64_t alloc_unit = 0; |
f67539c2 TL |
105 | |
106 | std::atomic<uint64_t> bluefs_used = 0; | |
107 | ||
39ae355f | 108 | void set(Allocator* _a, uint64_t _au) { |
f67539c2 | 109 | a = _a; |
39ae355f | 110 | alloc_unit = _au; |
f67539c2 TL |
111 | need_init = true; |
112 | bluefs_used = 0; | |
113 | } | |
114 | void reset() { | |
115 | a = nullptr; | |
39ae355f | 116 | alloc_unit = 0; |
f67539c2 | 117 | } |
11fdf7f2 TL |
118 | }; |
119 | ||
7c673cae FG |
120 | class BlueFS { |
121 | public: | |
122 | CephContext* cct; | |
/// Number of device slots; sizes the per-device arrays (e.g. FileWriter::iocv).
static constexpr unsigned MAX_BDEV = 5;

/// Device slot ids, usable as indices into the per-device containers.
static constexpr unsigned BDEV_WAL = 0;
static constexpr unsigned BDEV_DB = 1;
static constexpr unsigned BDEV_SLOW = 2;
// NOTE(review): the NEW* slots look like staging ids used while migrating
// to a new WAL/DB device - confirm against the migration code.
static constexpr unsigned BDEV_NEWWAL = 3;
static constexpr unsigned BDEV_NEWDB = 4;

/// Values stored in FileWriter::writer_type.
enum {
  WRITER_UNKNOWN,
  WRITER_WAL,
  WRITER_SST,
};
135 | ||
136 | struct File : public RefCountedObject { | |
137 | MEMPOOL_CLASS_HELPERS(); | |
138 | ||
139 | bluefs_fnode_t fnode; | |
140 | int refs; | |
141 | uint64_t dirty_seq; | |
142 | bool locked; | |
143 | bool deleted; | |
522d829b | 144 | bool is_dirty; |
7c673cae FG |
145 | boost::intrusive::list_member_hook<> dirty_item; |
146 | ||
147 | std::atomic_int num_readers, num_writers; | |
148 | std::atomic_int num_reading; | |
149 | ||
9f95a23c | 150 | void* vselector_hint = nullptr; |
20effc67 TL |
151 | /* lock protects fnode and other the parts that can be modified during read & write operations. |
152 | Does not protect values that are fixed | |
153 | Does not need to be taken when doing one-time operations: | |
154 | _replay, device_migrate_to_existing, device_migrate_to_new */ | |
155 | ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock"); | |
9f95a23c TL |
156 | |
157 | private: | |
158 | FRIEND_MAKE_REF(File); | |
7c673cae | 159 | File() |
9f95a23c | 160 | : |
7c673cae FG |
161 | refs(0), |
162 | dirty_seq(0), | |
163 | locked(false), | |
164 | deleted(false), | |
522d829b | 165 | is_dirty(false), |
7c673cae FG |
166 | num_readers(0), |
167 | num_writers(0), | |
9f95a23c TL |
168 | num_reading(0), |
169 | vselector_hint(nullptr) | |
7c673cae FG |
170 | {} |
171 | ~File() override { | |
11fdf7f2 TL |
172 | ceph_assert(num_readers.load() == 0); |
173 | ceph_assert(num_writers.load() == 0); | |
174 | ceph_assert(num_reading.load() == 0); | |
175 | ceph_assert(!locked); | |
7c673cae | 176 | } |
7c673cae | 177 | }; |
9f95a23c | 178 | using FileRef = ceph::ref_t<File>; |
7c673cae FG |
179 | |
180 | typedef boost::intrusive::list< | |
181 | File, | |
182 | boost::intrusive::member_hook< | |
183 | File, | |
184 | boost::intrusive::list_member_hook<>, | |
185 | &File::dirty_item> > dirty_file_list_t; | |
186 | ||
187 | struct Dir : public RefCountedObject { | |
188 | MEMPOOL_CLASS_HELPERS(); | |
189 | ||
b3b6e05e | 190 | mempool::bluefs::map<std::string, FileRef, std::less<>> file_map; |
7c673cae | 191 | |
9f95a23c TL |
192 | private: |
193 | FRIEND_MAKE_REF(Dir); | |
194 | Dir() = default; | |
7c673cae | 195 | }; |
9f95a23c | 196 | using DirRef = ceph::ref_t<Dir>; |
7c673cae FG |
197 | |
198 | struct FileWriter { | |
199 | MEMPOOL_CLASS_HELPERS(); | |
200 | ||
201 | FileRef file; | |
9f95a23c | 202 | uint64_t pos = 0; ///< start offset for buffer |
f67539c2 TL |
203 | private: |
204 | ceph::buffer::list buffer; ///< new data to write (at end of file) | |
205 | ceph::buffer::list tail_block; ///< existing partial block at end of file, if any | |
206 | public: | |
207 | unsigned get_buffer_length() const { | |
208 | return buffer.length(); | |
209 | } | |
210 | ceph::bufferlist flush_buffer( | |
211 | CephContext* cct, | |
212 | const bool partial, | |
213 | const unsigned length, | |
214 | const bluefs_super_t& super); | |
215 | ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only | |
216 | public: | |
7c673cae | 217 | int writer_type = 0; ///< WRITER_* |
11fdf7f2 | 218 | int write_hint = WRITE_LIFE_NOT_SET; |
7c673cae | 219 | |
11fdf7f2 | 220 | ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock"); |
7c673cae | 221 | std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev |
11fdf7f2 | 222 | std::array<bool, MAX_BDEV> dirty_devs; |
7c673cae FG |
223 | |
224 | FileWriter(FileRef f) | |
9f95a23c | 225 | : file(std::move(f)), |
f67539c2 TL |
226 | buffer_appender(buffer.get_page_aligned_appender( |
227 | g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { | |
7c673cae FG |
228 | ++file->num_writers; |
229 | iocv.fill(nullptr); | |
11fdf7f2 | 230 | dirty_devs.fill(false); |
9f95a23c | 231 | if (file->fnode.ino == 1) { |
11fdf7f2 TL |
232 | write_hint = WRITE_LIFE_MEDIUM; |
233 | } | |
7c673cae FG |
234 | } |
235 | // NOTE: caller must call BlueFS::close_writer() | |
236 | ~FileWriter() { | |
237 | --file->num_writers; | |
238 | } | |
239 | ||
240 | // note: BlueRocksEnv uses this append exclusively, so it's safe | |
20effc67 | 241 | // to use buffer_appender exclusively here (e.g., its notion of |
7c673cae FG |
242 | // offset will remain accurate). |
243 | void append(const char *buf, size_t len) { | |
f67539c2 | 244 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 245 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
246 | buffer_appender.append(buf, len); |
247 | } | |
248 | ||
20effc67 TL |
249 | void append(const std::byte *buf, size_t len) { |
250 | // allow callers to use byte type instead of char* as we simply pass byte array | |
251 | append((const char*)buf, len); | |
252 | } | |
253 | ||
7c673cae | 254 | // note: used internally only, for ino 1 or 0. |
cd265ab1 | 255 | void append(ceph::buffer::list& bl) { |
f67539c2 | 256 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 257 | ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
258 | buffer.claim_append(bl); |
259 | } | |
260 | ||
f67539c2 TL |
261 | void append_zero(size_t len) { |
262 | uint64_t l0 = get_buffer_length(); | |
263 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); | |
264 | buffer_appender.append_zero(len); | |
265 | } | |
266 | ||
7c673cae | 267 | uint64_t get_effective_write_pos() { |
7c673cae FG |
268 | return pos + buffer.length(); |
269 | } | |
270 | }; | |
271 | ||
272 | struct FileReaderBuffer { | |
273 | MEMPOOL_CLASS_HELPERS(); | |
274 | ||
9f95a23c | 275 | uint64_t bl_off = 0; ///< prefetch buffer logical offset |
f67539c2 | 276 | ceph::buffer::list bl; ///< prefetch buffer |
9f95a23c | 277 | uint64_t pos = 0; ///< current logical offset |
7c673cae FG |
278 | uint64_t max_prefetch; ///< max allowed prefetch |
279 | ||
280 | explicit FileReaderBuffer(uint64_t mpf) | |
9f95a23c | 281 | : max_prefetch(mpf) {} |
7c673cae | 282 | |
9f95a23c | 283 | uint64_t get_buf_end() const { |
7c673cae FG |
284 | return bl_off + bl.length(); |
285 | } | |
9f95a23c | 286 | uint64_t get_buf_remaining(uint64_t p) const { |
7c673cae FG |
287 | if (p >= bl_off && p < bl_off + bl.length()) |
288 | return bl_off + bl.length() - p; | |
289 | return 0; | |
290 | } | |
291 | ||
292 | void skip(size_t n) { | |
293 | pos += n; | |
294 | } | |
f67539c2 TL |
295 | |
296 | // For the sake of simplicity, we invalidate completed rather than | |
297 | // for the provided extent | |
298 | void invalidate_cache(uint64_t offset, uint64_t length) { | |
299 | if (offset >= bl_off && offset < get_buf_end()) { | |
300 | bl.clear(); | |
301 | bl_off = 0; | |
302 | } | |
7c673cae FG |
303 | } |
304 | }; | |
305 | ||
306 | struct FileReader { | |
307 | MEMPOOL_CLASS_HELPERS(); | |
308 | ||
309 | FileRef file; | |
310 | FileReaderBuffer buf; | |
311 | bool random; | |
312 | bool ignore_eof; ///< used when reading our log file | |
313 | ||
494da23a TL |
314 | ceph::shared_mutex lock { |
315 | ceph::make_shared_mutex(std::string(), false, false, false) | |
316 | }; | |
317 | ||
318 | ||
7c673cae FG |
319 | FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) |
320 | : file(f), | |
321 | buf(mpf), | |
322 | random(rand), | |
323 | ignore_eof(ie) { | |
324 | ++file->num_readers; | |
325 | } | |
326 | ~FileReader() { | |
327 | --file->num_readers; | |
328 | } | |
329 | }; | |
330 | ||
331 | struct FileLock { | |
332 | MEMPOOL_CLASS_HELPERS(); | |
333 | ||
334 | FileRef file; | |
9f95a23c | 335 | explicit FileLock(FileRef f) : file(std::move(f)) {} |
7c673cae FG |
336 | }; |
337 | ||
338 | private: | |
7c673cae FG |
339 | PerfCounters *logger = nullptr; |
340 | ||
11fdf7f2 TL |
341 | uint64_t max_bytes[MAX_BDEV] = {0}; |
342 | uint64_t max_bytes_pcounters[MAX_BDEV] = { | |
343 | l_bluefs_max_bytes_wal, | |
344 | l_bluefs_max_bytes_db, | |
345 | l_bluefs_max_bytes_slow, | |
39ae355f TL |
346 | l_bluefs_max_bytes_wal, |
347 | l_bluefs_max_bytes_db, | |
11fdf7f2 TL |
348 | }; |
349 | ||
7c673cae | 350 | // cache |
20effc67 TL |
351 | struct { |
352 | ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock"); | |
353 | mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map; ///< dirname -> Dir | |
354 | mempool::bluefs::unordered_map<uint64_t, FileRef> file_map; ///< ino -> File | |
355 | } nodes; | |
7c673cae FG |
356 | |
357 | bluefs_super_t super; ///< latest superblock (as last written) | |
358 | uint64_t ino_last = 0; ///< last assigned ino (this one is in use) | |
7c673cae | 359 | |
20effc67 TL |
360 | struct { |
361 | ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock"); | |
362 | uint64_t seq_live = 1; //seq that log is currently writing to; mirrors dirty.seq_live | |
363 | FileWriter *writer = 0; | |
364 | bluefs_transaction_t t; | |
365 | } log; | |
366 | ||
367 | struct { | |
368 | ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock"); | |
369 | uint64_t seq_stable = 0; //seq that is now stable on disk | |
370 | uint64_t seq_live = 1; //seq that is ongoing and dirty files will be written to | |
371 | // map of dirty files, files of same dirty_seq are grouped into list. | |
372 | std::map<uint64_t, dirty_file_list_t> files; | |
373 | std::vector<interval_set<uint64_t>> pending_release; ///< extents to release | |
374 | // TODO: it should be examined what makes pending_release immune to | |
375 | // eras in a way similar to dirty_files. Hints: | |
376 | // 1) we have actually only 2 eras: log_seq and log_seq+1 | |
377 | // 2) we usually not remove extents from files. And when we do, we force log-syncing. | |
378 | } dirty; | |
379 | ||
380 | ceph::condition_variable log_cond; ///< used for state control between log flush / log compaction | |
381 | std::atomic<bool> log_is_compacting{false}; ///< signals that bluefs log is already ongoing compaction | |
382 | std::atomic<bool> log_forbidden_to_expand{false}; ///< used to signal that async compaction is in state | |
383 | /// that prohibits expansion of bluefs log | |
7c673cae FG |
384 | /* |
385 | * There are up to 3 block devices: | |
386 | * | |
387 | * BDEV_DB db/ - the primary db device | |
388 | * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL | |
389 | * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills | |
390 | */ | |
f67539c2 TL |
391 | std::vector<BlockDevice*> bdev; ///< block devices we can use |
392 | std::vector<IOContext*> ioc; ///< IOContexts for bdevs | |
393 | std::vector<uint64_t> block_reserved; ///< starting reserve extent per device | |
394 | std::vector<Allocator*> alloc; ///< allocators for bdevs | |
395 | std::vector<uint64_t> alloc_size; ///< alloc size for each device | |
20effc67 | 396 | |
f67539c2 | 397 | //std::vector<interval_set<uint64_t>> block_unused_too_granular; |
7c673cae | 398 | |
11fdf7f2 TL |
399 | BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev |
400 | ||
9f95a23c | 401 | std::unique_ptr<BlueFSVolumeSelector> vselector; |
11fdf7f2 | 402 | |
f67539c2 TL |
403 | bluefs_shared_alloc_context_t* shared_alloc = nullptr; |
404 | unsigned shared_alloc_id = unsigned(-1); | |
405 | inline bool is_shared_alloc(unsigned id) const { | |
406 | return id == shared_alloc_id; | |
407 | } | |
39ae355f | 408 | std::atomic<int64_t> cooldown_deadline = 0; |
f67539c2 | 409 | |
eafe8130 TL |
410 | class SocketHook; |
411 | SocketHook* asok_hook = nullptr; | |
cd265ab1 TL |
412 | // used to trigger zeros into read (debug / verify) |
413 | std::atomic<uint64_t> inject_read_zeros{0}; | |
eafe8130 | 414 | |
7c673cae FG |
415 | void _init_logger(); |
416 | void _shutdown_logger(); | |
417 | void _update_logger_stats(); | |
418 | ||
419 | void _init_alloc(); | |
420 | void _stop_alloc(); | |
421 | ||
39ae355f TL |
422 | ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros |
423 | void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0); | |
f67539c2 TL |
424 | |
425 | uint64_t _get_used(unsigned id) const; | |
426 | uint64_t _get_total(unsigned id) const; | |
427 | ||
7c673cae FG |
428 | |
429 | FileRef _get_file(uint64_t ino); | |
20effc67 | 430 | void _drop_link_D(FileRef f); |
7c673cae | 431 | |
1911f103 TL |
432 | unsigned _get_slow_device_id() { |
433 | return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; | |
434 | } | |
eafe8130 | 435 | const char* get_device_name(unsigned id); |
7c673cae | 436 | int _allocate(uint8_t bdev, uint64_t len, |
39ae355f TL |
437 | uint64_t alloc_unit, |
438 | bluefs_fnode_t* node, | |
439 | size_t alloc_attempts = 0, | |
440 | bool permit_dev_fallback = true); | |
11fdf7f2 | 441 | |
522d829b | 442 | /* signal replay log to include h->file in nearest log flush */ |
20effc67 TL |
443 | int _signal_dirty_to_log_D(FileWriter *h); |
444 | int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length); | |
445 | int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered); | |
446 | int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr); | |
447 | uint64_t _flush_special(FileWriter *h); | |
448 | int _fsync(FileWriter *h); | |
7c673cae | 449 | |
11fdf7f2 | 450 | #ifdef HAVE_LIBAIO |
f67539c2 | 451 | void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls); |
20effc67 | 452 | void _wait_for_aio(FileWriter *h); // safe to call without a lock |
11fdf7f2 | 453 | #endif |
7c673cae | 454 | |
20effc67 TL |
455 | int64_t _maybe_extend_log(); |
456 | void _extend_log(); | |
457 | uint64_t _log_advance_seq(); | |
458 | void _consume_dirty(uint64_t seq); | |
459 | void _clear_dirty_set_stable_D(uint64_t seq_stable); | |
460 | void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release); | |
461 | ||
462 | void _flush_and_sync_log_core(int64_t available_runway); | |
463 | int _flush_and_sync_log_jump_D(uint64_t jump_to, | |
464 | int64_t available_runway); | |
465 | int _flush_and_sync_log_LD(uint64_t want_seq = 0); | |
466 | ||
39ae355f TL |
467 | uint64_t _estimate_transaction_size(bluefs_transaction_t* t); |
468 | uint64_t _make_initial_transaction(uint64_t start_seq, | |
469 | bluefs_fnode_t& fnode, | |
470 | uint64_t expected_final_size, | |
471 | bufferlist* out); | |
20effc67 TL |
472 | uint64_t _estimate_log_size_N(); |
473 | bool _should_start_compact_log_L_N(); | |
11fdf7f2 TL |
474 | |
475 | enum { | |
476 | REMOVE_DB = 1, | |
477 | REMOVE_WAL = 2, | |
478 | RENAME_SLOW2DB = 4, | |
479 | RENAME_DB2SLOW = 8, | |
480 | }; | |
39ae355f TL |
481 | void _compact_log_dump_metadata_NF(uint64_t start_seq, |
482 | bluefs_transaction_t *t, | |
483 | int flags, | |
484 | uint64_t capture_before_seq); | |
11fdf7f2 | 485 | |
20effc67 TL |
486 | void _compact_log_sync_LNF_LD(); |
487 | void _compact_log_async_LD_LNF_D(); | |
488 | ||
39ae355f | 489 | void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, |
9f95a23c TL |
490 | int super_dev, |
491 | int log_dev, | |
492 | int new_log_dev, | |
493 | int flags, | |
494 | std::optional<bluefs_layout_t> layout); | |
7c673cae FG |
495 | |
496 | //void _aio_finish(void *priv); | |
497 | ||
39ae355f | 498 | void _flush_bdev(FileWriter *h, bool check_mutex_locked = true); |
20effc67 TL |
499 | void _flush_bdev(); // this is safe to call without a lock |
500 | void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock | |
7c673cae FG |
501 | |
502 | int _preallocate(FileRef f, uint64_t off, uint64_t len); | |
503 | int _truncate(FileWriter *h, uint64_t off); | |
504 | ||
adb31ebb | 505 | int64_t _read( |
7c673cae | 506 | FileReader *h, ///< [in] read from here |
7c673cae FG |
507 | uint64_t offset, ///< [in] offset |
508 | size_t len, ///< [in] this many bytes | |
f67539c2 | 509 | ceph::buffer::list *outbl, ///< [out] optional: reference the result here |
7c673cae | 510 | char *out); ///< [out] optional: or copy it here |
adb31ebb | 511 | int64_t _read_random( |
7c673cae FG |
512 | FileReader *h, ///< [in] read from here |
513 | uint64_t offset, ///< [in] offset | |
9f95a23c | 514 | uint64_t len, ///< [in] this many bytes |
7c673cae FG |
515 | char *out); ///< [out] optional: or copy it here |
516 | ||
7c673cae | 517 | int _open_super(); |
11fdf7f2 | 518 | int _write_super(int dev); |
20effc67 TL |
519 | int _check_allocations(const bluefs_fnode_t& fnode, |
520 | boost::dynamic_bitset<uint64_t>* used_blocks, | |
521 | bool is_alloc, //true when allocating, false when deallocating | |
522 | const char* op_name); | |
9f95a23c TL |
523 | int _verify_alloc_granularity( |
524 | __u8 id, uint64_t offset, uint64_t length, | |
39ae355f | 525 | uint64_t alloc_unit, |
9f95a23c | 526 | const char *op); |
11fdf7f2 | 527 | int _replay(bool noop, bool to_stdout = false); ///< replay journal |
7c673cae FG |
528 | |
529 | FileWriter *_create_writer(FileRef f); | |
20effc67 | 530 | void _drain_writer(FileWriter *h); |
7c673cae FG |
531 | void _close_writer(FileWriter *h); |
532 | ||
// always put the super in the second 4k block.  FIXME should this be
// block size independent?
/// Byte offset of the superblock on its device.
unsigned get_super_offset() {
  constexpr unsigned second_4k_block = 4096;
  return second_4k_block;
}
/// Size in bytes reserved for the superblock.
unsigned get_super_length() {
  constexpr unsigned super_bytes = 4096;
  return super_bytes;
}
20effc67 TL |
541 | void _maybe_check_vselector_LNF() { |
542 | if (cct->_conf->bluefs_check_volume_selector_often) { | |
543 | _check_vselector_LNF(); | |
544 | } | |
545 | } | |
7c673cae FG |
546 | public: |
547 | BlueFS(CephContext* cct); | |
548 | ~BlueFS(); | |
549 | ||
550 | // the super is always stored on bdev 0 | |
9f95a23c | 551 | int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout); |
7c673cae | 552 | int mount(); |
9f95a23c | 553 | int maybe_verify_layout(const bluefs_layout_t& layout) const; |
1911f103 | 554 | void umount(bool avoid_compact = false); |
9f95a23c | 555 | int prepare_new_device(int id, const bluefs_layout_t& layout); |
11fdf7f2 TL |
556 | |
557 | int log_dump(); | |
7c673cae | 558 | |
f67539c2 TL |
559 | void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id); |
560 | void get_devices(std::set<std::string> *ls); | |
eafe8130 TL |
561 | uint64_t get_alloc_size(int id) { |
562 | return alloc_size[id]; | |
563 | } | |
7c673cae FG |
564 | int fsck(); |
565 | ||
11fdf7f2 TL |
566 | int device_migrate_to_new( |
567 | CephContext *cct, | |
f67539c2 | 568 | const std::set<int>& devs_source, |
9f95a23c TL |
569 | int dev_target, |
570 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
571 | int device_migrate_to_existing( |
572 | CephContext *cct, | |
f67539c2 | 573 | const std::set<int>& devs_source, |
9f95a23c TL |
574 | int dev_target, |
575 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
576 | |
577 | uint64_t get_used(); | |
7c673cae FG |
578 | uint64_t get_total(unsigned id); |
579 | uint64_t get_free(unsigned id); | |
f67539c2 TL |
580 | uint64_t get_used(unsigned id); |
581 | void dump_perf_counters(ceph::Formatter *f); | |
7c673cae | 582 | |
f67539c2 | 583 | void dump_block_extents(std::ostream& out); |
3efd9988 | 584 | |
7c673cae | 585 | /// get current extents that we own for given block device |
1e59de90 TL |
586 | void foreach_block_extents( |
587 | unsigned id, | |
588 | std::function<void(uint64_t, uint32_t)> cb); | |
7c673cae FG |
589 | |
590 | int open_for_write( | |
b3b6e05e TL |
591 | std::string_view dir, |
592 | std::string_view file, | |
7c673cae FG |
593 | FileWriter **h, |
594 | bool overwrite); | |
595 | ||
596 | int open_for_read( | |
b3b6e05e TL |
597 | std::string_view dir, |
598 | std::string_view file, | |
7c673cae FG |
599 | FileReader **h, |
600 | bool random = false); | |
601 | ||
20effc67 TL |
602 | // data added after last fsync() is lost |
603 | void close_writer(FileWriter *h); | |
7c673cae | 604 | |
b3b6e05e TL |
605 | int rename(std::string_view old_dir, std::string_view old_file, |
606 | std::string_view new_dir, std::string_view new_file); | |
7c673cae | 607 | |
b3b6e05e | 608 | int readdir(std::string_view dirname, std::vector<std::string> *ls); |
7c673cae | 609 | |
b3b6e05e TL |
610 | int unlink(std::string_view dirname, std::string_view filename); |
611 | int mkdir(std::string_view dirname); | |
612 | int rmdir(std::string_view dirname); | |
d2e6a577 | 613 | bool wal_is_rotational(); |
1d09f67e | 614 | bool db_is_rotational(); |
7c673cae | 615 | |
b3b6e05e TL |
616 | bool dir_exists(std::string_view dirname); |
617 | int stat(std::string_view dirname, std::string_view filename, | |
7c673cae FG |
618 | uint64_t *size, utime_t *mtime); |
619 | ||
b3b6e05e | 620 | int lock_file(std::string_view dirname, std::string_view filename, FileLock **p); |
7c673cae FG |
621 | int unlock_file(FileLock *l); |
622 | ||
7c673cae FG |
623 | void compact_log(); |
624 | ||
625 | /// sync any uncommitted state to disk | |
1911f103 | 626 | void sync_metadata(bool avoid_compact); |
7c673cae | 627 | |
9f95a23c TL |
628 | void set_volume_selector(BlueFSVolumeSelector* s) { |
629 | vselector.reset(s); | |
630 | } | |
f67539c2 | 631 | void dump_volume_selector(std::ostream& sout) { |
9f95a23c TL |
632 | vselector->dump(sout); |
633 | } | |
634 | void get_vselector_paths(const std::string& base, | |
635 | BlueFSVolumeSelector::paths& res) const { | |
636 | return vselector->get_paths(base, res); | |
637 | } | |
638 | ||
f67539c2 TL |
639 | int add_block_device(unsigned bdev, const std::string& path, bool trim, |
640 | uint64_t reserved, | |
641 | bluefs_shared_alloc_context_t* _shared_alloc = nullptr); | |
7c673cae | 642 | bool bdev_support_label(unsigned id); |
f67539c2 | 643 | uint64_t get_block_device_size(unsigned bdev) const; |
7c673cae | 644 | |
11fdf7f2 TL |
645 | // handler for discard event |
646 | void handle_discard(unsigned dev, interval_set<uint64_t>& to_release); | |
647 | ||
20effc67 | 648 | void flush(FileWriter *h, bool force = false); |
cd265ab1 | 649 | |
20effc67 TL |
650 | void append_try_flush(FileWriter *h, const char* buf, size_t len); |
651 | void flush_range(FileWriter *h, uint64_t offset, uint64_t length); | |
652 | int fsync(FileWriter *h); | |
f67539c2 TL |
653 | int64_t read(FileReader *h, uint64_t offset, size_t len, |
654 | ceph::buffer::list *outbl, char *out) { | |
7c673cae FG |
655 | // no need to hold the global lock here; we only touch h and |
656 | // h->file, and read vs write or delete is already protected (via | |
657 | // atomics and asserts). | |
f67539c2 | 658 | return _read(h, offset, len, outbl, out); |
7c673cae | 659 | } |
adb31ebb | 660 | int64_t read_random(FileReader *h, uint64_t offset, size_t len, |
7c673cae FG |
661 | char *out) { |
662 | // no need to hold the global lock here; we only touch h and | |
663 | // h->file, and read vs write or delete is already protected (via | |
664 | // atomics and asserts). | |
665 | return _read_random(h, offset, len, out); | |
666 | } | |
20effc67 TL |
667 | void invalidate_cache(FileRef f, uint64_t offset, uint64_t len); |
668 | int preallocate(FileRef f, uint64_t offset, uint64_t len); | |
669 | int truncate(FileWriter *h, uint64_t offset); | |
7c673cae | 670 | |
f67539c2 TL |
671 | size_t probe_alloc_avail(int dev, uint64_t alloc_size); |
672 | ||
9f95a23c | 673 | /// test purpose methods |
9f95a23c TL |
674 | const PerfCounters* get_perf_counters() const { |
675 | return logger; | |
676 | } | |
522d829b TL |
677 | uint64_t debug_get_dirty_seq(FileWriter *h); |
678 | bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev); | |
cd265ab1 TL |
679 | |
680 | private: | |
681 | // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...) | |
682 | // They are used for checking if read values are all 0, and reread if so. | |
20effc67 | 683 | int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len, |
cd265ab1 | 684 | ceph::buffer::list *pbl, IOContext *ioc, bool buffered); |
20effc67 TL |
685 | int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered); |
686 | ||
687 | int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len, | |
688 | ceph::buffer::list* pbl, IOContext* ioc, bool buffered); | |
689 | int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered); | |
690 | ||
691 | /// test and compact log, if necessary | |
692 | void _maybe_compact_log_LNF_NF_LD_D(); | |
693 | int _do_replay_recovery_read(FileReader *log, | |
694 | size_t log_pos, | |
695 | size_t read_offset, | |
696 | size_t read_len, | |
697 | bufferlist* bl); | |
698 | void _check_vselector_LNF(); | |
9f95a23c TL |
699 | }; |
700 | ||
701 | class OriginalVolumeSelector : public BlueFSVolumeSelector { | |
702 | uint64_t wal_total; | |
703 | uint64_t db_total; | |
704 | uint64_t slow_total; | |
705 | ||
706 | public: | |
707 | OriginalVolumeSelector( | |
708 | uint64_t _wal_total, | |
709 | uint64_t _db_total, | |
710 | uint64_t _slow_total) | |
711 | : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {} | |
712 | ||
f6b5b4d7 | 713 | void* get_hint_for_log() const override; |
b3b6e05e | 714 | void* get_hint_by_dir(std::string_view dirname) const override; |
9f95a23c TL |
715 | |
716 | void add_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
717 | // do nothing | |
718 | return; | |
719 | } | |
720 | void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
721 | // do nothing | |
722 | return; | |
723 | } | |
724 | void add_usage(void* hint, uint64_t fsize) override { | |
725 | // do nothing | |
726 | return; | |
727 | } | |
728 | void sub_usage(void* hint, uint64_t fsize) override { | |
729 | // do nothing | |
730 | return; | |
731 | } | |
732 | ||
733 | uint8_t select_prefer_bdev(void* hint) override; | |
734 | void get_paths(const std::string& base, paths& res) const override; | |
f67539c2 TL |
735 | void dump(std::ostream& sout) override; |
736 | }; | |
737 | ||
738 | class FitToFastVolumeSelector : public OriginalVolumeSelector { | |
739 | public: | |
740 | FitToFastVolumeSelector( | |
741 | uint64_t _wal_total, | |
742 | uint64_t _db_total, | |
743 | uint64_t _slow_total) | |
744 | : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {} | |
745 | ||
746 | void get_paths(const std::string& base, paths& res) const override; | |
7c673cae | 747 | }; |
20effc67 TL |
748 | /** |
749 | * Directed graph of locks. |
750 | * Vertices - Locks. Edges (directed) - locking progression. |
751 | * Edge A->B exists if last taken lock was A and next taken lock is B. |
752 | * | |
753 | * Row represents last lock taken. | |
754 | * Column represents next lock taken. | |
755 | * | |
756 | * > | W | L | N | D | F | |
757 | * -------------|---|---|---|---|--- | |
758 | * FileWriter W | | > | > | > | > | |
759 | * log L | | > | > | > | |
760 | * nodes N | | > | > | |
761 | * dirty D | | | > | |
762 | * File F | | |
763 | * | |
764 | * Claim: Deadlock is possible IFF graph contains cycles. | |
765 | */ | |
7c673cae | 766 | #endif |