]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | #ifndef CEPH_OS_BLUESTORE_BLUEFS_H | |
4 | #define CEPH_OS_BLUESTORE_BLUEFS_H | |
5 | ||
6 | #include <atomic> | |
7 | #include <mutex> | |
cd265ab1 | 8 | #include <limits> |
7c673cae FG |
9 | |
10 | #include "bluefs_types.h" | |
f67539c2 | 11 | #include "blk/BlockDevice.h" |
7c673cae | 12 | |
9f95a23c TL |
13 | #include "common/RefCountedObj.h" |
14 | #include "common/ceph_context.h" | |
15 | #include "global/global_context.h" | |
16 | #include "include/common_fwd.h" | |
7c673cae | 17 | |
9f95a23c TL |
18 | #include "boost/intrusive/list.hpp" |
19 | #include "boost/dynamic_bitset.hpp" | |
7c673cae FG |
20 | |
21 | class Allocator; | |
22 | ||
23 | enum { | |
24 | l_bluefs_first = 732600, | |
7c673cae FG |
25 | l_bluefs_db_total_bytes, |
26 | l_bluefs_db_used_bytes, | |
27 | l_bluefs_wal_total_bytes, | |
28 | l_bluefs_wal_used_bytes, | |
29 | l_bluefs_slow_total_bytes, | |
30 | l_bluefs_slow_used_bytes, | |
31 | l_bluefs_num_files, | |
32 | l_bluefs_log_bytes, | |
33 | l_bluefs_log_compactions, | |
1e59de90 | 34 | l_bluefs_log_write_count, |
7c673cae FG |
35 | l_bluefs_logged_bytes, |
36 | l_bluefs_files_written_wal, | |
37 | l_bluefs_files_written_sst, | |
1e59de90 TL |
38 | l_bluefs_write_count_wal, |
39 | l_bluefs_write_count_sst, | |
7c673cae FG |
40 | l_bluefs_bytes_written_wal, |
41 | l_bluefs_bytes_written_sst, | |
11fdf7f2 TL |
42 | l_bluefs_bytes_written_slow, |
43 | l_bluefs_max_bytes_wal, | |
44 | l_bluefs_max_bytes_db, | |
45 | l_bluefs_max_bytes_slow, | |
f51cf556 | 46 | l_bluefs_slow_alloc_unit, |
20effc67 TL |
47 | l_bluefs_db_alloc_unit, |
48 | l_bluefs_wal_alloc_unit, | |
f51cf556 | 49 | l_bluefs_read_random_lat, |
494da23a TL |
50 | l_bluefs_read_random_count, |
51 | l_bluefs_read_random_bytes, | |
52 | l_bluefs_read_random_disk_count, | |
53 | l_bluefs_read_random_disk_bytes, | |
20effc67 TL |
54 | l_bluefs_read_random_disk_bytes_wal, |
55 | l_bluefs_read_random_disk_bytes_db, | |
56 | l_bluefs_read_random_disk_bytes_slow, | |
494da23a TL |
57 | l_bluefs_read_random_buffer_count, |
58 | l_bluefs_read_random_buffer_bytes, | |
f51cf556 | 59 | l_bluefs_read_lat, |
494da23a TL |
60 | l_bluefs_read_count, |
61 | l_bluefs_read_bytes, | |
20effc67 TL |
62 | l_bluefs_read_disk_count, |
63 | l_bluefs_read_disk_bytes, | |
64 | l_bluefs_read_disk_bytes_wal, | |
65 | l_bluefs_read_disk_bytes_db, | |
66 | l_bluefs_read_disk_bytes_slow, | |
494da23a TL |
67 | l_bluefs_read_prefetch_count, |
68 | l_bluefs_read_prefetch_bytes, | |
1e59de90 TL |
69 | l_bluefs_write_count, |
70 | l_bluefs_write_disk_count, | |
71 | l_bluefs_write_bytes, | |
39ae355f TL |
72 | l_bluefs_compaction_lat, |
73 | l_bluefs_compaction_lock_lat, | |
f51cf556 TL |
74 | l_bluefs_fsync_lat, |
75 | l_bluefs_flush_lat, | |
76 | l_bluefs_unlink_lat, | |
77 | l_bluefs_truncate_lat, | |
39ae355f TL |
78 | l_bluefs_alloc_shared_dev_fallbacks, |
79 | l_bluefs_alloc_shared_size_fallbacks, | |
cd265ab1 TL |
80 | l_bluefs_read_zeros_candidate, |
81 | l_bluefs_read_zeros_errors, | |
f51cf556 TL |
82 | l_bluefs_wal_alloc_lat, |
83 | l_bluefs_db_alloc_lat, | |
84 | l_bluefs_slow_alloc_lat, | |
85 | l_bluefs_wal_alloc_max_lat, | |
86 | l_bluefs_db_alloc_max_lat, | |
87 | l_bluefs_slow_alloc_max_lat, | |
7c673cae FG |
88 | l_bluefs_last, |
89 | }; | |
90 | ||
9f95a23c TL |
91 | class BlueFSVolumeSelector { |
92 | public: | |
93 | typedef std::vector<std::pair<std::string, uint64_t>> paths; | |
94 | ||
95 | virtual ~BlueFSVolumeSelector() { | |
96 | } | |
f51cf556 TL |
97 | /** |
98 | * Method to learn a hint (aka logic level discriminator) specific for | |
99 | * BlueFS log | |
100 | * | |
101 | */ | |
f6b5b4d7 | 102 | virtual void* get_hint_for_log() const = 0; |
f51cf556 TL |
103 | /** |
104 | * Method to learn a hint (aka logic level discriminator) provided directory | |
105 | * bound to. | |
106 | * | |
107 | */ | |
b3b6e05e | 108 | virtual void* get_hint_by_dir(std::string_view dirname) const = 0; |
9f95a23c | 109 | |
f51cf556 TL |
110 | /** |
111 | * Increments stats for a given logical level using provided fnode as a delta, | |
112 | * Parameters: | |
113 | * hint: logical level discriminator | |
114 | * fnode: fnode metadata to be used as a complex delta value: | |
115 | * (+1 file count, +file size, +all the extents) | |
116 | * | |
117 | */ | |
118 | void add_usage(void* hint, const bluefs_fnode_t& fnode) { | |
119 | for (auto& e : fnode.extents) { | |
120 | add_usage(hint, e); | |
121 | } | |
122 | add_usage(hint, fnode.size, true); | |
123 | } | |
124 | /** | |
125 | * Decrements stats for a given logical level using provided fnode as a delta | |
126 | * Parameters: | |
127 | * hint: logical level discriminator | |
128 | * fnode: fnode metadata to be used as a complex delta value: | |
129 | * (-1 file count, -file size, -all the extents) | |
130 | * | |
131 | */ | |
132 | void sub_usage(void* hint, const bluefs_fnode_t& fnode) { | |
133 | for (auto& e : fnode.extents) { | |
134 | sub_usage(hint, e); | |
135 | } | |
136 | sub_usage(hint, fnode.size, true); | |
137 | } | |
138 | /** | |
139 | * Increments stats for a given logical level using provided extent as a delta, | |
140 | * Parameters: | |
141 | * hint: logical level discriminator | |
142 | * extent: bluefs extent to be used as a complex delta value: | |
143 | * (.bdev determines physical location, +length) | |
144 | * | |
145 | */ | |
146 | virtual void add_usage(void* hint, const bluefs_extent_t& extent) = 0; | |
147 | /** | |
148 | * Decrements stats for a given logical level using provided extent as a delta, | |
149 | * Parameters: | |
150 | * hint: logical level discriminator | |
151 | * extent: bluefs extent to be used as a complex delta value: | |
152 | * (.bdev determines physical location, -length) | |
153 | * | |
154 | */ | |
155 | virtual void sub_usage(void* hint, const bluefs_extent_t& extent) = 0; | |
156 | /** | |
157 | * Increments files count and overall files size for a given logical level | |
158 | * Parameters: | |
159 | * hint: logical level discriminator | |
160 | * fsize: delta value for file size | |
161 | * upd_files: whether or not to increment file count | |
162 | * | |
163 | */ | |
164 | virtual void add_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0; | |
165 | /** | |
166 | * Decrements files count and overall files size for a given logical level | |
167 | * Parameters: | |
168 | * hint: logical level discriminator | |
169 | * fsize: delta value for file size | |
170 | * upd_files: whether or not to decrement file count | |
171 | * | |
172 | */ | |
173 | virtual void sub_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0; | |
174 | ||
175 | /** | |
176 | * Determines preferred physical device for the given logical level | |
177 | * Parameters: | |
178 | * hint: logical level discriminator | |
179 | * | |
180 | */ | |
9f95a23c | 181 | virtual uint8_t select_prefer_bdev(void* hint) = 0; |
f51cf556 TL |
182 | /** |
183 | * Builds path set for RocksDB to use | |
184 | * Parameters: | |
185 | * base: path's root | |
186 | * | |
187 | */ | |
9f95a23c | 188 | virtual void get_paths(const std::string& base, paths& res) const = 0; |
f51cf556 TL |
189 | /** |
190 | * Dumps VSelector's state | |
191 | * | |
192 | */ | |
f67539c2 | 193 | virtual void dump(std::ostream& sout) = 0; |
20effc67 TL |
194 | |
195 | /* used for sanity checking of vselector */ | |
196 | virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; } | |
197 | virtual bool compare(BlueFSVolumeSelector* other) { return true; }; | |
f67539c2 TL |
198 | }; |
199 | ||
200 | struct bluefs_shared_alloc_context_t { | |
201 | bool need_init = false; | |
202 | Allocator* a = nullptr; | |
39ae355f | 203 | uint64_t alloc_unit = 0; |
f67539c2 TL |
204 | |
205 | std::atomic<uint64_t> bluefs_used = 0; | |
206 | ||
39ae355f | 207 | void set(Allocator* _a, uint64_t _au) { |
f67539c2 | 208 | a = _a; |
39ae355f | 209 | alloc_unit = _au; |
f67539c2 TL |
210 | need_init = true; |
211 | bluefs_used = 0; | |
212 | } | |
213 | void reset() { | |
214 | a = nullptr; | |
39ae355f | 215 | alloc_unit = 0; |
f67539c2 | 216 | } |
11fdf7f2 TL |
217 | }; |
218 | ||
7c673cae FG |
219 | class BlueFS { |
220 | public: | |
221 | CephContext* cct; | |
11fdf7f2 | 222 | static constexpr unsigned MAX_BDEV = 5; |
7c673cae FG |
223 | static constexpr unsigned BDEV_WAL = 0; |
224 | static constexpr unsigned BDEV_DB = 1; | |
225 | static constexpr unsigned BDEV_SLOW = 2; | |
11fdf7f2 TL |
226 | static constexpr unsigned BDEV_NEWWAL = 3; |
227 | static constexpr unsigned BDEV_NEWDB = 4; | |
7c673cae FG |
228 | |
229 | enum { | |
230 | WRITER_UNKNOWN, | |
231 | WRITER_WAL, | |
232 | WRITER_SST, | |
233 | }; | |
234 | ||
235 | struct File : public RefCountedObject { | |
236 | MEMPOOL_CLASS_HELPERS(); | |
237 | ||
238 | bluefs_fnode_t fnode; | |
239 | int refs; | |
240 | uint64_t dirty_seq; | |
241 | bool locked; | |
242 | bool deleted; | |
522d829b | 243 | bool is_dirty; |
7c673cae FG |
244 | boost::intrusive::list_member_hook<> dirty_item; |
245 | ||
246 | std::atomic_int num_readers, num_writers; | |
247 | std::atomic_int num_reading; | |
248 | ||
9f95a23c | 249 | void* vselector_hint = nullptr; |
20effc67 TL |
250 | /* lock protects fnode and other the parts that can be modified during read & write operations. |
251 | Does not protect values that are fixed | |
252 | Does not need to be taken when doing one-time operations: | |
253 | _replay, device_migrate_to_existing, device_migrate_to_new */ | |
254 | ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock"); | |
9f95a23c TL |
255 | |
256 | private: | |
257 | FRIEND_MAKE_REF(File); | |
7c673cae | 258 | File() |
9f95a23c | 259 | : |
7c673cae FG |
260 | refs(0), |
261 | dirty_seq(0), | |
262 | locked(false), | |
263 | deleted(false), | |
522d829b | 264 | is_dirty(false), |
7c673cae FG |
265 | num_readers(0), |
266 | num_writers(0), | |
9f95a23c TL |
267 | num_reading(0), |
268 | vselector_hint(nullptr) | |
7c673cae FG |
269 | {} |
270 | ~File() override { | |
11fdf7f2 TL |
271 | ceph_assert(num_readers.load() == 0); |
272 | ceph_assert(num_writers.load() == 0); | |
273 | ceph_assert(num_reading.load() == 0); | |
274 | ceph_assert(!locked); | |
7c673cae | 275 | } |
7c673cae | 276 | }; |
9f95a23c | 277 | using FileRef = ceph::ref_t<File>; |
7c673cae FG |
278 | |
279 | typedef boost::intrusive::list< | |
280 | File, | |
281 | boost::intrusive::member_hook< | |
282 | File, | |
283 | boost::intrusive::list_member_hook<>, | |
284 | &File::dirty_item> > dirty_file_list_t; | |
285 | ||
286 | struct Dir : public RefCountedObject { | |
287 | MEMPOOL_CLASS_HELPERS(); | |
288 | ||
b3b6e05e | 289 | mempool::bluefs::map<std::string, FileRef, std::less<>> file_map; |
7c673cae | 290 | |
9f95a23c TL |
291 | private: |
292 | FRIEND_MAKE_REF(Dir); | |
293 | Dir() = default; | |
7c673cae | 294 | }; |
9f95a23c | 295 | using DirRef = ceph::ref_t<Dir>; |
7c673cae FG |
296 | |
297 | struct FileWriter { | |
298 | MEMPOOL_CLASS_HELPERS(); | |
299 | ||
300 | FileRef file; | |
9f95a23c | 301 | uint64_t pos = 0; ///< start offset for buffer |
f67539c2 TL |
302 | private: |
303 | ceph::buffer::list buffer; ///< new data to write (at end of file) | |
304 | ceph::buffer::list tail_block; ///< existing partial block at end of file, if any | |
305 | public: | |
306 | unsigned get_buffer_length() const { | |
307 | return buffer.length(); | |
308 | } | |
309 | ceph::bufferlist flush_buffer( | |
310 | CephContext* cct, | |
311 | const bool partial, | |
312 | const unsigned length, | |
313 | const bluefs_super_t& super); | |
314 | ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only | |
315 | public: | |
7c673cae | 316 | int writer_type = 0; ///< WRITER_* |
11fdf7f2 | 317 | int write_hint = WRITE_LIFE_NOT_SET; |
7c673cae | 318 | |
11fdf7f2 | 319 | ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock"); |
7c673cae | 320 | std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev |
11fdf7f2 | 321 | std::array<bool, MAX_BDEV> dirty_devs; |
7c673cae FG |
322 | |
323 | FileWriter(FileRef f) | |
9f95a23c | 324 | : file(std::move(f)), |
f67539c2 TL |
325 | buffer_appender(buffer.get_page_aligned_appender( |
326 | g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { | |
7c673cae FG |
327 | ++file->num_writers; |
328 | iocv.fill(nullptr); | |
11fdf7f2 | 329 | dirty_devs.fill(false); |
9f95a23c | 330 | if (file->fnode.ino == 1) { |
11fdf7f2 TL |
331 | write_hint = WRITE_LIFE_MEDIUM; |
332 | } | |
7c673cae FG |
333 | } |
334 | // NOTE: caller must call BlueFS::close_writer() | |
335 | ~FileWriter() { | |
336 | --file->num_writers; | |
337 | } | |
338 | ||
339 | // note: BlueRocksEnv uses this append exclusively, so it's safe | |
20effc67 | 340 | // to use buffer_appender exclusively here (e.g., its notion of |
7c673cae FG |
341 | // offset will remain accurate). |
342 | void append(const char *buf, size_t len) { | |
f67539c2 | 343 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 344 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
345 | buffer_appender.append(buf, len); |
346 | } | |
347 | ||
20effc67 TL |
348 | void append(const std::byte *buf, size_t len) { |
349 | // allow callers to use byte type instead of char* as we simply pass byte array | |
350 | append((const char*)buf, len); | |
351 | } | |
352 | ||
7c673cae | 353 | // note: used internally only, for ino 1 or 0. |
cd265ab1 | 354 | void append(ceph::buffer::list& bl) { |
f67539c2 | 355 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 356 | ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
357 | buffer.claim_append(bl); |
358 | } | |
359 | ||
f67539c2 TL |
360 | void append_zero(size_t len) { |
361 | uint64_t l0 = get_buffer_length(); | |
362 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); | |
363 | buffer_appender.append_zero(len); | |
364 | } | |
365 | ||
7c673cae | 366 | uint64_t get_effective_write_pos() { |
7c673cae FG |
367 | return pos + buffer.length(); |
368 | } | |
369 | }; | |
370 | ||
371 | struct FileReaderBuffer { | |
372 | MEMPOOL_CLASS_HELPERS(); | |
373 | ||
9f95a23c | 374 | uint64_t bl_off = 0; ///< prefetch buffer logical offset |
f67539c2 | 375 | ceph::buffer::list bl; ///< prefetch buffer |
9f95a23c | 376 | uint64_t pos = 0; ///< current logical offset |
7c673cae FG |
377 | uint64_t max_prefetch; ///< max allowed prefetch |
378 | ||
379 | explicit FileReaderBuffer(uint64_t mpf) | |
9f95a23c | 380 | : max_prefetch(mpf) {} |
7c673cae | 381 | |
9f95a23c | 382 | uint64_t get_buf_end() const { |
7c673cae FG |
383 | return bl_off + bl.length(); |
384 | } | |
9f95a23c | 385 | uint64_t get_buf_remaining(uint64_t p) const { |
7c673cae FG |
386 | if (p >= bl_off && p < bl_off + bl.length()) |
387 | return bl_off + bl.length() - p; | |
388 | return 0; | |
389 | } | |
390 | ||
391 | void skip(size_t n) { | |
392 | pos += n; | |
393 | } | |
f67539c2 TL |
394 | |
395 | // For the sake of simplicity, we invalidate completed rather than | |
396 | // for the provided extent | |
397 | void invalidate_cache(uint64_t offset, uint64_t length) { | |
398 | if (offset >= bl_off && offset < get_buf_end()) { | |
399 | bl.clear(); | |
400 | bl_off = 0; | |
401 | } | |
7c673cae FG |
402 | } |
403 | }; | |
404 | ||
405 | struct FileReader { | |
406 | MEMPOOL_CLASS_HELPERS(); | |
407 | ||
408 | FileRef file; | |
409 | FileReaderBuffer buf; | |
410 | bool random; | |
411 | bool ignore_eof; ///< used when reading our log file | |
412 | ||
494da23a TL |
413 | ceph::shared_mutex lock { |
414 | ceph::make_shared_mutex(std::string(), false, false, false) | |
415 | }; | |
416 | ||
417 | ||
7c673cae FG |
418 | FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) |
419 | : file(f), | |
420 | buf(mpf), | |
421 | random(rand), | |
422 | ignore_eof(ie) { | |
423 | ++file->num_readers; | |
424 | } | |
425 | ~FileReader() { | |
426 | --file->num_readers; | |
427 | } | |
428 | }; | |
429 | ||
430 | struct FileLock { | |
431 | MEMPOOL_CLASS_HELPERS(); | |
432 | ||
433 | FileRef file; | |
9f95a23c | 434 | explicit FileLock(FileRef f) : file(std::move(f)) {} |
7c673cae FG |
435 | }; |
436 | ||
437 | private: | |
7c673cae FG |
438 | PerfCounters *logger = nullptr; |
439 | ||
11fdf7f2 TL |
440 | uint64_t max_bytes[MAX_BDEV] = {0}; |
441 | uint64_t max_bytes_pcounters[MAX_BDEV] = { | |
442 | l_bluefs_max_bytes_wal, | |
443 | l_bluefs_max_bytes_db, | |
444 | l_bluefs_max_bytes_slow, | |
39ae355f TL |
445 | l_bluefs_max_bytes_wal, |
446 | l_bluefs_max_bytes_db, | |
11fdf7f2 TL |
447 | }; |
448 | ||
f51cf556 TL |
449 | ceph::timespan max_alloc_lat[MAX_BDEV] = {ceph::make_timespan(0)}; |
450 | ||
7c673cae | 451 | // cache |
20effc67 TL |
452 | struct { |
453 | ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock"); | |
454 | mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map; ///< dirname -> Dir | |
455 | mempool::bluefs::unordered_map<uint64_t, FileRef> file_map; ///< ino -> File | |
456 | } nodes; | |
7c673cae FG |
457 | |
458 | bluefs_super_t super; ///< latest superblock (as last written) | |
459 | uint64_t ino_last = 0; ///< last assigned ino (this one is in use) | |
7c673cae | 460 | |
20effc67 TL |
461 | struct { |
462 | ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock"); | |
463 | uint64_t seq_live = 1; //seq that log is currently writing to; mirrors dirty.seq_live | |
464 | FileWriter *writer = 0; | |
465 | bluefs_transaction_t t; | |
466 | } log; | |
467 | ||
468 | struct { | |
469 | ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock"); | |
470 | uint64_t seq_stable = 0; //seq that is now stable on disk | |
471 | uint64_t seq_live = 1; //seq that is ongoing and dirty files will be written to | |
472 | // map of dirty files, files of same dirty_seq are grouped into list. | |
473 | std::map<uint64_t, dirty_file_list_t> files; | |
474 | std::vector<interval_set<uint64_t>> pending_release; ///< extents to release | |
475 | // TODO: it should be examined what makes pending_release immune to | |
476 | // eras in a way similar to dirty_files. Hints: | |
477 | // 1) we have actually only 2 eras: log_seq and log_seq+1 | |
478 | // 2) we usually do not remove extents from files. And when we do, we force log-syncing. | |
479 | } dirty; | |
480 | ||
481 | ceph::condition_variable log_cond; ///< used for state control between log flush / log compaction | |
482 | std::atomic<bool> log_is_compacting{false}; ///< signals that bluefs log is already ongoing compaction | |
483 | std::atomic<bool> log_forbidden_to_expand{false}; ///< used to signal that async compaction is in state | |
484 | /// that prohibits expansion of bluefs log | |
7c673cae FG |
485 | /* |
486 | * There are up to 3 block devices: | |
487 | * | |
488 | * BDEV_DB db/ - the primary db device | |
489 | * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL | |
490 | * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills | |
491 | */ | |
f67539c2 TL |
492 | std::vector<BlockDevice*> bdev; ///< block devices we can use |
493 | std::vector<IOContext*> ioc; ///< IOContexts for bdevs | |
494 | std::vector<uint64_t> block_reserved; ///< starting reserve extent per device | |
495 | std::vector<Allocator*> alloc; ///< allocators for bdevs | |
496 | std::vector<uint64_t> alloc_size; ///< alloc size for each device | |
20effc67 | 497 | |
f67539c2 | 498 | //std::vector<interval_set<uint64_t>> block_unused_too_granular; |
7c673cae | 499 | |
11fdf7f2 TL |
500 | BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev |
501 | ||
9f95a23c | 502 | std::unique_ptr<BlueFSVolumeSelector> vselector; |
11fdf7f2 | 503 | |
f67539c2 TL |
504 | bluefs_shared_alloc_context_t* shared_alloc = nullptr; |
505 | unsigned shared_alloc_id = unsigned(-1); | |
506 | inline bool is_shared_alloc(unsigned id) const { | |
507 | return id == shared_alloc_id; | |
508 | } | |
39ae355f | 509 | std::atomic<int64_t> cooldown_deadline = 0; |
f67539c2 | 510 | |
eafe8130 TL |
511 | class SocketHook; |
512 | SocketHook* asok_hook = nullptr; | |
cd265ab1 TL |
513 | // used to trigger zeros into read (debug / verify) |
514 | std::atomic<uint64_t> inject_read_zeros{0}; | |
eafe8130 | 515 | |
7c673cae FG |
516 | void _init_logger(); |
517 | void _shutdown_logger(); | |
518 | void _update_logger_stats(); | |
519 | ||
520 | void _init_alloc(); | |
521 | void _stop_alloc(); | |
522 | ||
39ae355f TL |
523 | ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros |
524 | void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0); | |
f67539c2 TL |
525 | |
526 | uint64_t _get_used(unsigned id) const; | |
527 | uint64_t _get_total(unsigned id) const; | |
528 | ||
7c673cae FG |
529 | |
530 | FileRef _get_file(uint64_t ino); | |
20effc67 | 531 | void _drop_link_D(FileRef f); |
7c673cae | 532 | |
1911f103 TL |
533 | unsigned _get_slow_device_id() { |
534 | return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; | |
535 | } | |
eafe8130 | 536 | const char* get_device_name(unsigned id); |
f51cf556 TL |
537 | |
538 | typedef std::function<void(const bluefs_extent_t)> update_fn_t; | |
539 | void _update_allocate_stats(uint8_t id, const ceph::timespan& d); | |
7c673cae | 540 | int _allocate(uint8_t bdev, uint64_t len, |
39ae355f TL |
541 | uint64_t alloc_unit, |
542 | bluefs_fnode_t* node, | |
f51cf556 | 543 | update_fn_t cb = nullptr, |
39ae355f TL |
544 | size_t alloc_attempts = 0, |
545 | bool permit_dev_fallback = true); | |
11fdf7f2 | 546 | |
522d829b | 547 | /* signal replay log to include h->file in nearest log flush */ |
20effc67 TL |
548 | int _signal_dirty_to_log_D(FileWriter *h); |
549 | int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length); | |
550 | int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered); | |
551 | int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr); | |
552 | uint64_t _flush_special(FileWriter *h); | |
7c673cae | 553 | |
11fdf7f2 | 554 | #ifdef HAVE_LIBAIO |
f67539c2 | 555 | void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls); |
20effc67 | 556 | void _wait_for_aio(FileWriter *h); // safe to call without a lock |
11fdf7f2 | 557 | #endif |
7c673cae | 558 | |
20effc67 | 559 | int64_t _maybe_extend_log(); |
f51cf556 | 560 | void _extend_log(uint64_t amount); |
20effc67 TL |
561 | uint64_t _log_advance_seq(); |
562 | void _consume_dirty(uint64_t seq); | |
563 | void _clear_dirty_set_stable_D(uint64_t seq_stable); | |
564 | void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release); | |
565 | ||
f51cf556 TL |
566 | void _flush_and_sync_log_core(); |
567 | int _flush_and_sync_log_jump_D(uint64_t jump_to); | |
20effc67 TL |
568 | int _flush_and_sync_log_LD(uint64_t want_seq = 0); |
569 | ||
39ae355f TL |
570 | uint64_t _estimate_transaction_size(bluefs_transaction_t* t); |
571 | uint64_t _make_initial_transaction(uint64_t start_seq, | |
572 | bluefs_fnode_t& fnode, | |
573 | uint64_t expected_final_size, | |
574 | bufferlist* out); | |
20effc67 TL |
575 | uint64_t _estimate_log_size_N(); |
576 | bool _should_start_compact_log_L_N(); | |
11fdf7f2 TL |
577 | |
578 | enum { | |
579 | REMOVE_DB = 1, | |
580 | REMOVE_WAL = 2, | |
581 | RENAME_SLOW2DB = 4, | |
582 | RENAME_DB2SLOW = 8, | |
583 | }; | |
39ae355f TL |
584 | void _compact_log_dump_metadata_NF(uint64_t start_seq, |
585 | bluefs_transaction_t *t, | |
586 | int flags, | |
587 | uint64_t capture_before_seq); | |
11fdf7f2 | 588 | |
20effc67 TL |
589 | void _compact_log_sync_LNF_LD(); |
590 | void _compact_log_async_LD_LNF_D(); | |
591 | ||
39ae355f | 592 | void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback, |
9f95a23c TL |
593 | int super_dev, |
594 | int log_dev, | |
595 | int new_log_dev, | |
596 | int flags, | |
597 | std::optional<bluefs_layout_t> layout); | |
7c673cae FG |
598 | |
599 | //void _aio_finish(void *priv); | |
600 | ||
39ae355f | 601 | void _flush_bdev(FileWriter *h, bool check_mutex_locked = true); |
20effc67 TL |
602 | void _flush_bdev(); // this is safe to call without a lock |
603 | void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock | |
7c673cae FG |
604 | |
605 | int _preallocate(FileRef f, uint64_t off, uint64_t len); | |
606 | int _truncate(FileWriter *h, uint64_t off); | |
607 | ||
adb31ebb | 608 | int64_t _read( |
7c673cae | 609 | FileReader *h, ///< [in] read from here |
7c673cae FG |
610 | uint64_t offset, ///< [in] offset |
611 | size_t len, ///< [in] this many bytes | |
f67539c2 | 612 | ceph::buffer::list *outbl, ///< [out] optional: reference the result here |
7c673cae | 613 | char *out); ///< [out] optional: or copy it here |
adb31ebb | 614 | int64_t _read_random( |
7c673cae FG |
615 | FileReader *h, ///< [in] read from here |
616 | uint64_t offset, ///< [in] offset | |
9f95a23c | 617 | uint64_t len, ///< [in] this many bytes |
7c673cae FG |
618 | char *out); ///< [out] optional: or copy it here |
619 | ||
7c673cae | 620 | int _open_super(); |
11fdf7f2 | 621 | int _write_super(int dev); |
20effc67 TL |
622 | int _check_allocations(const bluefs_fnode_t& fnode, |
623 | boost::dynamic_bitset<uint64_t>* used_blocks, | |
624 | bool is_alloc, //true when allocating, false when deallocating | |
625 | const char* op_name); | |
9f95a23c TL |
626 | int _verify_alloc_granularity( |
627 | __u8 id, uint64_t offset, uint64_t length, | |
39ae355f | 628 | uint64_t alloc_unit, |
9f95a23c | 629 | const char *op); |
11fdf7f2 | 630 | int _replay(bool noop, bool to_stdout = false); ///< replay journal |
7c673cae FG |
631 | |
632 | FileWriter *_create_writer(FileRef f); | |
20effc67 | 633 | void _drain_writer(FileWriter *h); |
7c673cae FG |
634 | void _close_writer(FileWriter *h); |
635 | ||
636 | // always put the super in the second 4k block. FIXME should this be | |
637 | // block size independent? | |
/// byte offset of the superblock: the start of the second 4k block
unsigned get_super_offset() {
  constexpr unsigned second_4k_block = 4096;
  return second_4k_block;
}
/// size of the superblock in bytes: one 4k block
unsigned get_super_length() {
  constexpr unsigned one_4k_block = 4096;
  return one_4k_block;
}
20effc67 TL |
644 | void _maybe_check_vselector_LNF() { |
645 | if (cct->_conf->bluefs_check_volume_selector_often) { | |
646 | _check_vselector_LNF(); | |
647 | } | |
648 | } | |
7c673cae FG |
649 | public: |
650 | BlueFS(CephContext* cct); | |
651 | ~BlueFS(); | |
652 | ||
653 | // the super is always stored on bdev 0 | |
9f95a23c | 654 | int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout); |
7c673cae | 655 | int mount(); |
9f95a23c | 656 | int maybe_verify_layout(const bluefs_layout_t& layout) const; |
1911f103 | 657 | void umount(bool avoid_compact = false); |
9f95a23c | 658 | int prepare_new_device(int id, const bluefs_layout_t& layout); |
11fdf7f2 TL |
659 | |
660 | int log_dump(); | |
7c673cae | 661 | |
f67539c2 TL |
662 | void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id); |
663 | void get_devices(std::set<std::string> *ls); | |
eafe8130 TL |
664 | uint64_t get_alloc_size(int id) { |
665 | return alloc_size[id]; | |
666 | } | |
7c673cae FG |
667 | int fsck(); |
668 | ||
11fdf7f2 TL |
669 | int device_migrate_to_new( |
670 | CephContext *cct, | |
f67539c2 | 671 | const std::set<int>& devs_source, |
9f95a23c TL |
672 | int dev_target, |
673 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
674 | int device_migrate_to_existing( |
675 | CephContext *cct, | |
f67539c2 | 676 | const std::set<int>& devs_source, |
9f95a23c TL |
677 | int dev_target, |
678 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
679 | |
680 | uint64_t get_used(); | |
7c673cae FG |
681 | uint64_t get_total(unsigned id); |
682 | uint64_t get_free(unsigned id); | |
f67539c2 TL |
683 | uint64_t get_used(unsigned id); |
684 | void dump_perf_counters(ceph::Formatter *f); | |
7c673cae | 685 | |
f67539c2 | 686 | void dump_block_extents(std::ostream& out); |
3efd9988 | 687 | |
7c673cae | 688 | /// get current extents that we own for given block device |
1e59de90 TL |
689 | void foreach_block_extents( |
690 | unsigned id, | |
691 | std::function<void(uint64_t, uint32_t)> cb); | |
7c673cae FG |
692 | |
693 | int open_for_write( | |
b3b6e05e TL |
694 | std::string_view dir, |
695 | std::string_view file, | |
7c673cae FG |
696 | FileWriter **h, |
697 | bool overwrite); | |
698 | ||
699 | int open_for_read( | |
b3b6e05e TL |
700 | std::string_view dir, |
701 | std::string_view file, | |
7c673cae FG |
702 | FileReader **h, |
703 | bool random = false); | |
704 | ||
20effc67 TL |
705 | // data added after last fsync() is lost |
706 | void close_writer(FileWriter *h); | |
7c673cae | 707 | |
b3b6e05e TL |
708 | int rename(std::string_view old_dir, std::string_view old_file, |
709 | std::string_view new_dir, std::string_view new_file); | |
7c673cae | 710 | |
b3b6e05e | 711 | int readdir(std::string_view dirname, std::vector<std::string> *ls); |
7c673cae | 712 | |
b3b6e05e TL |
713 | int unlink(std::string_view dirname, std::string_view filename); |
714 | int mkdir(std::string_view dirname); | |
715 | int rmdir(std::string_view dirname); | |
d2e6a577 | 716 | bool wal_is_rotational(); |
1d09f67e | 717 | bool db_is_rotational(); |
7c673cae | 718 | |
b3b6e05e TL |
719 | bool dir_exists(std::string_view dirname); |
720 | int stat(std::string_view dirname, std::string_view filename, | |
7c673cae FG |
721 | uint64_t *size, utime_t *mtime); |
722 | ||
b3b6e05e | 723 | int lock_file(std::string_view dirname, std::string_view filename, FileLock **p); |
7c673cae FG |
724 | int unlock_file(FileLock *l); |
725 | ||
7c673cae FG |
726 | void compact_log(); |
727 | ||
728 | /// sync any uncommitted state to disk | |
1911f103 | 729 | void sync_metadata(bool avoid_compact); |
7c673cae | 730 | |
9f95a23c TL |
731 | void set_volume_selector(BlueFSVolumeSelector* s) { |
732 | vselector.reset(s); | |
733 | } | |
f67539c2 | 734 | void dump_volume_selector(std::ostream& sout) { |
9f95a23c TL |
735 | vselector->dump(sout); |
736 | } | |
737 | void get_vselector_paths(const std::string& base, | |
738 | BlueFSVolumeSelector::paths& res) const { | |
739 | return vselector->get_paths(base, res); | |
740 | } | |
741 | ||
f67539c2 | 742 | int add_block_device(unsigned bdev, const std::string& path, bool trim, |
f67539c2 | 743 | bluefs_shared_alloc_context_t* _shared_alloc = nullptr); |
7c673cae | 744 | bool bdev_support_label(unsigned id); |
f67539c2 | 745 | uint64_t get_block_device_size(unsigned bdev) const; |
7c673cae | 746 | |
11fdf7f2 TL |
747 | // handler for discard event |
748 | void handle_discard(unsigned dev, interval_set<uint64_t>& to_release); | |
749 | ||
20effc67 | 750 | void flush(FileWriter *h, bool force = false); |
cd265ab1 | 751 | |
20effc67 TL |
752 | void append_try_flush(FileWriter *h, const char* buf, size_t len); |
753 | void flush_range(FileWriter *h, uint64_t offset, uint64_t length); | |
754 | int fsync(FileWriter *h); | |
f67539c2 TL |
755 | int64_t read(FileReader *h, uint64_t offset, size_t len, |
756 | ceph::buffer::list *outbl, char *out) { | |
7c673cae FG |
757 | // no need to hold the global lock here; we only touch h and |
758 | // h->file, and read vs write or delete is already protected (via | |
759 | // atomics and asserts). | |
f67539c2 | 760 | return _read(h, offset, len, outbl, out); |
7c673cae | 761 | } |
adb31ebb | 762 | int64_t read_random(FileReader *h, uint64_t offset, size_t len, |
7c673cae FG |
763 | char *out) { |
764 | // no need to hold the global lock here; we only touch h and | |
765 | // h->file, and read vs write or delete is already protected (via | |
766 | // atomics and asserts). | |
767 | return _read_random(h, offset, len, out); | |
768 | } | |
20effc67 TL |
769 | void invalidate_cache(FileRef f, uint64_t offset, uint64_t len); |
770 | int preallocate(FileRef f, uint64_t offset, uint64_t len); | |
771 | int truncate(FileWriter *h, uint64_t offset); | |
7c673cae | 772 | |
f67539c2 TL |
773 | size_t probe_alloc_avail(int dev, uint64_t alloc_size); |
774 | ||
9f95a23c | 775 | /// test purpose methods |
9f95a23c TL |
776 | const PerfCounters* get_perf_counters() const { |
777 | return logger; | |
778 | } | |
522d829b TL |
779 | uint64_t debug_get_dirty_seq(FileWriter *h); |
780 | bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev); | |
cd265ab1 TL |
781 | |
782 | private: | |
783 | // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...) | |
784 | // They are used for checking if read values are all 0, and reread if so. | |
20effc67 | 785 | int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len, |
cd265ab1 | 786 | ceph::buffer::list *pbl, IOContext *ioc, bool buffered); |
20effc67 TL |
787 | int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered); |
788 | ||
789 | int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len, | |
790 | ceph::buffer::list* pbl, IOContext* ioc, bool buffered); | |
791 | int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered); | |
792 | ||
793 | /// test and compact log, if necessary | |
794 | void _maybe_compact_log_LNF_NF_LD_D(); | |
795 | int _do_replay_recovery_read(FileReader *log, | |
796 | size_t log_pos, | |
797 | size_t read_offset, | |
798 | size_t read_len, | |
799 | bufferlist* bl); | |
800 | void _check_vselector_LNF(); | |
9f95a23c TL |
801 | }; |
802 | ||
803 | class OriginalVolumeSelector : public BlueFSVolumeSelector { | |
804 | uint64_t wal_total; | |
805 | uint64_t db_total; | |
806 | uint64_t slow_total; | |
807 | ||
808 | public: | |
809 | OriginalVolumeSelector( | |
810 | uint64_t _wal_total, | |
811 | uint64_t _db_total, | |
812 | uint64_t _slow_total) | |
813 | : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {} | |
814 | ||
f6b5b4d7 | 815 | void* get_hint_for_log() const override; |
b3b6e05e | 816 | void* get_hint_by_dir(std::string_view dirname) const override; |
9f95a23c | 817 | |
f51cf556 | 818 | void add_usage(void* hint, const bluefs_extent_t& extent) override { |
9f95a23c TL |
819 | // do nothing |
820 | return; | |
821 | } | |
f51cf556 | 822 | void sub_usage(void* hint, const bluefs_extent_t& extent) override { |
9f95a23c TL |
823 | // do nothing |
824 | return; | |
825 | } | |
f51cf556 | 826 | void add_usage(void*, uint64_t, bool) override { |
9f95a23c TL |
827 | // do nothing |
828 | return; | |
829 | } | |
f51cf556 | 830 | void sub_usage(void*, uint64_t, bool) override { |
9f95a23c TL |
831 | // do nothing |
832 | return; | |
833 | } | |
834 | ||
835 | uint8_t select_prefer_bdev(void* hint) override; | |
836 | void get_paths(const std::string& base, paths& res) const override; | |
f67539c2 TL |
837 | void dump(std::ostream& sout) override; |
838 | }; | |
839 | ||
840 | class FitToFastVolumeSelector : public OriginalVolumeSelector { | |
841 | public: | |
842 | FitToFastVolumeSelector( | |
843 | uint64_t _wal_total, | |
844 | uint64_t _db_total, | |
845 | uint64_t _slow_total) | |
846 | : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {} | |
847 | ||
848 | void get_paths(const std::string& base, paths& res) const override; | |
7c673cae | 849 | }; |
20effc67 TL |
850 | /** |
851 | * Directed graph of locks. | |
852 | * Vertices - Locks. Edges (directed) - locking progression. | |
853 | * Edge A->B exists if the last taken lock was A and the next taken lock is B. | |
854 | * | |
855 | * Row represents last lock taken. | |
856 | * Column represents next lock taken. | |
857 | * | |
858 | * > | W | L | N | D | F | |
859 | * -------------|---|---|---|---|--- | |
860 | * FileWriter W | | > | > | > | > | |
861 | * log L | | > | > | > | |
862 | * nodes N | | > | > | |
863 | * dirty D | | | > | |
864 | * File F | | |
865 | * | |
866 | * Claim: Deadlock is possible IFF graph contains cycles. | |
867 | */ | |
7c673cae | 868 | #endif |