// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
#define CEPH_OS_BLUESTORE_BLUEFS_H

#include <atomic>
#include <mutex>
#include <limits>

#include "bluefs_types.h"
#include "blk/BlockDevice.h"

#include "common/RefCountedObj.h"
#include "common/ceph_context.h"
#include "global/global_context.h"
#include "include/common_fwd.h"

#include "boost/intrusive/list.hpp"
#include "boost/dynamic_bitset.hpp"

class Allocator;

enum {
  l_bluefs_first = 732600,
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,
  l_bluefs_main_alloc_unit,
  l_bluefs_db_alloc_unit,
  l_bluefs_wal_alloc_unit,
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_disk_bytes_wal,
  l_bluefs_read_random_disk_bytes_db,
  l_bluefs_read_random_disk_bytes_slow,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_disk_count,
  l_bluefs_read_disk_bytes,
  l_bluefs_read_disk_bytes_wal,
  l_bluefs_read_disk_bytes_db,
  l_bluefs_read_disk_bytes_slow,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,
  l_bluefs_read_zeros_candidate,
  l_bluefs_read_zeros_errors,

  l_bluefs_last,
};

class BlueFSVolumeSelector {
public:
  typedef std::vector<std::pair<std::string, uint64_t>> paths;

  virtual ~BlueFSVolumeSelector() {
  }
  virtual void* get_hint_for_log() const = 0;
  virtual void* get_hint_by_dir(std::string_view dirname) const = 0;

  virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
  virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
  virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
  virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
  virtual uint8_t select_prefer_bdev(void* hint) = 0;
  virtual void get_paths(const std::string& base, paths& res) const = 0;
  virtual void dump(std::ostream& sout) = 0;

  /* used for sanity checking of vselector */
  virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; }
  virtual bool compare(BlueFSVolumeSelector* other) { return true; }
};
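
// A minimal sketch of how BlueFS drives a volume selector; the variable
// names below are illustrative, but the call pattern is the contract that
// implementations of this interface are expected to honor:
//
//   BlueFSVolumeSelector* vs = ...;               // e.g. OriginalVolumeSelector
//   void* hint = vs->get_hint_by_dir("db.wal");   // directory -> placement hint
//   uint8_t id = vs->select_prefer_bdev(hint);    // BDEV_WAL / BDEV_DB / BDEV_SLOW
//   vs->add_usage(hint, fnode);                   // after space is allocated
//   vs->sub_usage(hint, fnode);                   // before space is released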

struct bluefs_shared_alloc_context_t {
  bool need_init = false;
  Allocator* a = nullptr;

  std::atomic<uint64_t> bluefs_used = 0;

  void set(Allocator* _a) {
    a = _a;
    need_init = true;
    bluefs_used = 0;
  }
  void reset() {
    a = nullptr;
  }
};
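
// Lifecycle sketch, assuming a caller that owns the allocator and shares it
// with BlueFS (as BlueStore does for its main device); names are illustrative:
//
//   bluefs_shared_alloc_context_t shared;
//   shared.set(main_allocator);      // need_init asks BlueFS to account
//                                    // already-used space on first mount
//   fs->add_block_device(BlueFS::BDEV_SLOW, path, false, reserved, &shared);
//   ...
//   shared.reset();                  // detach before destroying the allocator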

class BlueFS {
public:
  CephContext* cct;
  static constexpr unsigned MAX_BDEV = 5;
  static constexpr unsigned BDEV_WAL = 0;
  static constexpr unsigned BDEV_DB = 1;
  static constexpr unsigned BDEV_SLOW = 2;
  static constexpr unsigned BDEV_NEWWAL = 3;
  static constexpr unsigned BDEV_NEWDB = 4;

  enum {
    WRITER_UNKNOWN,
    WRITER_WAL,
    WRITER_SST,
  };

  struct File : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    bluefs_fnode_t fnode;
    int refs;
    uint64_t dirty_seq;
    bool locked;
    bool deleted;
    bool is_dirty;
    boost::intrusive::list_member_hook<> dirty_item;

    std::atomic_int num_readers, num_writers;
    std::atomic_int num_reading;

    void* vselector_hint = nullptr;
    /* lock protects fnode and the other parts that can be modified during
       read & write operations. It does not protect values that are fixed,
       and it does not need to be taken for one-time operations:
       _replay, device_migrate_to_existing, device_migrate_to_new */
    ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock");

  private:
    FRIEND_MAKE_REF(File);
    File()
      :
      refs(0),
      dirty_seq(0),
      locked(false),
      deleted(false),
      is_dirty(false),
      num_readers(0),
      num_writers(0),
      num_reading(0),
      vselector_hint(nullptr)
    {}
    ~File() override {
      ceph_assert(num_readers.load() == 0);
      ceph_assert(num_writers.load() == 0);
      ceph_assert(num_reading.load() == 0);
      ceph_assert(!locked);
    }
  };
  using FileRef = ceph::ref_t<File>;

  typedef boost::intrusive::list<
    File,
    boost::intrusive::member_hook<
      File,
      boost::intrusive::list_member_hook<>,
      &File::dirty_item> > dirty_file_list_t;
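
  // Note (illustrative): the list is intrusive, so linking a File into a
  // dirty list reuses File::dirty_item and allocates nothing, e.g.:
  //
  //   dirty_file_list_t ls;
  //   ls.push_back(*file);    // the File must stay alive while linked;
  //                           // BlueFS tracks dirty files per seq in
  //                           // dirty.files below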

  struct Dir : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    mempool::bluefs::map<std::string, FileRef, std::less<>> file_map;

  private:
    FRIEND_MAKE_REF(Dir);
    Dir() = default;
  };
  using DirRef = ceph::ref_t<Dir>;

  struct FileWriter {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    uint64_t pos = 0;               ///< start offset for buffer
  private:
    ceph::buffer::list buffer;      ///< new data to write (at end of file)
    ceph::buffer::list tail_block;  ///< existing partial block at end of file, if any
  public:
    unsigned get_buffer_length() const {
      return buffer.length();
    }
    ceph::bufferlist flush_buffer(
      CephContext* cct,
      const bool partial,
      const unsigned length,
      const bluefs_super_t& super);
    ceph::buffer::list::page_aligned_appender buffer_appender;  ///< for const char* only
  public:
    int writer_type = 0;    ///< WRITER_*
    int write_hint = WRITE_LIFE_NOT_SET;

    ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
    std::array<IOContext*, MAX_BDEV> iocv;  ///< for each bdev
    std::array<bool, MAX_BDEV> dirty_devs;

    FileWriter(FileRef f)
      : file(std::move(f)),
        buffer_appender(buffer.get_page_aligned_appender(
          g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
      ++file->num_writers;
      iocv.fill(nullptr);
      dirty_devs.fill(false);
      if (file->fnode.ino == 1) {
        write_hint = WRITE_LIFE_MEDIUM;
      }
    }
    // NOTE: caller must call BlueFS::close_writer()
    ~FileWriter() {
      --file->num_writers;
    }

    // note: BlueRocksEnv uses this append exclusively, so it's safe
    // to use buffer_appender exclusively here (e.g., its notion of
    // offset will remain accurate).
    void append(const char *buf, size_t len) {
      uint64_t l0 = get_buffer_length();
      ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
      buffer_appender.append(buf, len);
    }

    void append(const std::byte *buf, size_t len) {
      // allow callers to pass std::byte instead of char*; we simply
      // forward the bytes unchanged
      append((const char*)buf, len);
    }

    // note: used internally only, for ino 1 or 0.
    void append(ceph::buffer::list& bl) {
      uint64_t l0 = get_buffer_length();
      ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max());
      buffer.claim_append(bl);
    }

    void append_zero(size_t len) {
      uint64_t l0 = get_buffer_length();
      ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
      buffer_appender.append_zero(len);
    }

    uint64_t get_effective_write_pos() {
      return pos + buffer.length();
    }
  };
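
  // Buffering invariant, with a small worked example (values illustrative):
  // `pos` is the file offset at which `buffer` begins, so with
  //
  //   FileWriter* h = ...;    // pos == 4096, buffer empty
  //   h->append("abcd", 4);   // buffer now holds 4 bytes
  //
  // get_effective_write_pos() returns 4096 + 4 == 4100; a flush writes the
  // buffered bytes at offset pos and advances pos past them.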

  struct FileReaderBuffer {
    MEMPOOL_CLASS_HELPERS();

    uint64_t bl_off = 0;    ///< prefetch buffer logical offset
    ceph::buffer::list bl;  ///< prefetch buffer
    uint64_t pos = 0;       ///< current logical offset
    uint64_t max_prefetch;  ///< max allowed prefetch

    explicit FileReaderBuffer(uint64_t mpf)
      : max_prefetch(mpf) {}

    uint64_t get_buf_end() const {
      return bl_off + bl.length();
    }
    uint64_t get_buf_remaining(uint64_t p) const {
      if (p >= bl_off && p < bl_off + bl.length())
        return bl_off + bl.length() - p;
      return 0;
    }

    void skip(size_t n) {
      pos += n;
    }

    // For the sake of simplicity we invalidate the whole buffer rather
    // than just the provided extent
    void invalidate_cache(uint64_t offset, uint64_t length) {
      if (offset >= bl_off && offset < get_buf_end()) {
        bl.clear();
        bl_off = 0;
      }
    }
  };
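
  // Worked example (values illustrative): with bl_off == 8192 and
  // bl.length() == 4096 the buffer covers [8192, 12288), so
  // get_buf_remaining(10000) == 12288 - 10000 == 2288, while any offset
  // outside that window (e.g. 12288 itself) returns 0 and forces a disk read.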

  struct FileReader {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    FileReaderBuffer buf;
    bool random;
    bool ignore_eof;    ///< used when reading our log file

    ceph::shared_mutex lock {
      ceph::make_shared_mutex(std::string(), false, false, false)
    };


    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
      : file(f),
        buf(mpf),
        random(rand),
        ignore_eof(ie) {
      ++file->num_readers;
    }
    ~FileReader() {
      --file->num_readers;
    }
  };

  struct FileLock {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    explicit FileLock(FileRef f) : file(std::move(f)) {}
  };

private:
  PerfCounters *logger = nullptr;

  uint64_t max_bytes[MAX_BDEV] = {0};
  uint64_t max_bytes_pcounters[MAX_BDEV] = {
    l_bluefs_max_bytes_wal,
    l_bluefs_max_bytes_db,
    l_bluefs_max_bytes_slow,
  };

  // cache
  struct {
    ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock");
    mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map;  ///< dirname -> Dir
    mempool::bluefs::unordered_map<uint64_t, FileRef> file_map;      ///< ino -> File
  } nodes;

  bluefs_super_t super;   ///< latest superblock (as last written)
  uint64_t ino_last = 0;  ///< last assigned ino (this one is in use)

  struct {
    ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock");
    uint64_t seq_live = 1;  // seq that the log is currently writing to; mirrors dirty.seq_live
    FileWriter *writer = nullptr;
    bluefs_transaction_t t;
  } log;

  struct {
    ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock");
    uint64_t seq_stable = 0;  // seq that is now stable on disk
    uint64_t seq_live = 1;    // seq that is ongoing; dirty files are attached to it
    // map of dirty files; files with the same dirty_seq are grouped into a list
    std::map<uint64_t, dirty_file_list_t> files;
    std::vector<interval_set<uint64_t>> pending_release;  ///< extents to release
    // TODO: it should be examined what makes pending_release immune to
    // eras in a way similar to dirty_files. Hints:
    // 1) we actually have only 2 eras: log_seq and log_seq+1
    // 2) we usually do not remove extents from files, and when we do,
    //    we force log-syncing.
  } dirty;
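
  // Era example (illustrative): while dirty.seq_live == 5, newly dirtied
  // files are linked into dirty.files[5]; once the log entry for seq 5 is
  // durable, seq_stable advances to 5, dirty.files[5] is drained, and new
  // dirty files accumulate under seq_live == 6.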

  ceph::condition_variable log_cond;                 ///< used for state control between log flush / log compaction
  std::atomic<bool> log_is_compacting{false};        ///< signals that a bluefs log compaction is already in progress
  std::atomic<bool> log_forbidden_to_expand{false};  ///< used to signal that async compaction is in a state
                                                     ///  that prohibits expansion of the bluefs log
  /*
   * There are up to 3 block devices:
   *
   *  BDEV_DB   db/      - the primary db device
   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
   */
  std::vector<BlockDevice*> bdev;        ///< block devices we can use
  std::vector<IOContext*> ioc;           ///< IOContexts for bdevs
  std::vector<uint64_t> block_reserved;  ///< starting reserve extent per device
  std::vector<Allocator*> alloc;         ///< allocators for bdevs
  std::vector<uint64_t> alloc_size;      ///< alloc size for each device

  //std::vector<interval_set<uint64_t>> block_unused_too_granular;

  BlockDevice::aio_callback_t discard_cb[3];  // discard callbacks for the three main devices (WAL/DB/SLOW)

  std::unique_ptr<BlueFSVolumeSelector> vselector;

  bluefs_shared_alloc_context_t* shared_alloc = nullptr;
  unsigned shared_alloc_id = unsigned(-1);
  inline bool is_shared_alloc(unsigned id) const {
    return id == shared_alloc_id;
  }

  class SocketHook;
  SocketHook* asok_hook = nullptr;
  // used to inject zeros into reads (debug / verify)
  std::atomic<uint64_t> inject_read_zeros{0};

  void _init_logger();
  void _shutdown_logger();
  void _update_logger_stats();

  void _init_alloc();
  void _stop_alloc();

  void _pad_bl(ceph::buffer::list& bl);  ///< pad ceph::buffer::list to block size w/ zeros

  uint64_t _get_used(unsigned id) const;
  uint64_t _get_total(unsigned id) const;


  FileRef _get_file(uint64_t ino);
  void _drop_link_D(FileRef f);

  unsigned _get_slow_device_id() {
    return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
  }
  const char* get_device_name(unsigned id);
  int _allocate(uint8_t bdev, uint64_t len,
                bluefs_fnode_t* node);
  int _allocate_without_fallback(uint8_t id, uint64_t len,
                                 PExtentVector* extents);

  /* signal the replay log to include h->file in the next log flush */
  int _signal_dirty_to_log_D(FileWriter *h);
  int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length);
  int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered);
  int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr);
  uint64_t _flush_special(FileWriter *h);
  int _fsync(FileWriter *h);

#ifdef HAVE_LIBAIO
  void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls);
  void _wait_for_aio(FileWriter *h);  // safe to call without a lock
#endif

  int64_t _maybe_extend_log();
  void _extend_log();
  uint64_t _log_advance_seq();
  void _consume_dirty(uint64_t seq);
  void _clear_dirty_set_stable_D(uint64_t seq_stable);
  void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release);

  void _flush_and_sync_log_core(int64_t available_runway);
  int _flush_and_sync_log_jump_D(uint64_t jump_to,
                                 int64_t available_runway);
  int _flush_and_sync_log_LD(uint64_t want_seq = 0);

  uint64_t _estimate_log_size_N();
  bool _should_start_compact_log_L_N();

  enum {
    REMOVE_DB = 1,
    REMOVE_WAL = 2,
    RENAME_SLOW2DB = 4,
    RENAME_DB2SLOW = 8,
  };
  void _compact_log_dump_metadata_NF(bluefs_transaction_t *t,
                                     int flags);
  void _compact_log_async_dump_metadata_NF(bluefs_transaction_t *t,
                                           uint64_t capture_before_seq);

  void _compact_log_sync_LNF_LD();
  void _compact_log_async_LD_LNF_D();

  void _rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
                                           int super_dev,
                                           int log_dev,
                                           int new_log_dev,
                                           int flags,
                                           std::optional<bluefs_layout_t> layout);

  //void _aio_finish(void *priv);

  void _flush_bdev(FileWriter *h);
  void _flush_bdev();  // this is safe to call without a lock
  void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock

  int _preallocate(FileRef f, uint64_t off, uint64_t len);
  int _truncate(FileWriter *h, uint64_t off);

  int64_t _read(
    FileReader *h,              ///< [in] read from here
    uint64_t offset,            ///< [in] offset
    size_t len,                 ///< [in] this many bytes
    ceph::buffer::list *outbl,  ///< [out] optional: reference the result here
    char *out);                 ///< [out] optional: or copy it here
  int64_t _read_random(
    FileReader *h,    ///< [in] read from here
    uint64_t offset,  ///< [in] offset
    uint64_t len,     ///< [in] this many bytes
    char *out);       ///< [out] copy the result here

  int _open_super();
  int _write_super(int dev);
  int _check_allocations(const bluefs_fnode_t& fnode,
                         boost::dynamic_bitset<uint64_t>* used_blocks,
                         bool is_alloc,  // true when allocating, false when deallocating
                         const char* op_name);
  int _verify_alloc_granularity(
    __u8 id, uint64_t offset, uint64_t length,
    const char *op);
  int _replay(bool noop, bool to_stdout = false);  ///< replay journal

  FileWriter *_create_writer(FileRef f);
  void _drain_writer(FileWriter *h);
  void _close_writer(FileWriter *h);

  // always put the super in the second 4k block.  FIXME should this be
  // block size independent?
  unsigned get_super_offset() {
    return 4096;
  }
  unsigned get_super_length() {
    return 4096;
  }
  void _maybe_check_vselector_LNF() {
    if (cct->_conf->bluefs_check_volume_selector_often) {
      _check_vselector_LNF();
    }
  }
public:
  BlueFS(CephContext* cct);
  ~BlueFS();

  // the super is always stored on bdev 0
  int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
  int mount();
  int maybe_verify_layout(const bluefs_layout_t& layout) const;
  void umount(bool avoid_compact = false);
  int prepare_new_device(int id, const bluefs_layout_t& layout);

  int log_dump();

  void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id);
  void get_devices(std::set<std::string> *ls);
  uint64_t get_alloc_size(int id) {
    return alloc_size[id];
  }
  int fsck();

  int device_migrate_to_new(
    CephContext *cct,
    const std::set<int>& devs_source,
    int dev_target,
    const bluefs_layout_t& layout);
  int device_migrate_to_existing(
    CephContext *cct,
    const std::set<int>& devs_source,
    int dev_target,
    const bluefs_layout_t& layout);

  uint64_t get_used();
  uint64_t get_total(unsigned id);
  uint64_t get_free(unsigned id);
  uint64_t get_used(unsigned id);
  void dump_perf_counters(ceph::Formatter *f);

  void dump_block_extents(std::ostream& out);

  /// get current extents that we own for given block device
  int get_block_extents(unsigned id, interval_set<uint64_t> *extents);

  int open_for_write(
    std::string_view dir,
    std::string_view file,
    FileWriter **h,
    bool overwrite);

  int open_for_read(
    std::string_view dir,
    std::string_view file,
    FileReader **h,
    bool random = false);

  // data added after last fsync() is lost
  void close_writer(FileWriter *h);
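
  // Typical write/read round trip, as a hedged sketch (error handling and
  // mount/umount elided; `fs` is a pointer to an already-mounted BlueFS):
  //
  //   fs->mkdir("dir");
  //   BlueFS::FileWriter *w;
  //   fs->open_for_write("dir", "file", &w, false);
  //   w->append("hello", 5);
  //   fs->fsync(w);                       // data is durable once this returns
  //   fs->close_writer(w);
  //
  //   BlueFS::FileReader *r;
  //   fs->open_for_read("dir", "file", &r);
  //   ceph::buffer::list bl;
  //   fs->read(r, 0, 5, &bl, nullptr);    // bl now holds "hello"
  //   delete r;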

  int rename(std::string_view old_dir, std::string_view old_file,
             std::string_view new_dir, std::string_view new_file);

  int readdir(std::string_view dirname, std::vector<std::string> *ls);

  int unlink(std::string_view dirname, std::string_view filename);
  int mkdir(std::string_view dirname);
  int rmdir(std::string_view dirname);
  bool wal_is_rotational();
  bool db_is_rotational();

  bool dir_exists(std::string_view dirname);
  int stat(std::string_view dirname, std::string_view filename,
           uint64_t *size, utime_t *mtime);

  int lock_file(std::string_view dirname, std::string_view filename, FileLock **p);
  int unlock_file(FileLock *l);

  void compact_log();

  /// sync any uncommitted state to disk
  void sync_metadata(bool avoid_compact);

  void set_volume_selector(BlueFSVolumeSelector* s) {
    vselector.reset(s);
  }
  void dump_volume_selector(std::ostream& sout) {
    vselector->dump(sout);
  }
  void get_vselector_paths(const std::string& base,
                           BlueFSVolumeSelector::paths& res) const {
    return vselector->get_paths(base, res);
  }

  int add_block_device(unsigned bdev, const std::string& path, bool trim,
                       uint64_t reserved,
                       bluefs_shared_alloc_context_t* _shared_alloc = nullptr);
  bool bdev_support_label(unsigned id);
  uint64_t get_block_device_size(unsigned bdev) const;

  // handler for discard event
  void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);

  void flush(FileWriter *h, bool force = false);

  void append_try_flush(FileWriter *h, const char* buf, size_t len);
  void flush_range(FileWriter *h, uint64_t offset, uint64_t length);
  int fsync(FileWriter *h);
  int64_t read(FileReader *h, uint64_t offset, size_t len,
               ceph::buffer::list *outbl, char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read(h, offset, len, outbl, out);
  }
  int64_t read_random(FileReader *h, uint64_t offset, size_t len,
                      char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read_random(h, offset, len, out);
  }
  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len);
  int preallocate(FileRef f, uint64_t offset, uint64_t len);
  int truncate(FileWriter *h, uint64_t offset);

  size_t probe_alloc_avail(int dev, uint64_t alloc_size);

  /// test purpose methods
  const PerfCounters* get_perf_counters() const {
    return logger;
  }
  uint64_t debug_get_dirty_seq(FileWriter *h);
  bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev);

private:
  // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...).
  // They check whether the bytes read are all zeros and, if so, reread them.
  int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
                      ceph::buffer::list *pbl, IOContext *ioc, bool buffered);
  int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered);

  int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
                 ceph::buffer::list* pbl, IOContext* ioc, bool buffered);
  int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered);

  /// test and compact log, if necessary
  void _maybe_compact_log_LNF_NF_LD_D();
  int _do_replay_recovery_read(FileReader *log,
                               size_t log_pos,
                               size_t read_offset,
                               size_t read_len,
                               bufferlist* bl);
  void _check_vselector_LNF();
};

class OriginalVolumeSelector : public BlueFSVolumeSelector {
  uint64_t wal_total;
  uint64_t db_total;
  uint64_t slow_total;

public:
  OriginalVolumeSelector(
    uint64_t _wal_total,
    uint64_t _db_total,
    uint64_t _slow_total)
    : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}

  void* get_hint_for_log() const override;
  void* get_hint_by_dir(std::string_view dirname) const override;

  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
    // do nothing
  }
  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
    // do nothing
  }
  void add_usage(void* hint, uint64_t fsize) override {
    // do nothing
  }
  void sub_usage(void* hint, uint64_t fsize) override {
    // do nothing
  }

  uint8_t select_prefer_bdev(void* hint) override;
  void get_paths(const std::string& base, paths& res) const override;
  void dump(std::ostream& sout) override;
};

class FitToFastVolumeSelector : public OriginalVolumeSelector {
public:
  FitToFastVolumeSelector(
    uint64_t _wal_total,
    uint64_t _db_total,
    uint64_t _slow_total)
    : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {}

  void get_paths(const std::string& base, paths& res) const override;
};
/**
 * Directed graph of locks.
 * Vertices are locks; edges (directed) give the locking progression.
 * An edge A->B exists if the last lock taken was A and the next lock
 * taken is B.
 *
 * Row represents the last lock taken.
 * Column represents the next lock taken.
 *
 *     >        | W | L | N | D | F
 * -------------|---|---|---|---|---
 * FileWriter W |   | > | > | > | >
 * log        L |   |   | > | > | >
 * nodes      N |   |   |   | > | >
 * dirty      D |   |   |   |   | >
 * File       F |
 *
 * Claim: deadlock is possible IFF the graph contains cycles.
 */
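
// Acquisition-order example consistent with the graph above (illustrative;
// the actual call sites live in BlueFS.cc). Locks may only be taken
// left-to-right along the edges, e.g. W -> L -> D:
//
//   std::lock_guard wl(writer->lock);   // W: FileWriter
//   std::lock_guard ll(log.lock);       // L: log
//   std::lock_guard dl(dirty.lock);     // D: dirty
//
// Taking log.lock after dirty.lock would add a D->L edge, create a cycle,
// and thus make deadlock possible per the claim above.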
#endif