// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
#define CEPH_OS_BLUESTORE_BLUEFS_H

#include <atomic>
#include <mutex>
#include <limits>

#include "bluefs_types.h"
#include "blk/BlockDevice.h"

#include "common/RefCountedObj.h"
#include "common/ceph_context.h"
#include "global/global_context.h"
#include "include/common_fwd.h"

#include "boost/intrusive/list.hpp"
#include "boost/dynamic_bitset.hpp"

class Allocator;

enum {
  l_bluefs_first = 732600,
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,
  l_bluefs_main_alloc_unit,
  l_bluefs_db_alloc_unit,
  l_bluefs_wal_alloc_unit,
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_disk_bytes_wal,
  l_bluefs_read_random_disk_bytes_db,
  l_bluefs_read_random_disk_bytes_slow,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_disk_count,
  l_bluefs_read_disk_bytes,
  l_bluefs_read_disk_bytes_wal,
  l_bluefs_read_disk_bytes_db,
  l_bluefs_read_disk_bytes_slow,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,
  l_bluefs_compaction_lat,
  l_bluefs_compaction_lock_lat,
  l_bluefs_alloc_shared_dev_fallbacks,
  l_bluefs_alloc_shared_size_fallbacks,
  l_bluefs_read_zeros_candidate,
  l_bluefs_read_zeros_errors,
  l_bluefs_last,
};

class BlueFSVolumeSelector {
public:
  typedef std::vector<std::pair<std::string, uint64_t>> paths;

  virtual ~BlueFSVolumeSelector() {
  }
  virtual void* get_hint_for_log() const = 0;
  virtual void* get_hint_by_dir(std::string_view dirname) const = 0;

  virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
  virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
  virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
  virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
  virtual uint8_t select_prefer_bdev(void* hint) = 0;
  virtual void get_paths(const std::string& base, paths& res) const = 0;
  virtual void dump(std::ostream& sout) = 0;

  /* used for sanity checking of vselector */
  virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; }
  virtual bool compare(BlueFSVolumeSelector* other) { return true; }
};
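
// Illustrative sketch (not part of the original header): a minimal concrete
// selector, to show how the interface above fits together.  The name
// `TrivialVolumeSelector` is hypothetical; the real in-tree implementations
// are OriginalVolumeSelector and FitToFastVolumeSelector near the end of
// this file.
//
//   class TrivialVolumeSelector : public BlueFSVolumeSelector {
//   public:
//     void* get_hint_for_log() const override { return nullptr; }
//     void* get_hint_by_dir(std::string_view) const override { return nullptr; }
//     void add_usage(void*, const bluefs_fnode_t&) override {}
//     void sub_usage(void*, const bluefs_fnode_t&) override {}
//     void add_usage(void*, uint64_t) override {}
//     void sub_usage(void*, uint64_t) override {}
//     uint8_t select_prefer_bdev(void*) override { return 1; } // BlueFS::BDEV_DB
//     void get_paths(const std::string& base, paths& res) const override {
//       res.emplace_back(base, 0);
//     }
//     void dump(std::ostream& sout) override { sout << "trivial" << std::endl; }
//   };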

struct bluefs_shared_alloc_context_t {
  bool need_init = false;
  Allocator* a = nullptr;
  uint64_t alloc_unit = 0;

  std::atomic<uint64_t> bluefs_used = 0;

  void set(Allocator* _a, uint64_t _au) {
    a = _a;
    alloc_unit = _au;
    need_init = true;
    bluefs_used = 0;
  }
  void reset() {
    a = nullptr;
    alloc_unit = 0;
  }
};
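
// Usage sketch (illustrative, not from the original header): the owner of a
// device shared between BlueFS and BlueStore fills this context and passes
// it to BlueFS::add_block_device() below; BlueFS then accounts its own
// consumption in bluefs_used.  `shared_allocator` and `au` are hypothetical
// names.
//
//   bluefs_shared_alloc_context_t ctx;
//   ctx.set(shared_allocator, au);  // before the device is handed to BlueFS
//   ...
//   ctx.reset();                    // once the device is torn down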

class BlueFS {
public:
  CephContext* cct;
  static constexpr unsigned MAX_BDEV = 5;
  static constexpr unsigned BDEV_WAL = 0;
  static constexpr unsigned BDEV_DB = 1;
  static constexpr unsigned BDEV_SLOW = 2;
  static constexpr unsigned BDEV_NEWWAL = 3;
  static constexpr unsigned BDEV_NEWDB = 4;

  enum {
    WRITER_UNKNOWN,
    WRITER_WAL,
    WRITER_SST,
  };

  struct File : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    bluefs_fnode_t fnode;
    int refs;
    uint64_t dirty_seq;
    bool locked;
    bool deleted;
    bool is_dirty;
    boost::intrusive::list_member_hook<> dirty_item;

    std::atomic_int num_readers, num_writers;
    std::atomic_int num_reading;

    void* vselector_hint = nullptr;
    /* lock protects fnode and the other parts that can be modified during
       read & write operations.  It does not protect values that are fixed,
       and it does not need to be taken for one-time operations:
       _replay, device_migrate_to_existing, device_migrate_to_new */
    ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock");

  private:
    FRIEND_MAKE_REF(File);
    File()
      :
        refs(0),
        dirty_seq(0),
        locked(false),
        deleted(false),
        is_dirty(false),
        num_readers(0),
        num_writers(0),
        num_reading(0),
        vselector_hint(nullptr)
    {}
    ~File() override {
      ceph_assert(num_readers.load() == 0);
      ceph_assert(num_writers.load() == 0);
      ceph_assert(num_reading.load() == 0);
      ceph_assert(!locked);
    }
  };
  using FileRef = ceph::ref_t<File>;

  typedef boost::intrusive::list<
    File,
    boost::intrusive::member_hook<
      File,
      boost::intrusive::list_member_hook<>,
      &File::dirty_item> > dirty_file_list_t;

  struct Dir : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    mempool::bluefs::map<std::string, FileRef, std::less<>> file_map;

  private:
    FRIEND_MAKE_REF(Dir);
    Dir() = default;
  };
  using DirRef = ceph::ref_t<Dir>;

  struct FileWriter {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    uint64_t pos = 0;  ///< start offset for buffer
  private:
    ceph::buffer::list buffer;      ///< new data to write (at end of file)
    ceph::buffer::list tail_block;  ///< existing partial block at end of file, if any
  public:
    unsigned get_buffer_length() const {
      return buffer.length();
    }
    ceph::bufferlist flush_buffer(
      CephContext* cct,
      const bool partial,
      const unsigned length,
      const bluefs_super_t& super);
    ceph::buffer::list::page_aligned_appender buffer_appender;  ///< for const char* only
  public:
    int writer_type = 0;  ///< WRITER_*
    int write_hint = WRITE_LIFE_NOT_SET;

    ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
    std::array<IOContext*,MAX_BDEV> iocv;  ///< for each bdev
    std::array<bool, MAX_BDEV> dirty_devs;

    FileWriter(FileRef f)
      : file(std::move(f)),
        buffer_appender(buffer.get_page_aligned_appender(
                          g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
      ++file->num_writers;
      iocv.fill(nullptr);
      dirty_devs.fill(false);
      if (file->fnode.ino == 1) {
        write_hint = WRITE_LIFE_MEDIUM;
      }
    }
    // NOTE: caller must call BlueFS::close_writer()
    ~FileWriter() {
      --file->num_writers;
    }

    // note: BlueRocksEnv uses this append exclusively, so it's safe
    // to use buffer_appender exclusively here (e.g., its notion of
    // offset will remain accurate).
    void append(const char *buf, size_t len) {
      uint64_t l0 = get_buffer_length();
      ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
      buffer_appender.append(buf, len);
    }
    void append(const std::byte *buf, size_t len) {
      // allow callers to use the std::byte type instead of char*, as we
      // simply pass the byte array through
      append((const char*)buf, len);
    }

    // note: used internally only, for ino 1 or 0.
    void append(ceph::buffer::list& bl) {
      uint64_t l0 = get_buffer_length();
      ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max());
      buffer.claim_append(bl);
    }

    void append_zero(size_t len) {
      uint64_t l0 = get_buffer_length();
      ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
      buffer_appender.append_zero(len);
    }

    uint64_t get_effective_write_pos() {
      return pos + buffer.length();
    }
  };
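
  // Worked example for FileWriter above (illustrative, not from the
  // original header): a writer whose last flush ended at pos == 4096 and
  // which has buffered 100 bytes via append() reports
  // get_effective_write_pos() == 4196.  The asserts in
  // append()/append_zero() only guard the buffered length against
  // overflowing `unsigned`; they do not flush.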

  struct FileReaderBuffer {
    MEMPOOL_CLASS_HELPERS();

    uint64_t bl_off = 0;    ///< prefetch buffer logical offset
    ceph::buffer::list bl;  ///< prefetch buffer
    uint64_t pos = 0;       ///< current logical offset
    uint64_t max_prefetch;  ///< max allowed prefetch

    explicit FileReaderBuffer(uint64_t mpf)
      : max_prefetch(mpf) {}

    uint64_t get_buf_end() const {
      return bl_off + bl.length();
    }
    uint64_t get_buf_remaining(uint64_t p) const {
      if (p >= bl_off && p < bl_off + bl.length())
        return bl_off + bl.length() - p;
      return 0;
    }

    void skip(size_t n) {
      pos += n;
    }
    // For the sake of simplicity we invalidate the whole prefetch buffer
    // rather than just the part covered by the provided extent.
    void invalidate_cache(uint64_t offset, uint64_t length) {
      if (offset >= bl_off && offset < get_buf_end()) {
        bl.clear();
        bl_off = 0;
      }
    }
  };
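
  // Worked example for FileReaderBuffer above (illustrative, not from the
  // original header): with bl_off == 4096 and bl.length() == 8192 the
  // buffer covers [4096, 12288), so get_buf_remaining(6000) == 6288, and
  // invalidate_cache(8000, 16) drops the whole buffer because offset 8000
  // falls inside that range.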

  struct FileReader {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    FileReaderBuffer buf;
    bool random;
    bool ignore_eof;  ///< used when reading our log file

    ceph::shared_mutex lock {
      ceph::make_shared_mutex(std::string(), false, false, false)
    };


    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
      : file(f),
        buf(mpf),
        random(rand),
        ignore_eof(ie) {
      ++file->num_readers;
    }
    ~FileReader() {
      --file->num_readers;
    }
  };

  struct FileLock {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    explicit FileLock(FileRef f) : file(std::move(f)) {}
  };

private:
  PerfCounters *logger = nullptr;

  uint64_t max_bytes[MAX_BDEV] = {0};
  uint64_t max_bytes_pcounters[MAX_BDEV] = {
    l_bluefs_max_bytes_wal,
    l_bluefs_max_bytes_db,
    l_bluefs_max_bytes_slow,
    l_bluefs_max_bytes_wal,
    l_bluefs_max_bytes_db,
  };

  // cache
  struct {
    ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock");
    mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map;  ///< dirname -> Dir
    mempool::bluefs::unordered_map<uint64_t, FileRef> file_map;      ///< ino -> File
  } nodes;

  bluefs_super_t super;   ///< latest superblock (as last written)
  uint64_t ino_last = 0;  ///< last assigned ino (this one is in use)

  struct {
    ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock");
    uint64_t seq_live = 1;  // seq that log is currently writing to; mirrors dirty.seq_live
    FileWriter *writer = 0;
    bluefs_transaction_t t;
  } log;

  struct {
    ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock");
    uint64_t seq_stable = 0;  // seq that is now stable on disk
    uint64_t seq_live = 1;    // seq that is ongoing and dirty files will be written to
    // map of dirty files; files with the same dirty_seq are grouped into a list.
    std::map<uint64_t, dirty_file_list_t> files;
    std::vector<interval_set<uint64_t>> pending_release;  ///< extents to release
    // TODO: examine what makes pending_release immune to eras in a way
    // similar to dirty_files.  Hints:
    // 1) we actually have only 2 eras: log_seq and log_seq+1
    // 2) we usually do not remove extents from files, and when we do, we
    //    force log-syncing.
  } dirty;
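
  // Illustrative timeline (a sketch inferred from the fields above, not
  // from the original header): while dirty.seq_live == N, freshly dirtied
  // files are queued under dirty.files[N]; a log flush advances seq_live
  // to N+1 and writes out the queued metadata; once that log write is
  // durable, seq_stable becomes N and the entries queued under seq <= N
  // are dropped.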

  ceph::condition_variable log_cond;                ///< used for state control between log flush / log compaction
  std::atomic<bool> log_is_compacting{false};       ///< signals that compaction of the bluefs log is already ongoing
  std::atomic<bool> log_forbidden_to_expand{false}; ///< used to signal that async compaction is in a state
                                                    ///  that prohibits expansion of the bluefs log
  /*
   * There are up to 3 block devices:
   *
   *  BDEV_DB   db/      - the primary db device
   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
   */
  std::vector<BlockDevice*> bdev;        ///< block devices we can use
  std::vector<IOContext*> ioc;           ///< IOContexts for bdevs
  std::vector<uint64_t> block_reserved;  ///< starting reserve extent per device
  std::vector<Allocator*> alloc;         ///< allocators for bdevs
  std::vector<uint64_t> alloc_size;      ///< alloc size for each device

  //std::vector<interval_set<uint64_t>> block_unused_too_granular;

  BlockDevice::aio_callback_t discard_cb[3];  // discard callbacks for each dev

  std::unique_ptr<BlueFSVolumeSelector> vselector;

  bluefs_shared_alloc_context_t* shared_alloc = nullptr;
  unsigned shared_alloc_id = unsigned(-1);
  inline bool is_shared_alloc(unsigned id) const {
    return id == shared_alloc_id;
  }
  std::atomic<int64_t> cooldown_deadline = 0;

  class SocketHook;
  SocketHook* asok_hook = nullptr;
  // used to trigger zeros into read (debug / verify)
  std::atomic<uint64_t> inject_read_zeros{0};

  void _init_logger();
  void _shutdown_logger();
  void _update_logger_stats();

  void _init_alloc();
  void _stop_alloc();

  ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros
  void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0);

  uint64_t _get_used(unsigned id) const;
  uint64_t _get_total(unsigned id) const;


  FileRef _get_file(uint64_t ino);
  void _drop_link_D(FileRef f);

  unsigned _get_slow_device_id() {
    return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
  }
  const char* get_device_name(unsigned id);
  int _allocate(uint8_t bdev, uint64_t len,
                uint64_t alloc_unit,
                bluefs_fnode_t* node,
                size_t alloc_attempts = 0,
                bool permit_dev_fallback = true);

  /* signal replay log to include h->file in nearest log flush */
  int _signal_dirty_to_log_D(FileWriter *h);
  int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length);
  int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered);
  int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr);
  uint64_t _flush_special(FileWriter *h);
  int _fsync(FileWriter *h);

#ifdef HAVE_LIBAIO
  void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls);
  void _wait_for_aio(FileWriter *h);  // safe to call without a lock
#endif

  int64_t _maybe_extend_log();
  void _extend_log();
  uint64_t _log_advance_seq();
  void _consume_dirty(uint64_t seq);
  void _clear_dirty_set_stable_D(uint64_t seq_stable);
  void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release);

  void _flush_and_sync_log_core(int64_t available_runway);
  int _flush_and_sync_log_jump_D(uint64_t jump_to,
                                 int64_t available_runway);
  int _flush_and_sync_log_LD(uint64_t want_seq = 0);

  uint64_t _estimate_transaction_size(bluefs_transaction_t* t);
  uint64_t _make_initial_transaction(uint64_t start_seq,
                                     bluefs_fnode_t& fnode,
                                     uint64_t expected_final_size,
                                     bufferlist* out);
  uint64_t _estimate_log_size_N();
  bool _should_start_compact_log_L_N();

  enum {
    REMOVE_DB = 1,
    REMOVE_WAL = 2,
    RENAME_SLOW2DB = 4,
    RENAME_DB2SLOW = 8,
  };
  void _compact_log_dump_metadata_NF(uint64_t start_seq,
                                     bluefs_transaction_t *t,
                                     int flags,
                                     uint64_t capture_before_seq);

  void _compact_log_sync_LNF_LD();
  void _compact_log_async_LD_LNF_D();

  void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
                                           int super_dev,
                                           int log_dev,
                                           int new_log_dev,
                                           int flags,
                                           std::optional<bluefs_layout_t> layout);

  //void _aio_finish(void *priv);

  void _flush_bdev(FileWriter *h, bool check_mutex_locked = true);
  void _flush_bdev();  // this is safe to call without a lock
  void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock

  int _preallocate(FileRef f, uint64_t off, uint64_t len);
  int _truncate(FileWriter *h, uint64_t off);

  int64_t _read(
    FileReader *h,              ///< [in] read from here
    uint64_t offset,            ///< [in] offset
    size_t len,                 ///< [in] this many bytes
    ceph::buffer::list *outbl,  ///< [out] optional: reference the result here
    char *out);                 ///< [out] optional: or copy it here
  int64_t _read_random(
    FileReader *h,              ///< [in] read from here
    uint64_t offset,            ///< [in] offset
    uint64_t len,               ///< [in] this many bytes
    char *out);                 ///< [out] optional: or copy it here

  int _open_super();
  int _write_super(int dev);
  int _check_allocations(const bluefs_fnode_t& fnode,
                         boost::dynamic_bitset<uint64_t>* used_blocks,
                         bool is_alloc,  // true when allocating, false when deallocating
                         const char* op_name);
  int _verify_alloc_granularity(
    __u8 id, uint64_t offset, uint64_t length,
    uint64_t alloc_unit,
    const char *op);
  int _replay(bool noop, bool to_stdout = false);  ///< replay journal

  FileWriter *_create_writer(FileRef f);
  void _drain_writer(FileWriter *h);
  void _close_writer(FileWriter *h);

  // always put the super in the second 4k block.  FIXME should this be
  // block size independent?
  unsigned get_super_offset() {
    return 4096;
  }
  unsigned get_super_length() {
    return 4096;
  }
  void _maybe_check_vselector_LNF() {
    if (cct->_conf->bluefs_check_volume_selector_often) {
      _check_vselector_LNF();
    }
  }
public:
  BlueFS(CephContext* cct);
  ~BlueFS();

  // the super is always stored on bdev 0
  int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
  int mount();
  int maybe_verify_layout(const bluefs_layout_t& layout) const;
  void umount(bool avoid_compact = false);
  int prepare_new_device(int id, const bluefs_layout_t& layout);

  int log_dump();

  void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id);
  void get_devices(std::set<std::string> *ls);
  uint64_t get_alloc_size(int id) {
    return alloc_size[id];
  }
  int fsck();

  int device_migrate_to_new(
    CephContext *cct,
    const std::set<int>& devs_source,
    int dev_target,
    const bluefs_layout_t& layout);
  int device_migrate_to_existing(
    CephContext *cct,
    const std::set<int>& devs_source,
    int dev_target,
    const bluefs_layout_t& layout);

  uint64_t get_used();
  uint64_t get_total(unsigned id);
  uint64_t get_free(unsigned id);
  uint64_t get_used(unsigned id);
  void dump_perf_counters(ceph::Formatter *f);

  void dump_block_extents(std::ostream& out);

  /// get current extents that we own for given block device
  int get_block_extents(unsigned id, interval_set<uint64_t> *extents);

  int open_for_write(
    std::string_view dir,
    std::string_view file,
    FileWriter **h,
    bool overwrite);

  int open_for_read(
    std::string_view dir,
    std::string_view file,
    FileReader **h,
    bool random = false);

  // data added after last fsync() is lost
  void close_writer(FileWriter *h);

  int rename(std::string_view old_dir, std::string_view old_file,
             std::string_view new_dir, std::string_view new_file);

  int readdir(std::string_view dirname, std::vector<std::string> *ls);

  int unlink(std::string_view dirname, std::string_view filename);
  int mkdir(std::string_view dirname);
  int rmdir(std::string_view dirname);
  bool wal_is_rotational();
  bool db_is_rotational();

  bool dir_exists(std::string_view dirname);
  int stat(std::string_view dirname, std::string_view filename,
           uint64_t *size, utime_t *mtime);

  int lock_file(std::string_view dirname, std::string_view filename, FileLock **p);
  int unlock_file(FileLock *l);

  void compact_log();

  /// sync any uncommitted state to disk
  void sync_metadata(bool avoid_compact);

  void set_volume_selector(BlueFSVolumeSelector* s) {
    vselector.reset(s);
  }
  void dump_volume_selector(std::ostream& sout) {
    vselector->dump(sout);
  }
  void get_vselector_paths(const std::string& base,
                           BlueFSVolumeSelector::paths& res) const {
    return vselector->get_paths(base, res);
  }

  int add_block_device(unsigned bdev, const std::string& path, bool trim,
                       uint64_t reserved,
                       bluefs_shared_alloc_context_t* _shared_alloc = nullptr);
  bool bdev_support_label(unsigned id);
  uint64_t get_block_device_size(unsigned bdev) const;

  // handler for discard event
  void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);

  void flush(FileWriter *h, bool force = false);

  void append_try_flush(FileWriter *h, const char* buf, size_t len);
  void flush_range(FileWriter *h, uint64_t offset, uint64_t length);
  int fsync(FileWriter *h);
  int64_t read(FileReader *h, uint64_t offset, size_t len,
               ceph::buffer::list *outbl, char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read(h, offset, len, outbl, out);
  }
  int64_t read_random(FileReader *h, uint64_t offset, size_t len,
                      char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read_random(h, offset, len, out);
  }
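
  // End-to-end usage sketch (illustrative, not from the original header;
  // error handling elided, and the "db" dir and file name are made up):
  //
  //   BlueFS::FileWriter *w = nullptr;
  //   if (fs.open_for_write("db", "example.sst", &w, false) == 0) {
  //     w->append("payload", 7);  // buffered in memory
  //     fs.fsync(w);              // data + metadata durable up to here
  //     fs.close_writer(w);
  //   }
  //   BlueFS::FileReader *r = nullptr;
  //   if (fs.open_for_read("db", "example.sst", &r) == 0) {
  //     ceph::buffer::list bl;
  //     fs.read(r, 0, 7, &bl, nullptr);  // fills bl with "payload"
  //     delete r;                        // readers are plain heap objects
  //   }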
  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len);
  int preallocate(FileRef f, uint64_t offset, uint64_t len);
  int truncate(FileWriter *h, uint64_t offset);

  size_t probe_alloc_avail(int dev, uint64_t alloc_size);

  /// test purpose methods
  const PerfCounters* get_perf_counters() const {
    return logger;
  }
  uint64_t debug_get_dirty_seq(FileWriter *h);
  bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev);

private:
  // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...).
  // They are used to check whether the read values are all zeros, and to
  // reread from disk if so.
  int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
                      ceph::buffer::list *pbl, IOContext *ioc, bool buffered);
  int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered);

  int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
                 ceph::buffer::list* pbl, IOContext* ioc, bool buffered);
  int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered);

  /// test and compact log, if necessary
  void _maybe_compact_log_LNF_NF_LD_D();
  int _do_replay_recovery_read(FileReader *log,
                               size_t log_pos,
                               size_t read_offset,
                               size_t read_len,
                               bufferlist* bl);
  void _check_vselector_LNF();
};

class OriginalVolumeSelector : public BlueFSVolumeSelector {
  uint64_t wal_total;
  uint64_t db_total;
  uint64_t slow_total;

public:
  OriginalVolumeSelector(
    uint64_t _wal_total,
    uint64_t _db_total,
    uint64_t _slow_total)
    : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}

  void* get_hint_for_log() const override;
  void* get_hint_by_dir(std::string_view dirname) const override;

  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
    // do nothing
    return;
  }
  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
    // do nothing
    return;
  }
  void add_usage(void* hint, uint64_t fsize) override {
    // do nothing
    return;
  }
  void sub_usage(void* hint, uint64_t fsize) override {
    // do nothing
    return;
  }

  uint8_t select_prefer_bdev(void* hint) override;
  void get_paths(const std::string& base, paths& res) const override;
  void dump(std::ostream& sout) override;
};

class FitToFastVolumeSelector : public OriginalVolumeSelector {
public:
  FitToFastVolumeSelector(
    uint64_t _wal_total,
    uint64_t _db_total,
    uint64_t _slow_total)
    : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {}

  void get_paths(const std::string& base, paths& res) const override;
};
/**
 * Directed graph of locks.
 * Vertices - locks.  Edges (directed) - locking progression.
 * Edge A->B exists if the last lock taken was A and the next lock taken is B.
 *
 * A row represents the last lock taken.
 * A column represents the next lock taken.
 *
 *     >        | W | L | N | D | F
 * -------------|---|---|---|---|---
 * FileWriter W |   | > | > | > | >
 * log        L |   |   | > | > | >
 * nodes      N |   |   |   | > | >
 * dirty      D |   |   |   |   | >
 * File       F |
 *
 * Claim: deadlock is possible IFF the graph contains cycles.
 */
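/*
 * Illustrative consequence of the graph above (a sketch, not from the
 * original source): since the graph is acyclic, code paths that need
 * several of these locks must take them in W -> L -> N -> D -> F order,
 * e.g.
 *
 *   std::lock_guard wl(h->lock);     // FileWriter (W)
 *   std::lock_guard ll(log.lock);    // log        (L)
 *   std::lock_guard dl(dirty.lock);  // dirty      (D)
 *
 * Acquiring in any other order would add a back edge, i.e. a cycle, and
 * hence a potential deadlock.  The capital-letter suffixes on the method
 * names above (e.g. _flush_and_sync_log_LD) appear to record which of
 * these locks each function takes, in that order.
 */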
#endif