// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
#define CEPH_OS_BLUESTORE_BLUEFS_H

#include <atomic>
#include <mutex>

#include "bluefs_types.h"
#include "BlockDevice.h"

#include "common/RefCountedObj.h"
#include "common/ceph_context.h"
#include "global/global_context.h"
#include "include/common_fwd.h"

#include "boost/intrusive/list.hpp"
#include "boost/dynamic_bitset.hpp"

class Allocator;

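// BlueFS perf counter indices; the counters themselves are registered in
// BlueFS::_init_logger() and exposed via BlueFS::get_perf_counters().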
enum {
  l_bluefs_first = 732600,
  l_bluefs_gift_bytes,
  l_bluefs_reclaim_bytes,
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,

  l_bluefs_last,
};

class BlueFSDeviceExpander {
protected:
  ~BlueFSDeviceExpander() {}
public:
  virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
    uint64_t bluefs_total) = 0;
  virtual int allocate_freespace(
    uint64_t min_size,
    uint64_t size,
    PExtentVector& extents) = 0;
  /** Reports the amount of space that can be transferred to BlueFS.
   * The result reflects the current state when alloc_size equals the
   * allocation unit size BlueFS currently uses, or a simulation when
   * alloc_size differs.
   * @param alloc_size allocation unit size to check
   */
  virtual uint64_t available_freespace(uint64_t alloc_size) = 0;
};
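
// Illustrative only: in this tree the expander is implemented by BlueStore so
// that BlueFS can be gifted space from the shared main device. A minimal
// sketch of an implementation (my_allocate/my_free_space are hypothetical
// helpers, not real APIs):
//
//   struct MyExpander : public BlueFSDeviceExpander {
//     uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
//                                              uint64_t bluefs_total) override {
//       return bluefs_total / 20;                       // e.g. grow in 5% steps
//     }
//     int allocate_freespace(uint64_t min_size, uint64_t size,
//                            PExtentVector& extents) override {
//       return my_allocate(min_size, size, &extents);   // hypothetical
//     }
//     uint64_t available_freespace(uint64_t alloc_size) override {
//       return my_free_space(alloc_size);               // hypothetical
//     }
//   };
//
// The expander is installed with BlueFS::set_slow_device_expander(), declared
// below.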

class BlueFSVolumeSelector {
public:
  typedef std::vector<std::pair<std::string, uint64_t>> paths;

  virtual ~BlueFSVolumeSelector() {
  }
  virtual void* get_hint_for_log() const = 0;
  virtual void* get_hint_by_dir(const std::string& dirname) const = 0;

  virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
  virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
  virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
  virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
  virtual uint8_t select_prefer_bdev(void* hint) = 0;
  virtual void get_paths(const std::string& base, paths& res) const = 0;
  virtual void dump(ostream& sout) = 0;
};
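
// The default policy is OriginalVolumeSelector, declared at the end of this
// header; a selector is installed with BlueFS::set_volume_selector(), which
// takes ownership of the pointer.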
class BlueFS;

class BlueFS {
public:
  CephContext* cct;
  static constexpr unsigned MAX_BDEV = 5;
  static constexpr unsigned BDEV_WAL = 0;
  static constexpr unsigned BDEV_DB = 1;
  static constexpr unsigned BDEV_SLOW = 2;
  static constexpr unsigned BDEV_NEWWAL = 3;
  static constexpr unsigned BDEV_NEWDB = 4;

  enum {
    WRITER_UNKNOWN,
    WRITER_WAL,
    WRITER_SST,
  };

  struct File : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    bluefs_fnode_t fnode;
    int refs;
    uint64_t dirty_seq;
    bool locked;
    bool deleted;
    boost::intrusive::list_member_hook<> dirty_item;

    std::atomic_int num_readers, num_writers;
    std::atomic_int num_reading;

    void* vselector_hint = nullptr;

  private:
    FRIEND_MAKE_REF(File);
    File()
      : refs(0),
        dirty_seq(0),
        locked(false),
        deleted(false),
        num_readers(0),
        num_writers(0),
        num_reading(0),
        vselector_hint(nullptr)
    {}
    ~File() override {
      ceph_assert(num_readers.load() == 0);
      ceph_assert(num_writers.load() == 0);
      ceph_assert(num_reading.load() == 0);
      ceph_assert(!locked);
    }
  };
  using FileRef = ceph::ref_t<File>;

  typedef boost::intrusive::list<
      File,
      boost::intrusive::member_hook<
        File,
        boost::intrusive::list_member_hook<>,
        &File::dirty_item> > dirty_file_list_t;

  struct Dir : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    mempool::bluefs::map<string,FileRef> file_map;

  private:
    FRIEND_MAKE_REF(Dir);
    Dir() = default;
  };
  using DirRef = ceph::ref_t<Dir>;

  struct FileWriter {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    uint64_t pos = 0;       ///< start offset for buffer
    bufferlist buffer;      ///< new data to write (at end of file)
    bufferlist tail_block;  ///< existing partial block at end of file, if any
    bufferlist::page_aligned_appender buffer_appender;  ///< for const char* only
    int writer_type = 0;    ///< WRITER_*
    int write_hint = WRITE_LIFE_NOT_SET;

    ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
    std::array<IOContext*,MAX_BDEV> iocv;  ///< for each bdev
    std::array<bool, MAX_BDEV> dirty_devs;

    FileWriter(FileRef f)
      : file(std::move(f)),
        buffer_appender(buffer.get_page_aligned_appender(
                          g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
      ++file->num_writers;
      iocv.fill(nullptr);
      dirty_devs.fill(false);
      if (file->fnode.ino == 1) {
        write_hint = WRITE_LIFE_MEDIUM;
      }
    }
    // NOTE: caller must call BlueFS::close_writer()
    ~FileWriter() {
      --file->num_writers;
    }

    // note: BlueRocksEnv uses this append exclusively, so it's safe
    // to use buffer_appender exclusively here (i.e., its notion of
    // offset will remain accurate).
    void append(const char *buf, size_t len) {
      buffer_appender.append(buf, len);
    }

    // note: used internally only, for ino 1 or 0.
    void append(bufferlist& bl) {
      buffer.claim_append(bl);
    }

    uint64_t get_effective_write_pos() {
      buffer_appender.flush();
      return pos + buffer.length();
    }
  };
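
  // Illustrative write-path usage (a sketch only; the directory and file
  // names and the data buffer below are made up):
  //
  //   BlueFS::FileWriter *h = nullptr;
  //   int r = fs.open_for_write("db.wal", "000001.log", &h, false);
  //   if (r == 0) {
  //     h->append(data, len);   // staged in h->buffer via buffer_appender
  //     fs.fsync(h);            // flush the data and sync the BlueFS log
  //     fs.close_writer(h);     // per the NOTE above, release via BlueFS
  //   }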

  struct FileReaderBuffer {
    MEMPOOL_CLASS_HELPERS();

    uint64_t bl_off = 0;     ///< prefetch buffer logical offset
    bufferlist bl;           ///< prefetch buffer
    uint64_t pos = 0;        ///< current logical offset
    uint64_t max_prefetch;   ///< max allowed prefetch

    explicit FileReaderBuffer(uint64_t mpf)
      : max_prefetch(mpf) {}

    uint64_t get_buf_end() const {
      return bl_off + bl.length();
    }
    uint64_t get_buf_remaining(uint64_t p) const {
      if (p >= bl_off && p < bl_off + bl.length())
        return bl_off + bl.length() - p;
      return 0;
    }

    void skip(size_t n) {
      pos += n;
    }
    void seek(uint64_t offset) {
      pos = offset;
    }
  };

  struct FileReader {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    FileReaderBuffer buf;
    bool random;
    bool ignore_eof;   ///< used when reading our log file

    ceph::shared_mutex lock {
      ceph::make_shared_mutex(std::string(), false, false, false)
    };

    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
      : file(f),
        buf(mpf),
        random(rand),
        ignore_eof(ie) {
      ++file->num_readers;
    }
    ~FileReader() {
      --file->num_readers;
    }
  };
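
  // Illustrative read-path usage (a sketch only; the directory and file names
  // below are made up). Note that read() and read_random() do not take the
  // global BlueFS lock:
  //
  //   BlueFS::FileReader *h = nullptr;
  //   if (fs.open_for_read("db", "CURRENT", &h, false) == 0) {
  //     bufferlist bl;
  //     fs.read(h, &h->buf, 0, 4096, &bl, nullptr);  // sequential, may prefetch
  //     delete h;                                    // drops file->num_readers
  //   }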

  struct FileLock {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    explicit FileLock(FileRef f) : file(std::move(f)) {}
  };

private:
  ceph::mutex lock = ceph::make_mutex("BlueFS::lock");

  PerfCounters *logger = nullptr;

  uint64_t max_bytes[MAX_BDEV] = {0};
  uint64_t max_bytes_pcounters[MAX_BDEV] = {
    l_bluefs_max_bytes_wal,
    l_bluefs_max_bytes_db,
    l_bluefs_max_bytes_slow,
  };

  // cache
  mempool::bluefs::map<string, DirRef> dir_map;              ///< dirname -> Dir
  mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File

  // map of dirty files; files with the same dirty_seq are grouped into a list
  map<uint64_t, dirty_file_list_t> dirty_files;

  bluefs_super_t super;         ///< latest superblock (as last written)
  uint64_t ino_last = 0;        ///< last assigned ino (this one is in use)
  uint64_t log_seq = 0;         ///< last used log seq (by current pending log_t)
  uint64_t log_seq_stable = 0;  ///< last stable/synced log seq
  FileWriter *log_writer = nullptr;  ///< writer for the log
  bluefs_transaction_t log_t;   ///< pending, unwritten log transaction
  bool log_flushing = false;    ///< true while flushing the log
  ceph::condition_variable log_cond;

  uint64_t new_log_jump_to = 0;
  uint64_t old_log_jump_to = 0;
  FileRef new_log = nullptr;
  FileWriter *new_log_writer = nullptr;

  /*
   * There are up to 3 block devices:
   *
   *  BDEV_DB   db/      - the primary db device
   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
   */
  vector<BlockDevice*> bdev;                  ///< block devices we can use
  vector<IOContext*> ioc;                     ///< IOContexts for bdevs
  vector<interval_set<uint64_t> > block_all;  ///< extents in bdev we own
  vector<Allocator*> alloc;                   ///< allocators for bdevs
  vector<uint64_t> alloc_size;                ///< alloc size for each device
  vector<interval_set<uint64_t>> pending_release;  ///< extents to release
  vector<interval_set<uint64_t>> block_unused_too_granular;

  BlockDevice::aio_callback_t discard_cb[3];  // discard callbacks for each dev

  BlueFSDeviceExpander* slow_dev_expander = nullptr;
  std::unique_ptr<BlueFSVolumeSelector> vselector;

  class SocketHook;
  SocketHook* asok_hook = nullptr;

  void _init_logger();
  void _shutdown_logger();
  void _update_logger_stats();

  void _init_alloc();
  void _stop_alloc();

  void _pad_bl(bufferlist& bl);  ///< pad bufferlist to block size w/ zeros

  FileRef _get_file(uint64_t ino);
  void _drop_link(FileRef f);

  unsigned _get_slow_device_id() {
    return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
  }
  const char* get_device_name(unsigned id);
  int _expand_slow_device(uint64_t min_size, PExtentVector& extents);
  int _allocate(uint8_t bdev, uint64_t len,
                bluefs_fnode_t* node);
  int _allocate_without_fallback(uint8_t id, uint64_t len,
                                 PExtentVector* extents);

  int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
  int _flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l);
  int _flush(FileWriter *h, bool force, bool *flushed = nullptr);
  int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l);

#ifdef HAVE_LIBAIO
  void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
  void wait_for_aio(FileWriter *h);  // safe to call without a lock
#endif

  int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
                          uint64_t want_seq = 0,
                          uint64_t jump_to = 0);
  uint64_t _estimate_log_size();
  bool _should_compact_log();

  enum {
    REMOVE_DB = 1,
    REMOVE_WAL = 2,
    RENAME_SLOW2DB = 4,
    RENAME_DB2SLOW = 8,
  };
  void _compact_log_dump_metadata(bluefs_transaction_t *t,
                                  int flags);
  void _compact_log_sync();
  void _compact_log_async(std::unique_lock<ceph::mutex>& l);

  void _rewrite_log_and_layout_sync(bool allocate_with_fallback,
                                    int super_dev,
                                    int log_dev,
                                    int new_log_dev,
                                    int flags,
                                    std::optional<bluefs_layout_t> layout);

  //void _aio_finish(void *priv);

  void _flush_bdev_safely(FileWriter *h);
  void flush_bdev();  // this is safe to call without a lock
  void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock

  int _preallocate(FileRef f, uint64_t off, uint64_t len);
  int _truncate(FileWriter *h, uint64_t off);

  int64_t _read(
    FileReader *h,          ///< [in] read from here
    FileReaderBuffer *buf,  ///< [in] reader state
    uint64_t offset,        ///< [in] offset
    size_t len,             ///< [in] this many bytes
    bufferlist *outbl,      ///< [out] optional: reference the result here
    char *out);             ///< [out] optional: or copy it here
  int64_t _read_random(
    FileReader *h,          ///< [in] read from here
    uint64_t offset,        ///< [in] offset
    uint64_t len,           ///< [in] this many bytes
    char *out);             ///< [out] optional: or copy it here

  void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);

  int _open_super();
  int _write_super(int dev);
  int _check_new_allocations(const bluefs_fnode_t& fnode,
                             size_t dev_count,
                             boost::dynamic_bitset<uint64_t>* owned_blocks,
                             boost::dynamic_bitset<uint64_t>* used_blocks);
  int _verify_alloc_granularity(
    __u8 id, uint64_t offset, uint64_t length,
    const char *op);
  int _adjust_granularity(
    __u8 id, uint64_t *offset, uint64_t *length, bool alloc);
  int _replay(bool noop, bool to_stdout = false);  ///< replay journal

  FileWriter *_create_writer(FileRef f);
  void _close_writer(FileWriter *h);

  // always put the super in the second 4k block.  FIXME should this be
  // block size independent?
  unsigned get_super_offset() {
    return 4096;
  }
  unsigned get_super_length() {
    return 4096;
  }
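
  // Note: with offset 4096 and length 4096 the superblock occupies the byte
  // range [4096, 8192); the first 4 KiB block is left alone (it is presumably
  // reserved for the bdev label).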

  void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len,
                         bool skip=false);

public:
  BlueFS(CephContext* cct);
  ~BlueFS();

  // the super is always stored on bdev 0
  int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
  int mount();
  int maybe_verify_layout(const bluefs_layout_t& layout) const;
  void umount(bool avoid_compact = false);
  int prepare_new_device(int id, const bluefs_layout_t& layout);

  int log_dump();

  void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id);
  void get_devices(set<string> *ls);
  uint64_t get_alloc_size(int id) {
    return alloc_size[id];
  }
  int fsck();

  int device_migrate_to_new(
    CephContext *cct,
    const set<int>& devs_source,
    int dev_target,
    const bluefs_layout_t& layout);
  int device_migrate_to_existing(
    CephContext *cct,
    const set<int>& devs_source,
    int dev_target,
    const bluefs_layout_t& layout);

  uint64_t get_used();
  uint64_t get_total(unsigned id);
  uint64_t get_free(unsigned id);
  void get_usage(vector<pair<uint64_t,uint64_t>> *usage);  // [<free,total> ...]
  void dump_perf_counters(Formatter *f);

  void dump_block_extents(ostream& out);

  /// get current extents that we own for given block device
  int get_block_extents(unsigned id, interval_set<uint64_t> *extents);

  int open_for_write(
    const string& dir,
    const string& file,
    FileWriter **h,
    bool overwrite);

  int open_for_read(
    const string& dir,
    const string& file,
    FileReader **h,
    bool random = false);

  void close_writer(FileWriter *h) {
    std::lock_guard l(lock);
    _close_writer(h);
  }

  int rename(const string& old_dir, const string& old_file,
             const string& new_dir, const string& new_file);

  int readdir(const string& dirname, vector<string> *ls);

  int unlink(const string& dirname, const string& filename);
  int mkdir(const string& dirname);
  int rmdir(const string& dirname);
  bool wal_is_rotational();

  bool dir_exists(const string& dirname);
  int stat(const string& dirname, const string& filename,
           uint64_t *size, utime_t *mtime);

  int lock_file(const string& dirname, const string& filename, FileLock **p);
  int unlock_file(FileLock *l);

  void compact_log();

  /// sync any uncommitted state to disk
  void sync_metadata(bool avoid_compact);
  /// test and compact log, if necessary
  void _maybe_compact_log(std::unique_lock<ceph::mutex>& l);

  void set_slow_device_expander(BlueFSDeviceExpander* a) {
    slow_dev_expander = a;
  }
  void set_volume_selector(BlueFSVolumeSelector* s) {
    vselector.reset(s);
  }
  void dump_volume_selector(ostream& sout) {
    vselector->dump(sout);
  }
  void get_vselector_paths(const std::string& base,
                           BlueFSVolumeSelector::paths& res) const {
    return vselector->get_paths(base, res);
  }

  int add_block_device(unsigned bdev, const string& path, bool trim,
                       bool shared_with_bluestore=false);
  bool bdev_support_label(unsigned id);
  uint64_t get_block_device_size(unsigned bdev);

  /// gift more block space
  void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len,
                        bool skip=false) {
    std::unique_lock l(lock);
    _add_block_extent(bdev, offset, len, skip);
    int r = _flush_and_sync_log(l);
    ceph_assert(r == 0);
  }

  /// reclaim block space
  int reclaim_blocks(unsigned bdev, uint64_t want,
                     PExtentVector *extents);

  // handler for discard event
  void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);

  void flush(FileWriter *h, bool force = false) {
    std::unique_lock l(lock);
    int r = _flush(h, force, l);
    ceph_assert(r == 0);
  }
  void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
    std::lock_guard l(lock);
    _flush_range(h, offset, length);
  }
  int fsync(FileWriter *h) {
    std::unique_lock l(lock);
    int r = _fsync(h, l);
    _maybe_compact_log(l);
    return r;
  }
  int64_t read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
               bufferlist *outbl, char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read(h, buf, offset, len, outbl, out);
  }
  int64_t read_random(FileReader *h, uint64_t offset, size_t len,
                      char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read_random(h, offset, len, out);
  }
  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard l(lock);
    _invalidate_cache(f, offset, len);
  }
  int preallocate(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard l(lock);
    return _preallocate(f, offset, len);
  }
  int truncate(FileWriter *h, uint64_t offset) {
    std::lock_guard l(lock);
    return _truncate(h, offset);
  }
  int do_replay_recovery_read(FileReader *log,
                              size_t log_pos,
                              size_t read_offset,
                              size_t read_len,
                              bufferlist* bl);

  /// test purpose methods
  void debug_inject_duplicate_gift(unsigned bdev, uint64_t offset, uint64_t len);
  const PerfCounters* get_perf_counters() const {
    return logger;
  }
};

class OriginalVolumeSelector : public BlueFSVolumeSelector {
  uint64_t wal_total;
  uint64_t db_total;
  uint64_t slow_total;

public:
  OriginalVolumeSelector(
    uint64_t _wal_total,
    uint64_t _db_total,
    uint64_t _slow_total)
    : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}

  void* get_hint_for_log() const override;
  void* get_hint_by_dir(const std::string& dirname) const override;

  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
    // do nothing
    return;
  }
  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
    // do nothing
    return;
  }
  void add_usage(void* hint, uint64_t fsize) override {
    // do nothing
    return;
  }
  void sub_usage(void* hint, uint64_t fsize) override {
    // do nothing
    return;
  }

  uint8_t select_prefer_bdev(void* hint) override;
  void get_paths(const std::string& base, paths& res) const override;
  void dump(ostream& sout) override;
};
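
// Illustrative wiring (a sketch; the *_bytes variables are placeholders):
// the selector is handed to BlueFS, which takes ownership of the pointer:
//
//   bluefs->set_volume_selector(
//     new OriginalVolumeSelector(wal_total_bytes, db_total_bytes,
//                                slow_total_bytes));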

#endif