]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.h
import ceph 14.2.5
[ceph.git] / ceph / src / os / bluestore / BlueFS.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4#define CEPH_OS_BLUESTORE_BLUEFS_H
5
6#include <atomic>
7#include <mutex>
8
9#include "bluefs_types.h"
10#include "common/RefCountedObj.h"
11#include "BlockDevice.h"
12
13#include "boost/intrusive/list.hpp"
14#include <boost/intrusive_ptr.hpp>
15
class PerfCounters;

class Allocator;

// Perf counter indices for the "bluefs" logger.  l_bluefs_first/l_bluefs_last
// bracket the range; everything in between is registered in _init_logger().
enum {
  l_bluefs_first = 732600,
  // space gifted to / reclaimed from BlueFS by the outer BlueStore
  l_bluefs_gift_bytes,
  l_bluefs_reclaim_bytes,
  // per-device capacity/usage (WAL, DB, SLOW)
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,
  // write traffic broken down by writer type / target device
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,
  // read traffic: random vs sequential, disk vs prefetch/buffer hits
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,

  l_bluefs_last,
};
55
11fdf7f2
TL
// Abstract callback interface implemented by the owner of the slow device
// (BlueStore) so BlueFS can ask it for more space when the DB device fills.
// The destructor is protected and non-virtual on purpose: BlueFS only holds a
// non-owning pointer and must never delete through this base.
class BlueFSDeviceExpander {
protected:
  ~BlueFSDeviceExpander() {}
public:
  /// How many bytes should be gifted to BlueFS given its current free/total.
  virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
    uint64_t bluefs_total) = 0;
  /// Carve out between min_size and size bytes from the owner's free space,
  /// returning the allocated extents.  Returns an error code on failure.
  virtual int allocate_freespace(
    uint64_t min_size,
    uint64_t size,
    PExtentVector& extents) = 0;
  /** Reports amount of space that can be transferred to BlueFS.
   * This gives either current state, when alloc_size is currently used
   * BlueFS's size, or simulation when alloc_size is different.
   * @params
   * alloc_size - allocation unit size to check
   */
  virtual size_t available_freespace(uint64_t alloc_size) = 0;
};
74
7c673cae
FG
// BlueFS: a minimal internal filesystem that hosts RocksDB's files on raw
// block devices (WAL / DB / SLOW), journaling its metadata in its own log.
class BlueFS {
public:
  CephContext* cct;
  // Device slots.  The NEWWAL/NEWDB slots are only populated transiently,
  // while migrating data to a replacement device.
  static constexpr unsigned MAX_BDEV = 5;
  static constexpr unsigned BDEV_WAL = 0;
  static constexpr unsigned BDEV_DB = 1;
  static constexpr unsigned BDEV_SLOW = 2;
  static constexpr unsigned BDEV_NEWWAL = 3;
  static constexpr unsigned BDEV_NEWDB = 4;

  // Classification of a FileWriter (stored in FileWriter::writer_type) used
  // to attribute written bytes to the right perf counters.
  enum {
    WRITER_UNKNOWN,
    WRITER_WAL,
    WRITER_SST,
  };
  /// In-memory representation of one bluefs file (refcounted; held via FileRef).
  struct File : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    bluefs_fnode_t fnode;   ///< on-disk metadata (ino, size, extents)
    int refs;               ///< directory-link count (not the RefCountedObject count)
    uint64_t dirty_seq;     ///< log seq this file's metadata was last dirtied in
    bool locked;            ///< held by a FileLock
    bool deleted;           ///< unlinked; space released once writers/readers drain
    boost::intrusive::list_member_hook<> dirty_item;  ///< hook into dirty_files list

    // Counts of concurrent accessors; asserted zero on destruction.
    std::atomic_int num_readers, num_writers;
    std::atomic_int num_reading;

    File()
      : RefCountedObject(NULL, 0),
        refs(0),
        dirty_seq(0),
        locked(false),
        deleted(false),
        num_readers(0),
        num_writers(0),
        num_reading(0)
      {}
    ~File() override {
      // a File must not die while anyone is still using it
      ceph_assert(num_readers.load() == 0);
      ceph_assert(num_writers.load() == 0);
      ceph_assert(num_reading.load() == 0);
      ceph_assert(!locked);
    }

    friend void intrusive_ptr_add_ref(File *f) {
      f->get();
    }
    friend void intrusive_ptr_release(File *f) {
      f->put();
    }
  };
  typedef boost::intrusive_ptr<File> FileRef;

  // Intrusive list of Files sharing one dirty_seq (value type of dirty_files).
  typedef boost::intrusive::list<
      File,
      boost::intrusive::member_hook<
        File,
        boost::intrusive::list_member_hook<>,
        &File::dirty_item> > dirty_file_list_t;
  /// A directory: just a name -> FileRef map (bluefs has a flat 1-level namespace).
  struct Dir : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    mempool::bluefs::map<string,FileRef> file_map;  ///< filename -> File

    Dir() : RefCountedObject(NULL, 0) {}

    friend void intrusive_ptr_add_ref(Dir *d) {
      d->get();
    }
    friend void intrusive_ptr_release(Dir *d) {
      d->put();
    }
  };
  typedef boost::intrusive_ptr<Dir> DirRef;
  /// Handle for appending to one File.  Obtain via open_for_write(); must be
  /// released through BlueFS::close_writer().
  struct FileWriter {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    uint64_t pos;           ///< start offset for buffer
    bufferlist buffer;      ///< new data to write (at end of file)
    bufferlist tail_block;  ///< existing partial block at end of file, if any
    bufferlist::page_aligned_appender buffer_appender;  //< for const char* only
    int writer_type = 0;    ///< WRITER_*
    int write_hint = WRITE_LIFE_NOT_SET;  ///< kernel write-lifetime hint

    ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
    std::array<IOContext*,MAX_BDEV> iocv;  ///< for each bdev
    std::array<bool, MAX_BDEV> dirty_devs; ///< which bdevs have unflushed writes

    FileWriter(FileRef f)
      : file(f),
        pos(0),
        buffer_appender(buffer.get_page_aligned_appender(
                          g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
      ++file->num_writers;
      iocv.fill(nullptr);
      dirty_devs.fill(false);
      // ino 1 is the bluefs log itself; hint a medium write lifetime for it
      if (f->fnode.ino == 1) {
        write_hint = WRITE_LIFE_MEDIUM;
      }
    }
    // NOTE: caller must call BlueFS::close_writer()
    ~FileWriter() {
      --file->num_writers;
    }

    // note: BlueRocksEnv uses this append exclusively, so it's safe
    // to use buffer_appender exclusively here (e.g., it's notion of
    // offset will remain accurate).
    void append(const char *buf, size_t len) {
      buffer_appender.append(buf, len);
    }

    // note: used internally only, for ino 1 or 0.
    void append(bufferlist& bl) {
      buffer.claim_append(bl);
    }

    /// logical file offset the next append would land at
    uint64_t get_effective_write_pos() {
      buffer_appender.flush();
      return pos + buffer.length();
    }
  };
203 struct FileReaderBuffer {
204 MEMPOOL_CLASS_HELPERS();
205
206 uint64_t bl_off; ///< prefetch buffer logical offset
207 bufferlist bl; ///< prefetch buffer
208 uint64_t pos; ///< current logical offset
209 uint64_t max_prefetch; ///< max allowed prefetch
210
211 explicit FileReaderBuffer(uint64_t mpf)
212 : bl_off(0),
213 pos(0),
214 max_prefetch(mpf) {}
215
216 uint64_t get_buf_end() {
217 return bl_off + bl.length();
218 }
219 uint64_t get_buf_remaining(uint64_t p) {
220 if (p >= bl_off && p < bl_off + bl.length())
221 return bl_off + bl.length() - p;
222 return 0;
223 }
224
225 void skip(size_t n) {
226 pos += n;
227 }
228 void seek(uint64_t offset) {
229 pos = offset;
230 }
231 };
232
  /// Handle for reading one File.  Obtain via open_for_read().
  struct FileReader {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    FileReaderBuffer buf;   ///< prefetch state (unused for random readers)
    bool random;            ///< random-access mode: no prefetching
    bool ignore_eof;        ///< used when reading our log file

    // guards buf; built unnamed/untracked/unbacked-off on purpose
    ceph::shared_mutex lock {
      ceph::make_shared_mutex(std::string(), false, false, false)
    };


    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
      : file(f),
        buf(mpf),
        random(rand),
        ignore_eof(ie) {
      ++file->num_readers;
    }
    ~FileReader() {
      --file->num_readers;
    }
  };
  /// Token returned by lock_file(); holds the locked File alive until
  /// unlock_file() is called.
  struct FileLock {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    explicit FileLock(FileRef f) : file(f) {}
  };
265private:
11fdf7f2 266 ceph::mutex lock = ceph::make_mutex("BlueFS::lock");
7c673cae
FG
267
268 PerfCounters *logger = nullptr;
269
11fdf7f2
TL
270 uint64_t max_bytes[MAX_BDEV] = {0};
271 uint64_t max_bytes_pcounters[MAX_BDEV] = {
272 l_bluefs_max_bytes_wal,
273 l_bluefs_max_bytes_db,
274 l_bluefs_max_bytes_slow,
275 };
276
7c673cae
FG
277 // cache
278 mempool::bluefs::map<string, DirRef> dir_map; ///< dirname -> Dir
279 mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File
280
281 // map of dirty files, files of same dirty_seq are grouped into list.
282 map<uint64_t, dirty_file_list_t> dirty_files;
283
284 bluefs_super_t super; ///< latest superblock (as last written)
285 uint64_t ino_last = 0; ///< last assigned ino (this one is in use)
286 uint64_t log_seq = 0; ///< last used log seq (by current pending log_t)
287 uint64_t log_seq_stable = 0; ///< last stable/synced log seq
288 FileWriter *log_writer = 0; ///< writer for the log
289 bluefs_transaction_t log_t; ///< pending, unwritten log transaction
290 bool log_flushing = false; ///< true while flushing the log
11fdf7f2 291 ceph::condition_variable log_cond;
7c673cae
FG
292
293 uint64_t new_log_jump_to = 0;
294 uint64_t old_log_jump_to = 0;
295 FileRef new_log = nullptr;
296 FileWriter *new_log_writer = nullptr;
297
298 /*
299 * There are up to 3 block devices:
300 *
301 * BDEV_DB db/ - the primary db device
302 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
303 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
304 */
305 vector<BlockDevice*> bdev; ///< block devices we can use
306 vector<IOContext*> ioc; ///< IOContexts for bdevs
307 vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own
7c673cae 308 vector<Allocator*> alloc; ///< allocators for bdevs
eafe8130 309 vector<uint64_t> alloc_size; ///< alloc size for each device
7c673cae
FG
310 vector<interval_set<uint64_t>> pending_release; ///< extents to release
311
11fdf7f2
TL
312 BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
313
314 BlueFSDeviceExpander* slow_dev_expander = nullptr;
315
eafe8130
TL
316 class SocketHook;
317 SocketHook* asok_hook = nullptr;
318
7c673cae
FG
  void _init_logger();
  void _shutdown_logger();
  void _update_logger_stats();

  void _init_alloc();
  void _stop_alloc();

  void _pad_bl(bufferlist& bl);  ///< pad bufferlist to block size w/ zeros

  FileRef _get_file(uint64_t ino);
  void _drop_link(FileRef f);

  // helpers for growing the slow device via slow_dev_expander;
  // falls back to BDEV_DB when no separate slow device exists
  int _get_slow_device_id() { return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; }
  const char* get_device_name(unsigned id);
  int _expand_slow_device(uint64_t min_size, PExtentVector& extents);
  int _allocate(uint8_t bdev, uint64_t len,
                bluefs_fnode_t* node);
  int _allocate_without_fallback(uint8_t id, uint64_t len,
                                 PExtentVector* extents);

  int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
  int _flush(FileWriter *h, bool force);
  // takes the caller's unique_lock so it can drop/reacquire while waiting
  int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l);

#ifdef HAVE_LIBAIO
  void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
  void wait_for_aio(FileWriter *h);  // safe to call without a lock
#endif

  int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
                          uint64_t want_seq = 0,
                          uint64_t jump_to = 0);
  uint64_t _estimate_log_size();
  bool _should_compact_log();

  // flags for _compact_log_dump_metadata / _rewrite_log_sync (bitmask)
  enum {
    REMOVE_DB = 1,
    REMOVE_WAL = 2,
    RENAME_SLOW2DB = 4,
    RENAME_DB2SLOW = 8,
  };
  void _compact_log_dump_metadata(bluefs_transaction_t *t,
                                  int flags);
  void _compact_log_sync();
  void _compact_log_async(std::unique_lock<ceph::mutex>& l);

  void _rewrite_log_sync(bool allocate_with_fallback,
                         int super_dev,
                         int log_dev,
                         int new_log_dev,
                         int flags);

  //void _aio_finish(void *priv);

  void _flush_bdev_safely(FileWriter *h);
  void flush_bdev();  // this is safe to call without a lock
  void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock

  int _preallocate(FileRef f, uint64_t off, uint64_t len);
  int _truncate(FileWriter *h, uint64_t off);

  int _read(
    FileReader *h,         ///< [in] read from here
    FileReaderBuffer *buf, ///< [in] reader state
    uint64_t offset,       ///< [in] offset
    size_t len,            ///< [in] this many bytes
    bufferlist *outbl,     ///< [out] optional: reference the result here
    char *out);            ///< [out] optional: or copy it here
  int _read_random(
    FileReader *h,         ///< [in] read from here
    uint64_t offset,       ///< [in] offset
    size_t len,            ///< [in] this many bytes
    char *out);            ///< [out] optional: or copy it here

  void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);

  int _open_super();
  int _write_super(int dev);
  int _replay(bool noop, bool to_stdout = false);  ///< replay journal

  FileWriter *_create_writer(FileRef f);
  void _close_writer(FileWriter *h);

  // always put the super in the second 4k block.  FIXME should this be
  // block size independent?
  unsigned get_super_offset() {
    return 4096;
  }
  unsigned get_super_length() {
    return 4096;
  }

  void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);
7c673cae
FG
public:
  BlueFS(CephContext* cct);
  ~BlueFS();

  // the super is always stored on bdev 0
  int mkfs(uuid_d osd_uuid);
  int mount();
  void umount();

  // prepare a freshly-added replacement device (BDEV_NEWWAL/BDEV_NEWDB)
  int prepare_new_device(int id);

  int log_dump();

  void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id);
  void get_devices(set<string> *ls);
  uint64_t get_alloc_size(int id) {
    return alloc_size[id];
  }
  int fsck();

  // migrate bluefs data off the given source devices onto dev_target
  int device_migrate_to_new(
    CephContext *cct,
    const set<int>& devs_source,
    int dev_target);
  int device_migrate_to_existing(
    CephContext *cct,
    const set<int>& devs_source,
    int dev_target);

  uint64_t get_used();
  uint64_t get_total(unsigned id);
  uint64_t get_free(unsigned id);
  void get_usage(vector<pair<uint64_t,uint64_t>> *usage);  // [<free,total> ...]
  void dump_perf_counters(Formatter *f);

  void dump_block_extents(ostream& out);

  /// get current extents that we own for given block device
  int get_block_extents(unsigned id, interval_set<uint64_t> *extents);

  int open_for_write(
    const string& dir,
    const string& file,
    FileWriter **h,
    bool overwrite);

  int open_for_read(
    const string& dir,
    const string& file,
    FileReader **h,
    bool random = false);

  void close_writer(FileWriter *h) {
    std::lock_guard l(lock);
    _close_writer(h);
  }

  int rename(const string& old_dir, const string& old_file,
             const string& new_dir, const string& new_file);

  int readdir(const string& dirname, vector<string> *ls);

  int unlink(const string& dirname, const string& filename);
  int mkdir(const string& dirname);
  int rmdir(const string& dirname);
  bool wal_is_rotational();

  bool dir_exists(const string& dirname);
  int stat(const string& dirname, const string& filename,
           uint64_t *size, utime_t *mtime);

  int lock_file(const string& dirname, const string& filename, FileLock **p);
  int unlock_file(FileLock *l);

  void flush_log();
  void compact_log();

  /// sync any uncommitted state to disk
  void sync_metadata();

  void set_slow_device_expander(BlueFSDeviceExpander* a) {
    slow_dev_expander = a;
  }
  int add_block_device(unsigned bdev, const string& path, bool trim,
                       bool shared_with_bluestore=false);
  bool bdev_support_label(unsigned id);
  uint64_t get_block_device_size(unsigned bdev);

  /// gift more block space
  void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len) {
    std::unique_lock l(lock);
    _add_block_extent(bdev, offset, len);
    // persist the ownership change in the bluefs log before returning
    int r = _flush_and_sync_log(l);
    ceph_assert(r == 0);
  }

  /// reclaim block space
  int reclaim_blocks(unsigned bdev, uint64_t want,
                     PExtentVector *extents);

  // handler for discard event
  void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);

  // Thin public wrappers: take the global lock, then delegate to the
  // corresponding _-prefixed implementation.
  void flush(FileWriter *h) {
    std::lock_guard l(lock);
    _flush(h, false);
  }
  void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
    std::lock_guard l(lock);
    _flush_range(h, offset, length);
  }
  int fsync(FileWriter *h) {
    std::unique_lock l(lock);
    return _fsync(h, l);
  }
  int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
           bufferlist *outbl, char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read(h, buf, offset, len, outbl, out);
  }
  int read_random(FileReader *h, uint64_t offset, size_t len,
                  char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read_random(h, offset, len, out);
  }
  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard l(lock);
    _invalidate_cache(f, offset, len);
  }
  int preallocate(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard l(lock);
    return _preallocate(f, offset, len);
  }
  int truncate(FileWriter *h, uint64_t offset) {
    std::lock_guard l(lock);
    return _truncate(h, offset);
  }

};
555
556#endif