]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueFS.h
bb14ffc3697d035347d9b502aa8e93266b7441b6
[ceph.git] / ceph / src / os / bluestore / BlueFS.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4 #define CEPH_OS_BLUESTORE_BLUEFS_H
5
6 #include <atomic>
7 #include <mutex>
8
9 #include "bluefs_types.h"
10 #include "common/RefCountedObj.h"
11 #include "BlockDevice.h"
12
13 #include "boost/intrusive/list.hpp"
14 #include <boost/intrusive_ptr.hpp>
15
16 class PerfCounters;
17
18 class Allocator;
19
// Perf counter indices for BlueFS, registered via PerfCounters.
// The l_bluefs_first/l_bluefs_last sentinels bracket the range; the
// concrete values between them are positional and must not be reordered.
enum {
  l_bluefs_first = 732600,
  l_bluefs_gift_bytes,        ///< bytes gifted to us (see add_block_extent)
  l_bluefs_reclaim_bytes,     ///< bytes reclaimed from us (see reclaim_blocks)
  l_bluefs_db_total_bytes,    ///< total bytes owned on the BDEV_DB device
  l_bluefs_db_used_bytes,     ///< bytes in use on the BDEV_DB device
  l_bluefs_wal_total_bytes,   ///< total bytes owned on the BDEV_WAL device
  l_bluefs_wal_used_bytes,    ///< bytes in use on the BDEV_WAL device
  l_bluefs_slow_total_bytes,  ///< total bytes owned on the BDEV_SLOW device
  l_bluefs_slow_used_bytes,   ///< bytes in use on the BDEV_SLOW device
  l_bluefs_num_files,         ///< number of files we track
  l_bluefs_log_bytes,         ///< current size of our journal/log
  l_bluefs_log_compactions,   ///< how many times we compacted the log
  l_bluefs_logged_bytes,      ///< total bytes written to the log
  l_bluefs_files_written_wal, ///< files written by a WRITER_WAL writer
  l_bluefs_files_written_sst, ///< files written by a WRITER_SST writer
  l_bluefs_bytes_written_wal, ///< bytes written by WRITER_WAL writers
  l_bluefs_bytes_written_sst, ///< bytes written by WRITER_SST writers
  l_bluefs_last,
};
40
/**
 * BlueFS -- a minimal, journaled file-system-like layer on top of raw
 * BlockDevices.
 *
 * It tracks a flat namespace of directories and files (no nesting is
 * visible in this interface), keeps all metadata in an in-memory cache
 * guarded by a single global mutex, and persists metadata changes via a
 * replayable transaction log (see _replay / _compact_log_*).  Data may be
 * spread across up to three block devices (WAL / DB / SLOW).
 *
 * NOTE(review): consumed via BlueRocksEnv as a RocksDB Env backend (see
 * the comment on FileWriter::append) -- confirm against BlueRocksEnv.cc.
 */
class BlueFS {
public:
  CephContext* cct;
  static constexpr unsigned MAX_BDEV = 3;  ///< max number of block devices
  static constexpr unsigned BDEV_WAL = 0;  ///< small, fast device for the WAL
  static constexpr unsigned BDEV_DB = 1;   ///< primary db device
  static constexpr unsigned BDEV_SLOW = 2; ///< big, slow spill-over device

  /// classification of a FileWriter (see FileWriter::writer_type); used to
  /// attribute written bytes/files to the right perf counters
  enum {
    WRITER_UNKNOWN,
    WRITER_WAL,
    WRITER_SST,
  };

  /// in-memory, ref-counted representation of one file
  struct File : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    bluefs_fnode_t fnode;  ///< on-disk metadata (ino, size, extents, ...)
    int refs;              ///< link count (decremented by _drop_link)
    uint64_t dirty_seq;    ///< log seq this file was last dirtied at
                           ///  (keys into BlueFS::dirty_files); 0 = clean
    bool locked;           ///< a FileLock is held on this file
    bool deleted;          ///< unlinked; space released once unreferenced
    boost::intrusive::list_member_hook<> dirty_item; ///< hook for dirty_files

    // Open-handle accounting; the destructor asserts all are zero, so a
    // File must outlive every FileReader/FileWriter attached to it.
    std::atomic_int num_readers, num_writers;
    std::atomic_int num_reading;

    File()
      : RefCountedObject(NULL, 0),
	refs(0),
	dirty_seq(0),
	locked(false),
	deleted(false),
	num_readers(0),
	num_writers(0),
	num_reading(0)
      {}
    ~File() override {
      assert(num_readers.load() == 0);
      assert(num_writers.load() == 0);
      assert(num_reading.load() == 0);
      assert(!locked);
    }

    friend void intrusive_ptr_add_ref(File *f) {
      f->get();
    }
    friend void intrusive_ptr_release(File *f) {
      f->put();
    }
  };
  typedef boost::intrusive_ptr<File> FileRef;

  /// intrusive list of Files sharing the same dirty_seq (see dirty_files)
  typedef boost::intrusive::list<
      File,
      boost::intrusive::member_hook<
        File,
	boost::intrusive::list_member_hook<>,
	&File::dirty_item> > dirty_file_list_t;

  /// in-memory, ref-counted representation of one directory
  struct Dir : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    mempool::bluefs::map<string,FileRef> file_map; ///< filename -> File

    Dir() : RefCountedObject(NULL, 0) {}

    friend void intrusive_ptr_add_ref(Dir *d) {
      d->get();
    }
    friend void intrusive_ptr_release(Dir *d) {
      d->put();
    }
  };
  typedef boost::intrusive_ptr<Dir> DirRef;

  /// append-only write handle; buffers data until flush/fsync.
  /// Create via open_for_write(); MUST be destroyed via close_writer()
  /// (see the note on the destructor).
  struct FileWriter {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    uint64_t pos;           ///< start offset for buffer
    bufferlist buffer;      ///< new data to write (at end of file)
    bufferlist tail_block;  ///< existing partial block at end of file, if any
    bufferlist::page_aligned_appender buffer_appender;  ///< for const char* only
    int writer_type = 0;    ///< WRITER_*

    std::mutex lock;
    std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev

    FileWriter(FileRef f)
      : file(f),
	pos(0),
	buffer_appender(buffer.get_page_aligned_appender(
			  g_conf->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
      ++file->num_writers;
      iocv.fill(nullptr);
    }
    // NOTE: caller must call BlueFS::close_writer()
    ~FileWriter() {
      --file->num_writers;
    }

    // note: BlueRocksEnv uses this append exclusively, so it's safe
    // to use buffer_appender exclusively here (e.g., its notion of
    // offset will remain accurate).
    void append(const char *buf, size_t len) {
      buffer_appender.append(buf, len);
    }

    // note: used internally only, for ino 1 or 0.
    void append(bufferlist& bl) {
      buffer.claim_append(bl);
    }

    /// logical end-of-data offset once everything buffered so far lands;
    /// flushes the appender first so buffer.length() is accurate
    uint64_t get_effective_write_pos() {
      buffer_appender.flush();
      return pos + buffer.length();
    }
  };

  /// read-ahead/prefetch state for one FileReader
  struct FileReaderBuffer {
    MEMPOOL_CLASS_HELPERS();

    uint64_t bl_off;        ///< prefetch buffer logical offset
    bufferlist bl;          ///< prefetch buffer
    uint64_t pos;           ///< current logical offset
    uint64_t max_prefetch;  ///< max allowed prefetch

    explicit FileReaderBuffer(uint64_t mpf)
      : bl_off(0),
	pos(0),
	max_prefetch(mpf) {}

    /// logical offset just past the end of the prefetched data
    uint64_t get_buf_end() {
      return bl_off + bl.length();
    }
    /// bytes of prefetched data available at logical offset p (0 if p
    /// falls outside the buffered range)
    uint64_t get_buf_remaining(uint64_t p) {
      if (p >= bl_off && p < bl_off + bl.length())
	return bl_off + bl.length() - p;
      return 0;
    }

    // Both only move the logical position; the prefetched data (bl) is
    // kept and may still be reused if the new pos lands inside it.
    void skip(size_t n) {
      pos += n;
    }
    void seek(uint64_t offset) {
      pos = offset;
    }
  };

  /// read handle; create via open_for_read(), delete when done
  struct FileReader {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    FileReaderBuffer buf;
    bool random;      ///< hint: random access pattern (see open_for_read)
    bool ignore_eof;  ///< used when reading our log file

    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
      : file(f),
	buf(mpf),
	random(rand),
	ignore_eof(ie) {
      ++file->num_readers;
    }
    ~FileReader() {
      --file->num_readers;
    }
  };

  /// advisory lock token returned by lock_file(); release via unlock_file()
  struct FileLock {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    explicit FileLock(FileRef f) : file(f) {}
  };

private:
  std::mutex lock;  ///< global lock protecting all mutable state below

  PerfCounters *logger = nullptr;

  // cache
  mempool::bluefs::map<string, DirRef> dir_map;              ///< dirname -> Dir
  mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File

  // map of dirty files, files of same dirty_seq are grouped into list.
  map<uint64_t, dirty_file_list_t> dirty_files;

  bluefs_super_t super;       ///< latest superblock (as last written)
  uint64_t ino_last = 0;      ///< last assigned ino (this one is in use)
  uint64_t log_seq = 0;       ///< last used log seq (by current pending log_t)
  uint64_t log_seq_stable = 0; ///< last stable/synced log seq
  FileWriter *log_writer = 0; ///< writer for the log
  bluefs_transaction_t log_t; ///< pending, unwritten log transaction
  bool log_flushing = false;  ///< true while flushing the log
  std::condition_variable log_cond; ///< waiters for log_flushing to clear

  // state for async log compaction (_compact_log_async)
  uint64_t new_log_jump_to = 0;
  uint64_t old_log_jump_to = 0;
  FileRef new_log = nullptr;
  FileWriter *new_log_writer = nullptr;

  /*
   * There are up to 3 block devices:
   *
   *  BDEV_DB   db/      - the primary db device
   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
   */
  vector<BlockDevice*> bdev;                  ///< block devices we can use
  vector<IOContext*> ioc;                     ///< IOContexts for bdevs
  vector<interval_set<uint64_t> > block_all;  ///< extents in bdev we own
  vector<uint64_t> block_total;               ///< sum of block_all
  vector<Allocator*> alloc;                   ///< allocators for bdevs
  vector<interval_set<uint64_t>> pending_release; ///< extents to release

  void _init_logger();
  void _shutdown_logger();
  void _update_logger_stats();

  void _init_alloc();
  void _stop_alloc();

  void _pad_bl(bufferlist& bl);  ///< pad bufferlist to block size w/ zeros

  FileRef _get_file(uint64_t ino);
  void _drop_link(FileRef f);

  int _allocate(uint8_t bdev, uint64_t len,
		mempool::bluefs::vector<bluefs_extent_t> *ev);
  int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
  int _flush(FileWriter *h, bool force);
  int _fsync(FileWriter *h, std::unique_lock<std::mutex>& l);

  void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
  void wait_for_aio(FileWriter *h);  // safe to call without a lock

  int _flush_and_sync_log(std::unique_lock<std::mutex>& l,
			  uint64_t want_seq = 0,
			  uint64_t jump_to = 0);
  uint64_t _estimate_log_size();
  bool _should_compact_log();
  void _compact_log_dump_metadata(bluefs_transaction_t *t);
  void _compact_log_sync();
  void _compact_log_async(std::unique_lock<std::mutex>& l);

  //void _aio_finish(void *priv);

  void _flush_bdev_safely(FileWriter *h);
  void flush_bdev();  // this is safe to call without a lock

  int _preallocate(FileRef f, uint64_t off, uint64_t len);
  int _truncate(FileWriter *h, uint64_t off);

  int _read(
    FileReader *h,         ///< [in] read from here
    FileReaderBuffer *buf, ///< [in] reader state
    uint64_t offset,       ///< [in] offset
    size_t len,            ///< [in] this many bytes
    bufferlist *outbl,     ///< [out] optional: reference the result here
    char *out);            ///< [out] optional: or copy it here
  int _read_random(
    FileReader *h,         ///< [in] read from here
    uint64_t offset,       ///< [in] offset
    size_t len,            ///< [in] this many bytes
    char *out);            ///< [out] optional: or copy it here

  void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);

  int _open_super();
  int _write_super();
  int _replay(bool noop);  ///< replay journal

  FileWriter *_create_writer(FileRef f);
  void _close_writer(FileWriter *h);

  // always put the super in the second 4k block.  FIXME should this be
  // block size independent?
  unsigned get_super_offset() {
    return 4096;
  }
  unsigned get_super_length() {
    return 4096;
  }

public:
  BlueFS(CephContext* cct);
  ~BlueFS();

  // the super is always stored on bdev 0
  int mkfs(uuid_d osd_uuid);
  int mount();
  void umount();

  void collect_metadata(map<string,string> *pm);
  int fsck();

  uint64_t get_fs_usage();
  uint64_t get_total(unsigned id);
  uint64_t get_free(unsigned id);
  void get_usage(vector<pair<uint64_t,uint64_t>> *usage);  // [<free,total> ...]
  void dump_perf_counters(Formatter *f);

  /// get current extents that we own for given block device
  int get_block_extents(unsigned id, interval_set<uint64_t> *extents);

  /// open (create or overwrite) a file for append; caller owns *h and
  /// must release it via close_writer()
  int open_for_write(
    const string& dir,
    const string& file,
    FileWriter **h,
    bool overwrite);

  /// open an existing file for reading; caller owns and deletes *h
  int open_for_read(
    const string& dir,
    const string& file,
    FileReader **h,
    bool random = false);

  void close_writer(FileWriter *h) {
    std::lock_guard<std::mutex> l(lock);
    _close_writer(h);
  }

  int rename(const string& old_dir, const string& old_file,
	     const string& new_dir, const string& new_file);

  int readdir(const string& dirname, vector<string> *ls);

  int unlink(const string& dirname, const string& filename);
  int mkdir(const string& dirname);
  int rmdir(const string& dirname);

  bool dir_exists(const string& dirname);
  int stat(const string& dirname, const string& filename,
	   uint64_t *size, utime_t *mtime);

  int lock_file(const string& dirname, const string& filename, FileLock **p);
  int unlock_file(FileLock *l);

  void flush_log();
  void compact_log();

  /// sync any uncommitted state to disk
  void sync_metadata();

  int add_block_device(unsigned bdev, const string& path);
  bool bdev_support_label(unsigned id);
  uint64_t get_block_device_size(unsigned bdev);

  /// gift more block space
  void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);

  /// reclaim block space
  int reclaim_blocks(unsigned bdev, uint64_t want,
		     AllocExtentVector *extents);

  // Thin locking wrappers around the corresponding _-prefixed internals.
  void flush(FileWriter *h) {
    std::lock_guard<std::mutex> l(lock);
    _flush(h, false);
  }
  void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
    std::lock_guard<std::mutex> l(lock);
    _flush_range(h, offset, length);
  }
  int fsync(FileWriter *h) {
    std::unique_lock<std::mutex> l(lock);
    return _fsync(h, l);
  }
  int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
	   bufferlist *outbl, char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read(h, buf, offset, len, outbl, out);
  }
  int read_random(FileReader *h, uint64_t offset, size_t len,
		  char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read_random(h, offset, len, out);
  }
  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard<std::mutex> l(lock);
    _invalidate_cache(f, offset, len);
  }
  int preallocate(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard<std::mutex> l(lock);
    return _preallocate(f, offset, len);
  }
  int truncate(FileWriter *h, uint64_t offset) {
    std::lock_guard<std::mutex> l(lock);
    return _truncate(h, offset);
  }

};
438
439 #endif