]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | #ifndef CEPH_OS_BLUESTORE_BLUEFS_H | |
4 | #define CEPH_OS_BLUESTORE_BLUEFS_H | |
5 | ||
6 | #include <atomic> | |
7 | #include <mutex> | |
8 | ||
9 | #include "bluefs_types.h" | |
10 | #include "common/RefCountedObj.h" | |
11 | #include "BlockDevice.h" | |
12 | ||
13 | #include "boost/intrusive/list.hpp" | |
14 | #include <boost/intrusive_ptr.hpp> | |
15 | ||
16 | class PerfCounters; | |
17 | ||
18 | class Allocator; | |
19 | ||
// Perf-counter indices for BlueFS, registered with PerfCounters.
// Values are spelled out explicitly so that a stray insertion or
// reorder cannot silently renumber counters; they are identical to
// the implicitly-incremented originals (first = 732600, last = 732617).
enum {
  l_bluefs_first             = 732600,
  l_bluefs_gift_bytes        = 732601,
  l_bluefs_reclaim_bytes     = 732602,
  l_bluefs_db_total_bytes    = 732603,
  l_bluefs_db_used_bytes     = 732604,
  l_bluefs_wal_total_bytes   = 732605,
  l_bluefs_wal_used_bytes    = 732606,
  l_bluefs_slow_total_bytes  = 732607,
  l_bluefs_slow_used_bytes   = 732608,
  l_bluefs_num_files         = 732609,
  l_bluefs_log_bytes         = 732610,
  l_bluefs_log_compactions   = 732611,
  l_bluefs_logged_bytes      = 732612,
  l_bluefs_files_written_wal = 732613,
  l_bluefs_files_written_sst = 732614,
  l_bluefs_bytes_written_wal = 732615,
  l_bluefs_bytes_written_sst = 732616,
  l_bluefs_last              = 732617,
};
40 | ||
/**
 * BlueFS - a minimal file-system layer over raw block devices.
 *
 * All metadata (directories, files, their extent lists) is kept in
 * memory; durability comes from a journal ("log") of
 * bluefs_transaction_t entries that is replayed on mount (see
 * _replay()).  Up to three devices may back the fs: a small fast WAL
 * device, the primary DB device, and a big slow spill-over device
 * (see the comment above `bdev` below).
 *
 * Thread-safety: a single global mutex (`lock`) serializes metadata
 * operations; data reads deliberately bypass it (see the comments in
 * read()/read_random()).
 */
class BlueFS {
public:
  CephContext* cct;
  // Device slots; indices into bdev/ioc/block_all/alloc/... below.
  static constexpr unsigned MAX_BDEV = 3;
  static constexpr unsigned BDEV_WAL = 0;   ///< small, fast WAL device
  static constexpr unsigned BDEV_DB = 1;    ///< primary db device
  static constexpr unsigned BDEV_SLOW = 2;  ///< big, slow spill-over device

  /// values for FileWriter::writer_type: what kind of file the writer
  /// targets.  NOTE(review): presumably used to bucket the
  /// l_bluefs_{files,bytes}_written_{wal,sst} counters -- the counter
  /// updates live in the .cc, not visible here; confirm there.
  enum {
    WRITER_UNKNOWN,
    WRITER_WAL,
    WRITER_SST,
  };

  /// In-memory file: on-disk metadata plus open-handle bookkeeping.
  struct File : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    bluefs_fnode_t fnode;  ///< durable metadata (replayed from the journal)
    int refs;              ///< NOTE(review): appears to be the link count
                           ///< (see _drop_link) -- confirm in the .cc
    uint64_t dirty_seq;    ///< log seq when last dirtied; key into dirty_files
    bool locked;           ///< a FileLock is held (lock_file/unlock_file)
    bool deleted;          ///< unlinked; object lingers for open handles
    boost::intrusive::list_member_hook<> dirty_item;  ///< dirty_files linkage

    // Open-handle accounting, maintained by FileReader/FileWriter
    // ctors/dtors; the destructor asserts all of these are zero.
    std::atomic_int num_readers, num_writers;
    std::atomic_int num_reading;  ///< reads actually in flight right now

    File()
      : RefCountedObject(NULL, 0),
        refs(0),
        dirty_seq(0),
        locked(false),
        deleted(false),
        num_readers(0),
        num_writers(0),
        num_reading(0)
      {}
    ~File() override {
      assert(num_readers.load() == 0);
      assert(num_writers.load() == 0);
      assert(num_reading.load() == 0);
      assert(!locked);
    }

    friend void intrusive_ptr_add_ref(File *f) {
      f->get();
    }
    friend void intrusive_ptr_release(File *f) {
      f->put();
    }
  };
  typedef boost::intrusive_ptr<File> FileRef;

  /// intrusive list of Files linked through File::dirty_item; one such
  /// list per dirty_seq in `dirty_files` below.
  typedef boost::intrusive::list<
      File,
      boost::intrusive::member_hook<
        File,
	boost::intrusive::list_member_hook<>,
	&File::dirty_item> > dirty_file_list_t;

  /// In-memory directory: a flat name -> File map (no nesting visible
  /// at this layer; dirnames are plain strings).
  struct Dir : public RefCountedObject {
    MEMPOOL_CLASS_HELPERS();

    mempool::bluefs::map<string,FileRef> file_map;

    Dir() : RefCountedObject(NULL, 0) {}

    friend void intrusive_ptr_add_ref(Dir *d) {
      d->get();
    }
    friend void intrusive_ptr_release(Dir *d) {
      d->put();
    }
  };
  typedef boost::intrusive_ptr<Dir> DirRef;

  /// Buffered append-only write handle for one File.
  struct FileWriter {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    uint64_t pos;           ///< start offset for buffer
    bufferlist buffer;      ///< new data to write (at end of file)
    bufferlist tail_block;  ///< existing partial block at end of file, if any
    bufferlist::page_aligned_appender buffer_appender;  //< for const char* only
    int writer_type = 0;    ///< WRITER_*

    std::mutex lock;
    std::array<IOContext*,MAX_BDEV> iocv;  ///< for each bdev

    FileWriter(FileRef f)
      : file(f),
	pos(0),
	buffer_appender(buffer.get_page_aligned_appender(
			  g_conf->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
      ++file->num_writers;
      iocv.fill(nullptr);
    }
    // NOTE: caller must call BlueFS::close_writer()
    ~FileWriter() {
      --file->num_writers;
    }

    // note: BlueRocksEnv uses this append exclusively, so it's safe
    // to use buffer_appender exclusively here (e.g., it's notion of
    // offset will remain accurate).
    void append(const char *buf, size_t len) {
      buffer_appender.append(buf, len);
    }

    // note: used internally only, for ino 1 or 0.
    void append(bufferlist& bl) {
      buffer.claim_append(bl);
    }

    /// logical file offset the next append would land at (flushes the
    /// appender so buffer.length() is accurate)
    uint64_t get_effective_write_pos() {
      buffer_appender.flush();
      return pos + buffer.length();
    }
  };

  /// Read-ahead buffer state shared by sequential reads on one handle.
  struct FileReaderBuffer {
    MEMPOOL_CLASS_HELPERS();

    uint64_t bl_off;        ///< prefetch buffer logical offset
    bufferlist bl;          ///< prefetch buffer
    uint64_t pos;           ///< current logical offset
    uint64_t max_prefetch;  ///< max allowed prefetch

    explicit FileReaderBuffer(uint64_t mpf)
      : bl_off(0),
	pos(0),
	max_prefetch(mpf) {}

    /// logical offset just past the buffered data
    uint64_t get_buf_end() {
      return bl_off + bl.length();
    }
    /// bytes buffered at logical offset p, or 0 if p is outside the buffer
    uint64_t get_buf_remaining(uint64_t p) {
      if (p >= bl_off && p < bl_off + bl.length())
	return bl_off + bl.length() - p;
      return 0;
    }

    void skip(size_t n) {
      pos += n;
    }
    void seek(uint64_t offset) {
      pos = offset;
    }
  };

  /// Read handle for one File.
  struct FileReader {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    FileReaderBuffer buf;
    bool random;      ///< random-access handle: see _read_random vs _read
    bool ignore_eof;  ///< used when reading our log file

    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
      : file(f),
	buf(mpf),
	random(rand),
	ignore_eof(ie) {
      ++file->num_readers;
    }
    ~FileReader() {
      --file->num_readers;
    }
  };

  /// Opaque token returned by lock_file(); pass back to unlock_file().
  struct FileLock {
    MEMPOOL_CLASS_HELPERS();

    FileRef file;
    explicit FileLock(FileRef f) : file(f) {}
  };

private:
  std::mutex lock;  ///< global mutex serializing metadata mutations

  PerfCounters *logger = nullptr;

  // cache
  mempool::bluefs::map<string, DirRef> dir_map;              ///< dirname -> Dir
  mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File

  // map of dirty files, files of same dirty_seq are grouped into list.
  map<uint64_t, dirty_file_list_t> dirty_files;

  bluefs_super_t super;       ///< latest superblock (as last written)
  uint64_t ino_last = 0;      ///< last assigned ino (this one is in use)
  uint64_t log_seq = 0;       ///< last used log seq (by current pending log_t)
  uint64_t log_seq_stable = 0; ///< last stable/synced log seq
  FileWriter *log_writer = 0; ///< writer for the log
  bluefs_transaction_t log_t; ///< pending, unwritten log transaction
  bool log_flushing = false;  ///< true while flushing the log
  std::condition_variable log_cond;  ///< wakes waiters when a log flush lands

  // state for asynchronous log compaction (_compact_log_async);
  // NOTE(review): presumably the "jump to" offsets mark where replay
  // resumes after switching logs -- confirm against _replay in the .cc.
  uint64_t new_log_jump_to = 0;
  uint64_t old_log_jump_to = 0;
  FileRef new_log = nullptr;
  FileWriter *new_log_writer = nullptr;

  /*
   * There are up to 3 block devices:
   *
   *  BDEV_DB   db/      - the primary db device
   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
   */
  vector<BlockDevice*> bdev;                  ///< block devices we can use
  vector<IOContext*> ioc;                     ///< IOContexts for bdevs
  vector<interval_set<uint64_t> > block_all;  ///< extents in bdev we own
  vector<uint64_t> block_total;               ///< sum of block_all
  vector<Allocator*> alloc;                   ///< allocators for bdevs
  vector<interval_set<uint64_t>> pending_release; ///< extents to release

  void _init_logger();          ///< create/register the perf counters
  void _shutdown_logger();
  void _update_logger_stats();

  void _init_alloc();           ///< build per-bdev Allocators
  void _stop_alloc();

  void _pad_bl(bufferlist& bl);  ///< pad bufferlist to block size w/ zeros

  FileRef _get_file(uint64_t ino);  ///< look up (or create?) File by ino --
                                    ///< NOTE(review): confirm create-on-miss
  void _drop_link(FileRef f);       ///< drop one link; may mark file deleted

  /// allocate len bytes on bdev, appending the chosen extents to *ev
  int _allocate(uint8_t bdev, uint64_t len,
		mempool::bluefs::vector<bluefs_extent_t> *ev);
  int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
  int _flush(FileWriter *h, bool force);
  /// flush + sync; may drop and retake `l` while waiting on the log
  int _fsync(FileWriter *h, std::unique_lock<std::mutex>& l);

  void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
  void wait_for_aio(FileWriter *h);  // safe to call without a lock

  int _flush_and_sync_log(std::unique_lock<std::mutex>& l,
			  uint64_t want_seq = 0,
			  uint64_t jump_to = 0);
  uint64_t _estimate_log_size();
  bool _should_compact_log();
  void _compact_log_dump_metadata(bluefs_transaction_t *t);
  void _compact_log_sync();
  void _compact_log_async(std::unique_lock<std::mutex>& l);

  //void _aio_finish(void *priv);

  void _flush_bdev_safely(FileWriter *h);
  void flush_bdev();  // this is safe to call without a lock

  int _preallocate(FileRef f, uint64_t off, uint64_t len);
  int _truncate(FileWriter *h, uint64_t off);

  int _read(
    FileReader *h,         ///< [in] read from here
    FileReaderBuffer *buf, ///< [in] reader state
    uint64_t offset,       ///< [in] offset
    size_t len,            ///< [in] this many bytes
    bufferlist *outbl,     ///< [out] optional: reference the result here
    char *out);            ///< [out] optional: or copy it here
  int _read_random(
    FileReader *h,         ///< [in] read from here
    uint64_t offset,       ///< [in] offset
    size_t len,            ///< [in] this many bytes
    char *out);            ///< [out] optional: or copy it here

  void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);

  int _open_super();   ///< read + decode the superblock
  int _write_super();
  int _replay(bool noop);  ///< replay journal

  FileWriter *_create_writer(FileRef f);
  void _close_writer(FileWriter *h);

  // always put the super in the second 4k block.  FIXME should this be
  // block size independent?
  unsigned get_super_offset() {
    return 4096;
  }
  unsigned get_super_length() {
    return 4096;
  }

public:
  BlueFS(CephContext* cct);
  ~BlueFS();

  // the super is always stored on bdev 0
  int mkfs(uuid_d osd_uuid);
  int mount();
  void umount();

  void collect_metadata(map<string,string> *pm);
  int fsck();

  uint64_t get_fs_usage();
  uint64_t get_total(unsigned id);
  uint64_t get_free(unsigned id);
  void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...]
  void dump_perf_counters(Formatter *f);

  void dump_block_extents(ostream& out);

  /// get current extents that we own for given block device
  int get_block_extents(unsigned id, interval_set<uint64_t> *extents);

  int open_for_write(
    const string& dir,
    const string& file,
    FileWriter **h,
    bool overwrite);

  int open_for_read(
    const string& dir,
    const string& file,
    FileReader **h,
    bool random = false);

  void close_writer(FileWriter *h) {
    std::lock_guard<std::mutex> l(lock);
    _close_writer(h);
  }

  int rename(const string& old_dir, const string& old_file,
	     const string& new_dir, const string& new_file);

  int readdir(const string& dirname, vector<string> *ls);

  int unlink(const string& dirname, const string& filename);
  int mkdir(const string& dirname);
  int rmdir(const string& dirname);
  bool wal_is_rotational();

  bool dir_exists(const string& dirname);
  int stat(const string& dirname, const string& filename,
	   uint64_t *size, utime_t *mtime);

  int lock_file(const string& dirname, const string& filename, FileLock **p);
  int unlock_file(FileLock *l);

  void flush_log();
  void compact_log();

  /// sync any uncommitted state to disk
  void sync_metadata();

  int add_block_device(unsigned bdev, const string& path);
  bool bdev_support_label(unsigned id);
  uint64_t get_block_device_size(unsigned bdev);

  /// gift more block space
  void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);

  /// reclaim block space
  int reclaim_blocks(unsigned bdev, uint64_t want,
		     AllocExtentVector *extents);

  void flush(FileWriter *h) {
    std::lock_guard<std::mutex> l(lock);
    _flush(h, false);
  }
  void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
    std::lock_guard<std::mutex> l(lock);
    _flush_range(h, offset, length);
  }
  int fsync(FileWriter *h) {
    std::unique_lock<std::mutex> l(lock);
    return _fsync(h, l);
  }
  int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
	   bufferlist *outbl, char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read(h, buf, offset, len, outbl, out);
  }
  int read_random(FileReader *h, uint64_t offset, size_t len,
		  char *out) {
    // no need to hold the global lock here; we only touch h and
    // h->file, and read vs write or delete is already protected (via
    // atomics and asserts).
    return _read_random(h, offset, len, out);
  }
  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard<std::mutex> l(lock);
    _invalidate_cache(f, offset, len);
  }
  int preallocate(FileRef f, uint64_t offset, uint64_t len) {
    std::lock_guard<std::mutex> l(lock);
    return _preallocate(f, offset, len);
  }
  int truncate(FileWriter *h, uint64_t offset) {
    std::lock_guard<std::mutex> l(lock);
    return _truncate(h, offset);
  }

};
441 | ||
442 | #endif |