]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | #ifndef CEPH_OS_BLUESTORE_BLUEFS_H | |
4 | #define CEPH_OS_BLUESTORE_BLUEFS_H | |
5 | ||
6 | #include <atomic> | |
7 | #include <mutex> | |
8 | ||
9 | #include "bluefs_types.h" | |
10 | #include "common/RefCountedObj.h" | |
11 | #include "BlockDevice.h" | |
12 | ||
13 | #include "boost/intrusive/list.hpp" | |
14 | #include <boost/intrusive_ptr.hpp> | |
15 | ||
16 | class PerfCounters; | |
17 | ||
18 | class Allocator; | |
19 | ||
// Counter identifiers for BlueFS.  Values are consecutive: they start at
// l_bluefs_first and every following enumerator is one higher, so
// l_bluefs_last is one past the final real counter.
// (Presumably these are the indices registered with PerfCounters by
// _init_logger() - confirm in the implementation file.)
enum {
  l_bluefs_first = 732600,

  l_bluefs_gift_bytes,
  l_bluefs_reclaim_bytes,

  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,

  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,

  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,

  // referenced by BlueFS::max_bytes_pcounters
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,

  l_bluefs_last,
};
44 | ||
11fdf7f2 TL |
45 | class BlueFSDeviceExpander { |
46 | protected: | |
47 | ~BlueFSDeviceExpander() {} | |
48 | public: | |
49 | virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free, | |
50 | uint64_t bluefs_total) = 0; | |
51 | virtual int allocate_freespace( | |
52 | uint64_t min_size, | |
53 | uint64_t size, | |
54 | PExtentVector& extents) = 0; | |
55 | }; | |
56 | ||
7c673cae FG |
57 | class BlueFS { |
58 | public: | |
59 | CephContext* cct; | |
11fdf7f2 | 60 | static constexpr unsigned MAX_BDEV = 5; |
7c673cae FG |
61 | static constexpr unsigned BDEV_WAL = 0; |
62 | static constexpr unsigned BDEV_DB = 1; | |
63 | static constexpr unsigned BDEV_SLOW = 2; | |
11fdf7f2 TL |
64 | static constexpr unsigned BDEV_NEWWAL = 3; |
65 | static constexpr unsigned BDEV_NEWDB = 4; | |
7c673cae FG |
66 | |
67 | enum { | |
68 | WRITER_UNKNOWN, | |
69 | WRITER_WAL, | |
70 | WRITER_SST, | |
71 | }; | |
72 | ||
73 | struct File : public RefCountedObject { | |
74 | MEMPOOL_CLASS_HELPERS(); | |
75 | ||
76 | bluefs_fnode_t fnode; | |
77 | int refs; | |
78 | uint64_t dirty_seq; | |
79 | bool locked; | |
80 | bool deleted; | |
81 | boost::intrusive::list_member_hook<> dirty_item; | |
82 | ||
83 | std::atomic_int num_readers, num_writers; | |
84 | std::atomic_int num_reading; | |
85 | ||
86 | File() | |
87 | : RefCountedObject(NULL, 0), | |
88 | refs(0), | |
89 | dirty_seq(0), | |
90 | locked(false), | |
91 | deleted(false), | |
92 | num_readers(0), | |
93 | num_writers(0), | |
94 | num_reading(0) | |
95 | {} | |
96 | ~File() override { | |
11fdf7f2 TL |
97 | ceph_assert(num_readers.load() == 0); |
98 | ceph_assert(num_writers.load() == 0); | |
99 | ceph_assert(num_reading.load() == 0); | |
100 | ceph_assert(!locked); | |
7c673cae FG |
101 | } |
102 | ||
103 | friend void intrusive_ptr_add_ref(File *f) { | |
104 | f->get(); | |
105 | } | |
106 | friend void intrusive_ptr_release(File *f) { | |
107 | f->put(); | |
108 | } | |
109 | }; | |
110 | typedef boost::intrusive_ptr<File> FileRef; | |
111 | ||
112 | typedef boost::intrusive::list< | |
113 | File, | |
114 | boost::intrusive::member_hook< | |
115 | File, | |
116 | boost::intrusive::list_member_hook<>, | |
117 | &File::dirty_item> > dirty_file_list_t; | |
118 | ||
119 | struct Dir : public RefCountedObject { | |
120 | MEMPOOL_CLASS_HELPERS(); | |
121 | ||
122 | mempool::bluefs::map<string,FileRef> file_map; | |
123 | ||
124 | Dir() : RefCountedObject(NULL, 0) {} | |
125 | ||
126 | friend void intrusive_ptr_add_ref(Dir *d) { | |
127 | d->get(); | |
128 | } | |
129 | friend void intrusive_ptr_release(Dir *d) { | |
130 | d->put(); | |
131 | } | |
132 | }; | |
133 | typedef boost::intrusive_ptr<Dir> DirRef; | |
134 | ||
135 | struct FileWriter { | |
136 | MEMPOOL_CLASS_HELPERS(); | |
137 | ||
138 | FileRef file; | |
139 | uint64_t pos; ///< start offset for buffer | |
140 | bufferlist buffer; ///< new data to write (at end of file) | |
141 | bufferlist tail_block; ///< existing partial block at end of file, if any | |
142 | bufferlist::page_aligned_appender buffer_appender; //< for const char* only | |
143 | int writer_type = 0; ///< WRITER_* | |
11fdf7f2 | 144 | int write_hint = WRITE_LIFE_NOT_SET; |
7c673cae | 145 | |
11fdf7f2 | 146 | ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock"); |
7c673cae | 147 | std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev |
11fdf7f2 | 148 | std::array<bool, MAX_BDEV> dirty_devs; |
7c673cae FG |
149 | |
150 | FileWriter(FileRef f) | |
151 | : file(f), | |
152 | pos(0), | |
153 | buffer_appender(buffer.get_page_aligned_appender( | |
11fdf7f2 | 154 | g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { |
7c673cae FG |
155 | ++file->num_writers; |
156 | iocv.fill(nullptr); | |
11fdf7f2 TL |
157 | dirty_devs.fill(false); |
158 | if (f->fnode.ino == 1) { | |
159 | write_hint = WRITE_LIFE_MEDIUM; | |
160 | } | |
7c673cae FG |
161 | } |
162 | // NOTE: caller must call BlueFS::close_writer() | |
163 | ~FileWriter() { | |
164 | --file->num_writers; | |
165 | } | |
166 | ||
167 | // note: BlueRocksEnv uses this append exclusively, so it's safe | |
168 | // to use buffer_appender exclusively here (e.g., it's notion of | |
169 | // offset will remain accurate). | |
170 | void append(const char *buf, size_t len) { | |
171 | buffer_appender.append(buf, len); | |
172 | } | |
173 | ||
174 | // note: used internally only, for ino 1 or 0. | |
175 | void append(bufferlist& bl) { | |
176 | buffer.claim_append(bl); | |
177 | } | |
178 | ||
179 | uint64_t get_effective_write_pos() { | |
180 | buffer_appender.flush(); | |
181 | return pos + buffer.length(); | |
182 | } | |
183 | }; | |
184 | ||
185 | struct FileReaderBuffer { | |
186 | MEMPOOL_CLASS_HELPERS(); | |
187 | ||
188 | uint64_t bl_off; ///< prefetch buffer logical offset | |
189 | bufferlist bl; ///< prefetch buffer | |
190 | uint64_t pos; ///< current logical offset | |
191 | uint64_t max_prefetch; ///< max allowed prefetch | |
192 | ||
193 | explicit FileReaderBuffer(uint64_t mpf) | |
194 | : bl_off(0), | |
195 | pos(0), | |
196 | max_prefetch(mpf) {} | |
197 | ||
198 | uint64_t get_buf_end() { | |
199 | return bl_off + bl.length(); | |
200 | } | |
201 | uint64_t get_buf_remaining(uint64_t p) { | |
202 | if (p >= bl_off && p < bl_off + bl.length()) | |
203 | return bl_off + bl.length() - p; | |
204 | return 0; | |
205 | } | |
206 | ||
207 | void skip(size_t n) { | |
208 | pos += n; | |
209 | } | |
210 | void seek(uint64_t offset) { | |
211 | pos = offset; | |
212 | } | |
213 | }; | |
214 | ||
215 | struct FileReader { | |
216 | MEMPOOL_CLASS_HELPERS(); | |
217 | ||
218 | FileRef file; | |
219 | FileReaderBuffer buf; | |
220 | bool random; | |
221 | bool ignore_eof; ///< used when reading our log file | |
222 | ||
223 | FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) | |
224 | : file(f), | |
225 | buf(mpf), | |
226 | random(rand), | |
227 | ignore_eof(ie) { | |
228 | ++file->num_readers; | |
229 | } | |
230 | ~FileReader() { | |
231 | --file->num_readers; | |
232 | } | |
233 | }; | |
234 | ||
235 | struct FileLock { | |
236 | MEMPOOL_CLASS_HELPERS(); | |
237 | ||
238 | FileRef file; | |
239 | explicit FileLock(FileRef f) : file(f) {} | |
240 | }; | |
241 | ||
242 | private: | |
11fdf7f2 | 243 | ceph::mutex lock = ceph::make_mutex("BlueFS::lock"); |
7c673cae FG |
244 | |
245 | PerfCounters *logger = nullptr; | |
246 | ||
11fdf7f2 TL |
247 | uint64_t max_bytes[MAX_BDEV] = {0}; |
248 | uint64_t max_bytes_pcounters[MAX_BDEV] = { | |
249 | l_bluefs_max_bytes_wal, | |
250 | l_bluefs_max_bytes_db, | |
251 | l_bluefs_max_bytes_slow, | |
252 | }; | |
253 | ||
7c673cae FG |
254 | // cache |
255 | mempool::bluefs::map<string, DirRef> dir_map; ///< dirname -> Dir | |
256 | mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File | |
257 | ||
258 | // map of dirty files, files of same dirty_seq are grouped into list. | |
259 | map<uint64_t, dirty_file_list_t> dirty_files; | |
260 | ||
261 | bluefs_super_t super; ///< latest superblock (as last written) | |
262 | uint64_t ino_last = 0; ///< last assigned ino (this one is in use) | |
263 | uint64_t log_seq = 0; ///< last used log seq (by current pending log_t) | |
264 | uint64_t log_seq_stable = 0; ///< last stable/synced log seq | |
265 | FileWriter *log_writer = 0; ///< writer for the log | |
266 | bluefs_transaction_t log_t; ///< pending, unwritten log transaction | |
267 | bool log_flushing = false; ///< true while flushing the log | |
11fdf7f2 | 268 | ceph::condition_variable log_cond; |
7c673cae FG |
269 | |
270 | uint64_t new_log_jump_to = 0; | |
271 | uint64_t old_log_jump_to = 0; | |
272 | FileRef new_log = nullptr; | |
273 | FileWriter *new_log_writer = nullptr; | |
274 | ||
275 | /* | |
276 | * There are up to 3 block devices: | |
277 | * | |
278 | * BDEV_DB db/ - the primary db device | |
279 | * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL | |
280 | * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills | |
281 | */ | |
282 | vector<BlockDevice*> bdev; ///< block devices we can use | |
283 | vector<IOContext*> ioc; ///< IOContexts for bdevs | |
284 | vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own | |
7c673cae FG |
285 | vector<Allocator*> alloc; ///< allocators for bdevs |
286 | vector<interval_set<uint64_t>> pending_release; ///< extents to release | |
287 | ||
11fdf7f2 TL |
288 | BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev |
289 | ||
290 | BlueFSDeviceExpander* slow_dev_expander = nullptr; | |
291 | ||
7c673cae FG |
292 | void _init_logger(); |
293 | void _shutdown_logger(); | |
294 | void _update_logger_stats(); | |
295 | ||
296 | void _init_alloc(); | |
297 | void _stop_alloc(); | |
298 | ||
299 | void _pad_bl(bufferlist& bl); ///< pad bufferlist to block size w/ zeros | |
300 | ||
301 | FileRef _get_file(uint64_t ino); | |
302 | void _drop_link(FileRef f); | |
303 | ||
11fdf7f2 TL |
304 | int _get_slow_device_id() { return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; } |
305 | int _expand_slow_device(uint64_t min_size, PExtentVector& extents); | |
7c673cae | 306 | int _allocate(uint8_t bdev, uint64_t len, |
94b18763 | 307 | bluefs_fnode_t* node); |
11fdf7f2 TL |
308 | int _allocate_without_fallback(uint8_t id, uint64_t len, |
309 | PExtentVector* extents); | |
310 | ||
7c673cae FG |
311 | int _flush_range(FileWriter *h, uint64_t offset, uint64_t length); |
312 | int _flush(FileWriter *h, bool force); | |
11fdf7f2 | 313 | int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l); |
7c673cae | 314 | |
11fdf7f2 | 315 | #ifdef HAVE_LIBAIO |
7c673cae FG |
316 | void _claim_completed_aios(FileWriter *h, list<aio_t> *ls); |
317 | void wait_for_aio(FileWriter *h); // safe to call without a lock | |
11fdf7f2 | 318 | #endif |
7c673cae | 319 | |
11fdf7f2 | 320 | int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l, |
7c673cae FG |
321 | uint64_t want_seq = 0, |
322 | uint64_t jump_to = 0); | |
323 | uint64_t _estimate_log_size(); | |
324 | bool _should_compact_log(); | |
11fdf7f2 TL |
325 | |
326 | enum { | |
327 | REMOVE_DB = 1, | |
328 | REMOVE_WAL = 2, | |
329 | RENAME_SLOW2DB = 4, | |
330 | RENAME_DB2SLOW = 8, | |
331 | }; | |
332 | void _compact_log_dump_metadata(bluefs_transaction_t *t, | |
333 | int flags); | |
7c673cae | 334 | void _compact_log_sync(); |
11fdf7f2 TL |
335 | void _compact_log_async(std::unique_lock<ceph::mutex>& l); |
336 | ||
337 | void _rewrite_log_sync(bool allocate_with_fallback, | |
338 | int super_dev, | |
339 | int log_dev, | |
340 | int new_log_dev, | |
341 | int flags); | |
7c673cae FG |
342 | |
343 | //void _aio_finish(void *priv); | |
344 | ||
345 | void _flush_bdev_safely(FileWriter *h); | |
346 | void flush_bdev(); // this is safe to call without a lock | |
11fdf7f2 | 347 | void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock |
7c673cae FG |
348 | |
349 | int _preallocate(FileRef f, uint64_t off, uint64_t len); | |
350 | int _truncate(FileWriter *h, uint64_t off); | |
351 | ||
352 | int _read( | |
353 | FileReader *h, ///< [in] read from here | |
354 | FileReaderBuffer *buf, ///< [in] reader state | |
355 | uint64_t offset, ///< [in] offset | |
356 | size_t len, ///< [in] this many bytes | |
357 | bufferlist *outbl, ///< [out] optional: reference the result here | |
358 | char *out); ///< [out] optional: or copy it here | |
359 | int _read_random( | |
360 | FileReader *h, ///< [in] read from here | |
361 | uint64_t offset, ///< [in] offset | |
362 | size_t len, ///< [in] this many bytes | |
363 | char *out); ///< [out] optional: or copy it here | |
364 | ||
365 | void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length); | |
366 | ||
367 | int _open_super(); | |
11fdf7f2 TL |
368 | int _write_super(int dev); |
369 | int _replay(bool noop, bool to_stdout = false); ///< replay journal | |
7c673cae FG |
370 | |
371 | FileWriter *_create_writer(FileRef f); | |
372 | void _close_writer(FileWriter *h); | |
373 | ||
// The superblock always goes into the second 4k block: it starts at byte
// 4096 and is 4096 bytes long.
// FIXME should this be block size independent?
unsigned get_super_offset() {
  constexpr unsigned second_block_start = 4096;
  return second_block_start;
}
unsigned get_super_length() {
  constexpr unsigned super_block_bytes = 4096;
  return super_block_bytes;
}
382 | ||
11fdf7f2 TL |
// Worker behind the public add_block_extent(), which calls it while
// holding `lock`.
void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);
384 | ||
7c673cae FG |
385 | public: |
386 | BlueFS(CephContext* cct); | |
387 | ~BlueFS(); | |
388 | ||
389 | // the super is always stored on bdev 0 | |
390 | int mkfs(uuid_d osd_uuid); | |
391 | int mount(); | |
392 | void umount(); | |
11fdf7f2 TL |
393 | int prepare_new_device(int id); |
394 | ||
395 | int log_dump(); | |
7c673cae | 396 | |
11fdf7f2 TL |
397 | void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id); |
398 | void get_devices(set<string> *ls); | |
7c673cae FG |
399 | int fsck(); |
400 | ||
11fdf7f2 TL |
401 | int device_migrate_to_new( |
402 | CephContext *cct, | |
403 | const set<int>& devs_source, | |
404 | int dev_target); | |
405 | int device_migrate_to_existing( | |
406 | CephContext *cct, | |
407 | const set<int>& devs_source, | |
408 | int dev_target); | |
409 | ||
410 | uint64_t get_used(); | |
7c673cae FG |
411 | uint64_t get_total(unsigned id); |
412 | uint64_t get_free(unsigned id); | |
413 | void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...] | |
414 | void dump_perf_counters(Formatter *f); | |
415 | ||
3efd9988 FG |
416 | void dump_block_extents(ostream& out); |
417 | ||
7c673cae FG |
418 | /// get current extents that we own for given block device |
419 | int get_block_extents(unsigned id, interval_set<uint64_t> *extents); | |
420 | ||
421 | int open_for_write( | |
422 | const string& dir, | |
423 | const string& file, | |
424 | FileWriter **h, | |
425 | bool overwrite); | |
426 | ||
427 | int open_for_read( | |
428 | const string& dir, | |
429 | const string& file, | |
430 | FileReader **h, | |
431 | bool random = false); | |
432 | ||
433 | void close_writer(FileWriter *h) { | |
11fdf7f2 | 434 | std::lock_guard l(lock); |
7c673cae FG |
435 | _close_writer(h); |
436 | } | |
437 | ||
438 | int rename(const string& old_dir, const string& old_file, | |
439 | const string& new_dir, const string& new_file); | |
440 | ||
441 | int readdir(const string& dirname, vector<string> *ls); | |
442 | ||
443 | int unlink(const string& dirname, const string& filename); | |
444 | int mkdir(const string& dirname); | |
445 | int rmdir(const string& dirname); | |
d2e6a577 | 446 | bool wal_is_rotational(); |
7c673cae FG |
447 | |
448 | bool dir_exists(const string& dirname); | |
449 | int stat(const string& dirname, const string& filename, | |
450 | uint64_t *size, utime_t *mtime); | |
451 | ||
452 | int lock_file(const string& dirname, const string& filename, FileLock **p); | |
453 | int unlock_file(FileLock *l); | |
454 | ||
455 | void flush_log(); | |
456 | void compact_log(); | |
457 | ||
458 | /// sync any uncommitted state to disk | |
459 | void sync_metadata(); | |
460 | ||
11fdf7f2 TL |
461 | void set_slow_device_expander(BlueFSDeviceExpander* a) { |
462 | slow_dev_expander = a; | |
463 | } | |
464 | int add_block_device(unsigned bdev, const string& path, bool trim, | |
465 | bool shared_with_bluestore=false); | |
7c673cae FG |
466 | bool bdev_support_label(unsigned id); |
467 | uint64_t get_block_device_size(unsigned bdev); | |
468 | ||
469 | /// gift more block space | |
11fdf7f2 TL |
470 | void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len) { |
471 | std::unique_lock l(lock); | |
472 | _add_block_extent(bdev, offset, len); | |
473 | int r = _flush_and_sync_log(l); | |
474 | ceph_assert(r == 0); | |
475 | } | |
7c673cae FG |
476 | |
477 | /// reclaim block space | |
478 | int reclaim_blocks(unsigned bdev, uint64_t want, | |
a8e16298 | 479 | PExtentVector *extents); |
7c673cae | 480 | |
11fdf7f2 TL |
481 | // handler for discard event |
482 | void handle_discard(unsigned dev, interval_set<uint64_t>& to_release); | |
483 | ||
7c673cae | 484 | void flush(FileWriter *h) { |
11fdf7f2 | 485 | std::lock_guard l(lock); |
7c673cae FG |
486 | _flush(h, false); |
487 | } | |
488 | void flush_range(FileWriter *h, uint64_t offset, uint64_t length) { | |
11fdf7f2 | 489 | std::lock_guard l(lock); |
7c673cae FG |
490 | _flush_range(h, offset, length); |
491 | } | |
492 | int fsync(FileWriter *h) { | |
11fdf7f2 | 493 | std::unique_lock l(lock); |
7c673cae FG |
494 | return _fsync(h, l); |
495 | } | |
496 | int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len, | |
497 | bufferlist *outbl, char *out) { | |
498 | // no need to hold the global lock here; we only touch h and | |
499 | // h->file, and read vs write or delete is already protected (via | |
500 | // atomics and asserts). | |
501 | return _read(h, buf, offset, len, outbl, out); | |
502 | } | |
503 | int read_random(FileReader *h, uint64_t offset, size_t len, | |
504 | char *out) { | |
505 | // no need to hold the global lock here; we only touch h and | |
506 | // h->file, and read vs write or delete is already protected (via | |
507 | // atomics and asserts). | |
508 | return _read_random(h, offset, len, out); | |
509 | } | |
510 | void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) { | |
11fdf7f2 | 511 | std::lock_guard l(lock); |
7c673cae FG |
512 | _invalidate_cache(f, offset, len); |
513 | } | |
514 | int preallocate(FileRef f, uint64_t offset, uint64_t len) { | |
11fdf7f2 | 515 | std::lock_guard l(lock); |
7c673cae FG |
516 | return _preallocate(f, offset, len); |
517 | } | |
518 | int truncate(FileWriter *h, uint64_t offset) { | |
11fdf7f2 | 519 | std::lock_guard l(lock); |
7c673cae FG |
520 | return _truncate(h, offset); |
521 | } | |
522 | ||
523 | }; | |
524 | ||
525 | #endif |