]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | #ifndef CEPH_OS_BLUESTORE_BLUEFS_H | |
4 | #define CEPH_OS_BLUESTORE_BLUEFS_H | |
5 | ||
6 | #include <atomic> | |
7 | #include <mutex> | |
8 | ||
9 | #include "bluefs_types.h" | |
7c673cae FG |
10 | #include "BlockDevice.h" |
11 | ||
9f95a23c TL |
12 | #include "common/RefCountedObj.h" |
13 | #include "common/ceph_context.h" | |
14 | #include "global/global_context.h" | |
15 | #include "include/common_fwd.h" | |
7c673cae | 16 | |
9f95a23c TL |
17 | #include "boost/intrusive/list.hpp" |
18 | #include "boost/dynamic_bitset.hpp" | |
7c673cae FG |
19 | |
20 | class Allocator; | |
21 | ||
// Identifiers of the BlueFS counters (presumably registered with the
// PerfCounters logger -- confirm in the implementation file).
// Enumerators are numbered consecutively starting at l_bluefs_first, so
// new entries must be added just before l_bluefs_last.
enum {
  l_bluefs_first = 732600,

  // gift / reclaim
  l_bluefs_gift_bytes,
  l_bluefs_reclaim_bytes,

  // db / wal / slow totals and usage
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,

  // files and log
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,

  // write statistics
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,

  // max bytes per device (see BlueFS::max_bytes_pcounters)
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,

  // read statistics
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,

  l_bluefs_last,
};
57 | ||
11fdf7f2 TL |
58 | class BlueFSDeviceExpander { |
59 | protected: | |
60 | ~BlueFSDeviceExpander() {} | |
61 | public: | |
62 | virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free, | |
63 | uint64_t bluefs_total) = 0; | |
64 | virtual int allocate_freespace( | |
65 | uint64_t min_size, | |
66 | uint64_t size, | |
67 | PExtentVector& extents) = 0; | |
eafe8130 TL |
68 | /** Reports amount of space that can be transferred to BlueFS. |
69 | * This gives either current state, when alloc_size is currently used | |
70 | * BlueFS's size, or simulation when alloc_size is different. | |
71 | * @params | |
72 | * alloc_size - allocation unit size to check | |
73 | */ | |
9f95a23c TL |
74 | virtual uint64_t available_freespace(uint64_t alloc_size) = 0; |
75 | }; | |
76 | ||
77 | class BlueFSVolumeSelector { | |
78 | public: | |
79 | typedef std::vector<std::pair<std::string, uint64_t>> paths; | |
80 | ||
81 | virtual ~BlueFSVolumeSelector() { | |
82 | } | |
f6b5b4d7 TL |
83 | virtual void* get_hint_for_log() const = 0; |
84 | virtual void* get_hint_by_dir(const std::string& dirname) const = 0; | |
9f95a23c TL |
85 | |
86 | virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
87 | virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
88 | virtual void add_usage(void* file_hint, uint64_t fsize) = 0; | |
89 | virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; | |
90 | virtual uint8_t select_prefer_bdev(void* hint) = 0; | |
91 | virtual void get_paths(const std::string& base, paths& res) const = 0; | |
92 | virtual void dump(ostream& sout) = 0; | |
11fdf7f2 | 93 | }; |
9f95a23c | 94 | class BlueFS; |
11fdf7f2 | 95 | |
7c673cae FG |
96 | class BlueFS { |
97 | public: | |
98 | CephContext* cct; | |
11fdf7f2 | 99 | static constexpr unsigned MAX_BDEV = 5; |
7c673cae FG |
100 | static constexpr unsigned BDEV_WAL = 0; |
101 | static constexpr unsigned BDEV_DB = 1; | |
102 | static constexpr unsigned BDEV_SLOW = 2; | |
11fdf7f2 TL |
103 | static constexpr unsigned BDEV_NEWWAL = 3; |
104 | static constexpr unsigned BDEV_NEWDB = 4; | |
7c673cae FG |
105 | |
106 | enum { | |
107 | WRITER_UNKNOWN, | |
108 | WRITER_WAL, | |
109 | WRITER_SST, | |
110 | }; | |
111 | ||
112 | struct File : public RefCountedObject { | |
113 | MEMPOOL_CLASS_HELPERS(); | |
114 | ||
115 | bluefs_fnode_t fnode; | |
116 | int refs; | |
117 | uint64_t dirty_seq; | |
118 | bool locked; | |
119 | bool deleted; | |
120 | boost::intrusive::list_member_hook<> dirty_item; | |
121 | ||
122 | std::atomic_int num_readers, num_writers; | |
123 | std::atomic_int num_reading; | |
124 | ||
9f95a23c TL |
125 | void* vselector_hint = nullptr; |
126 | ||
127 | private: | |
128 | FRIEND_MAKE_REF(File); | |
7c673cae | 129 | File() |
9f95a23c | 130 | : |
7c673cae FG |
131 | refs(0), |
132 | dirty_seq(0), | |
133 | locked(false), | |
134 | deleted(false), | |
135 | num_readers(0), | |
136 | num_writers(0), | |
9f95a23c TL |
137 | num_reading(0), |
138 | vselector_hint(nullptr) | |
7c673cae FG |
139 | {} |
140 | ~File() override { | |
11fdf7f2 TL |
141 | ceph_assert(num_readers.load() == 0); |
142 | ceph_assert(num_writers.load() == 0); | |
143 | ceph_assert(num_reading.load() == 0); | |
144 | ceph_assert(!locked); | |
7c673cae | 145 | } |
7c673cae | 146 | }; |
9f95a23c | 147 | using FileRef = ceph::ref_t<File>; |
7c673cae FG |
148 | |
149 | typedef boost::intrusive::list< | |
150 | File, | |
151 | boost::intrusive::member_hook< | |
152 | File, | |
153 | boost::intrusive::list_member_hook<>, | |
154 | &File::dirty_item> > dirty_file_list_t; | |
155 | ||
156 | struct Dir : public RefCountedObject { | |
157 | MEMPOOL_CLASS_HELPERS(); | |
158 | ||
159 | mempool::bluefs::map<string,FileRef> file_map; | |
160 | ||
9f95a23c TL |
161 | private: |
162 | FRIEND_MAKE_REF(Dir); | |
163 | Dir() = default; | |
7c673cae | 164 | }; |
9f95a23c | 165 | using DirRef = ceph::ref_t<Dir>; |
7c673cae FG |
166 | |
167 | struct FileWriter { | |
168 | MEMPOOL_CLASS_HELPERS(); | |
169 | ||
170 | FileRef file; | |
9f95a23c | 171 | uint64_t pos = 0; ///< start offset for buffer |
7c673cae FG |
172 | bufferlist buffer; ///< new data to write (at end of file) |
173 | bufferlist tail_block; ///< existing partial block at end of file, if any | |
174 | bufferlist::page_aligned_appender buffer_appender; //< for const char* only | |
175 | int writer_type = 0; ///< WRITER_* | |
11fdf7f2 | 176 | int write_hint = WRITE_LIFE_NOT_SET; |
7c673cae | 177 | |
11fdf7f2 | 178 | ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock"); |
7c673cae | 179 | std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev |
11fdf7f2 | 180 | std::array<bool, MAX_BDEV> dirty_devs; |
7c673cae FG |
181 | |
182 | FileWriter(FileRef f) | |
9f95a23c | 183 | : file(std::move(f)), |
7c673cae | 184 | buffer_appender(buffer.get_page_aligned_appender( |
11fdf7f2 | 185 | g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { |
7c673cae FG |
186 | ++file->num_writers; |
187 | iocv.fill(nullptr); | |
11fdf7f2 | 188 | dirty_devs.fill(false); |
9f95a23c | 189 | if (file->fnode.ino == 1) { |
11fdf7f2 TL |
190 | write_hint = WRITE_LIFE_MEDIUM; |
191 | } | |
7c673cae FG |
192 | } |
193 | // NOTE: caller must call BlueFS::close_writer() | |
194 | ~FileWriter() { | |
195 | --file->num_writers; | |
196 | } | |
197 | ||
198 | // note: BlueRocksEnv uses this append exclusively, so it's safe | |
199 | // to use buffer_appender exclusively here (e.g., it's notion of | |
200 | // offset will remain accurate). | |
201 | void append(const char *buf, size_t len) { | |
202 | buffer_appender.append(buf, len); | |
203 | } | |
204 | ||
205 | // note: used internally only, for ino 1 or 0. | |
206 | void append(bufferlist& bl) { | |
207 | buffer.claim_append(bl); | |
208 | } | |
209 | ||
210 | uint64_t get_effective_write_pos() { | |
211 | buffer_appender.flush(); | |
212 | return pos + buffer.length(); | |
213 | } | |
214 | }; | |
215 | ||
216 | struct FileReaderBuffer { | |
217 | MEMPOOL_CLASS_HELPERS(); | |
218 | ||
9f95a23c | 219 | uint64_t bl_off = 0; ///< prefetch buffer logical offset |
7c673cae | 220 | bufferlist bl; ///< prefetch buffer |
9f95a23c | 221 | uint64_t pos = 0; ///< current logical offset |
7c673cae FG |
222 | uint64_t max_prefetch; ///< max allowed prefetch |
223 | ||
224 | explicit FileReaderBuffer(uint64_t mpf) | |
9f95a23c | 225 | : max_prefetch(mpf) {} |
7c673cae | 226 | |
9f95a23c | 227 | uint64_t get_buf_end() const { |
7c673cae FG |
228 | return bl_off + bl.length(); |
229 | } | |
9f95a23c | 230 | uint64_t get_buf_remaining(uint64_t p) const { |
7c673cae FG |
231 | if (p >= bl_off && p < bl_off + bl.length()) |
232 | return bl_off + bl.length() - p; | |
233 | return 0; | |
234 | } | |
235 | ||
236 | void skip(size_t n) { | |
237 | pos += n; | |
238 | } | |
239 | void seek(uint64_t offset) { | |
240 | pos = offset; | |
241 | } | |
242 | }; | |
243 | ||
244 | struct FileReader { | |
245 | MEMPOOL_CLASS_HELPERS(); | |
246 | ||
247 | FileRef file; | |
248 | FileReaderBuffer buf; | |
249 | bool random; | |
250 | bool ignore_eof; ///< used when reading our log file | |
251 | ||
494da23a TL |
252 | ceph::shared_mutex lock { |
253 | ceph::make_shared_mutex(std::string(), false, false, false) | |
254 | }; | |
255 | ||
256 | ||
7c673cae FG |
257 | FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) |
258 | : file(f), | |
259 | buf(mpf), | |
260 | random(rand), | |
261 | ignore_eof(ie) { | |
262 | ++file->num_readers; | |
263 | } | |
264 | ~FileReader() { | |
265 | --file->num_readers; | |
266 | } | |
267 | }; | |
268 | ||
269 | struct FileLock { | |
270 | MEMPOOL_CLASS_HELPERS(); | |
271 | ||
272 | FileRef file; | |
9f95a23c | 273 | explicit FileLock(FileRef f) : file(std::move(f)) {} |
7c673cae FG |
274 | }; |
275 | ||
276 | private: | |
11fdf7f2 | 277 | ceph::mutex lock = ceph::make_mutex("BlueFS::lock"); |
7c673cae FG |
278 | |
279 | PerfCounters *logger = nullptr; | |
280 | ||
11fdf7f2 TL |
281 | uint64_t max_bytes[MAX_BDEV] = {0}; |
282 | uint64_t max_bytes_pcounters[MAX_BDEV] = { | |
283 | l_bluefs_max_bytes_wal, | |
284 | l_bluefs_max_bytes_db, | |
285 | l_bluefs_max_bytes_slow, | |
286 | }; | |
287 | ||
7c673cae FG |
288 | // cache |
289 | mempool::bluefs::map<string, DirRef> dir_map; ///< dirname -> Dir | |
290 | mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File | |
291 | ||
292 | // map of dirty files, files of same dirty_seq are grouped into list. | |
293 | map<uint64_t, dirty_file_list_t> dirty_files; | |
294 | ||
295 | bluefs_super_t super; ///< latest superblock (as last written) | |
296 | uint64_t ino_last = 0; ///< last assigned ino (this one is in use) | |
297 | uint64_t log_seq = 0; ///< last used log seq (by current pending log_t) | |
298 | uint64_t log_seq_stable = 0; ///< last stable/synced log seq | |
299 | FileWriter *log_writer = 0; ///< writer for the log | |
300 | bluefs_transaction_t log_t; ///< pending, unwritten log transaction | |
301 | bool log_flushing = false; ///< true while flushing the log | |
11fdf7f2 | 302 | ceph::condition_variable log_cond; |
7c673cae FG |
303 | |
304 | uint64_t new_log_jump_to = 0; | |
305 | uint64_t old_log_jump_to = 0; | |
306 | FileRef new_log = nullptr; | |
307 | FileWriter *new_log_writer = nullptr; | |
308 | ||
309 | /* | |
310 | * There are up to 3 block devices: | |
311 | * | |
312 | * BDEV_DB db/ - the primary db device | |
313 | * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL | |
314 | * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills | |
315 | */ | |
316 | vector<BlockDevice*> bdev; ///< block devices we can use | |
317 | vector<IOContext*> ioc; ///< IOContexts for bdevs | |
318 | vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own | |
7c673cae | 319 | vector<Allocator*> alloc; ///< allocators for bdevs |
eafe8130 | 320 | vector<uint64_t> alloc_size; ///< alloc size for each device |
7c673cae | 321 | vector<interval_set<uint64_t>> pending_release; ///< extents to release |
9f95a23c | 322 | vector<interval_set<uint64_t>> block_unused_too_granular; |
7c673cae | 323 | |
11fdf7f2 TL |
324 | BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev |
325 | ||
326 | BlueFSDeviceExpander* slow_dev_expander = nullptr; | |
9f95a23c | 327 | std::unique_ptr<BlueFSVolumeSelector> vselector; |
11fdf7f2 | 328 | |
eafe8130 TL |
329 | class SocketHook; |
330 | SocketHook* asok_hook = nullptr; | |
331 | ||
7c673cae FG |
332 | void _init_logger(); |
333 | void _shutdown_logger(); | |
334 | void _update_logger_stats(); | |
335 | ||
336 | void _init_alloc(); | |
337 | void _stop_alloc(); | |
338 | ||
339 | void _pad_bl(bufferlist& bl); ///< pad bufferlist to block size w/ zeros | |
340 | ||
341 | FileRef _get_file(uint64_t ino); | |
342 | void _drop_link(FileRef f); | |
343 | ||
1911f103 TL |
344 | unsigned _get_slow_device_id() { |
345 | return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; | |
346 | } | |
eafe8130 | 347 | const char* get_device_name(unsigned id); |
11fdf7f2 | 348 | int _expand_slow_device(uint64_t min_size, PExtentVector& extents); |
7c673cae | 349 | int _allocate(uint8_t bdev, uint64_t len, |
94b18763 | 350 | bluefs_fnode_t* node); |
11fdf7f2 TL |
351 | int _allocate_without_fallback(uint8_t id, uint64_t len, |
352 | PExtentVector* extents); | |
353 | ||
7c673cae | 354 | int _flush_range(FileWriter *h, uint64_t offset, uint64_t length); |
f6b5b4d7 TL |
355 | int _flush(FileWriter *h, bool focce, std::unique_lock<ceph::mutex>& l); |
356 | int _flush(FileWriter *h, bool force, bool *flushed = nullptr); | |
11fdf7f2 | 357 | int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l); |
7c673cae | 358 | |
11fdf7f2 | 359 | #ifdef HAVE_LIBAIO |
7c673cae FG |
360 | void _claim_completed_aios(FileWriter *h, list<aio_t> *ls); |
361 | void wait_for_aio(FileWriter *h); // safe to call without a lock | |
11fdf7f2 | 362 | #endif |
7c673cae | 363 | |
11fdf7f2 | 364 | int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l, |
7c673cae FG |
365 | uint64_t want_seq = 0, |
366 | uint64_t jump_to = 0); | |
367 | uint64_t _estimate_log_size(); | |
368 | bool _should_compact_log(); | |
11fdf7f2 TL |
369 | |
370 | enum { | |
371 | REMOVE_DB = 1, | |
372 | REMOVE_WAL = 2, | |
373 | RENAME_SLOW2DB = 4, | |
374 | RENAME_DB2SLOW = 8, | |
375 | }; | |
376 | void _compact_log_dump_metadata(bluefs_transaction_t *t, | |
377 | int flags); | |
7c673cae | 378 | void _compact_log_sync(); |
11fdf7f2 TL |
379 | void _compact_log_async(std::unique_lock<ceph::mutex>& l); |
380 | ||
9f95a23c TL |
381 | void _rewrite_log_and_layout_sync(bool allocate_with_fallback, |
382 | int super_dev, | |
383 | int log_dev, | |
384 | int new_log_dev, | |
385 | int flags, | |
386 | std::optional<bluefs_layout_t> layout); | |
7c673cae FG |
387 | |
388 | //void _aio_finish(void *priv); | |
389 | ||
390 | void _flush_bdev_safely(FileWriter *h); | |
391 | void flush_bdev(); // this is safe to call without a lock | |
11fdf7f2 | 392 | void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock |
7c673cae FG |
393 | |
394 | int _preallocate(FileRef f, uint64_t off, uint64_t len); | |
395 | int _truncate(FileWriter *h, uint64_t off); | |
396 | ||
397 | int _read( | |
398 | FileReader *h, ///< [in] read from here | |
399 | FileReaderBuffer *buf, ///< [in] reader state | |
400 | uint64_t offset, ///< [in] offset | |
401 | size_t len, ///< [in] this many bytes | |
402 | bufferlist *outbl, ///< [out] optional: reference the result here | |
403 | char *out); ///< [out] optional: or copy it here | |
404 | int _read_random( | |
405 | FileReader *h, ///< [in] read from here | |
406 | uint64_t offset, ///< [in] offset | |
9f95a23c | 407 | uint64_t len, ///< [in] this many bytes |
7c673cae FG |
408 | char *out); ///< [out] optional: or copy it here |
409 | ||
410 | void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length); | |
411 | ||
412 | int _open_super(); | |
11fdf7f2 | 413 | int _write_super(int dev); |
9f95a23c TL |
414 | int _check_new_allocations(const bluefs_fnode_t& fnode, |
415 | size_t dev_count, | |
416 | boost::dynamic_bitset<uint64_t>* owned_blocks, | |
417 | boost::dynamic_bitset<uint64_t>* used_blocks); | |
418 | int _verify_alloc_granularity( | |
419 | __u8 id, uint64_t offset, uint64_t length, | |
420 | const char *op); | |
421 | int _adjust_granularity( | |
422 | __u8 id, uint64_t *offset, uint64_t *length, bool alloc); | |
11fdf7f2 | 423 | int _replay(bool noop, bool to_stdout = false); ///< replay journal |
7c673cae FG |
424 | |
425 | FileWriter *_create_writer(FileRef f); | |
426 | void _close_writer(FileWriter *h); | |
427 | ||
// always put the super in the second 4k block.  FIXME should this be
// block size independent?

/// Byte offset of the superblock (fixed at 4096).
unsigned get_super_offset() {
  const unsigned super_offset = 4096;
  return super_offset;
}

/// Length of the superblock in bytes (fixed at 4096).
unsigned get_super_length() {
  const unsigned super_length = 4096;
  return super_length;
}

/// Worker behind the public add_block_extent() wrapper, which calls it
/// while holding BlueFS::lock.
void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len,
                       bool skip=false);
11fdf7f2 | 439 | |
7c673cae FG |
440 | public: |
441 | BlueFS(CephContext* cct); | |
442 | ~BlueFS(); | |
443 | ||
444 | // the super is always stored on bdev 0 | |
9f95a23c | 445 | int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout); |
7c673cae | 446 | int mount(); |
9f95a23c | 447 | int maybe_verify_layout(const bluefs_layout_t& layout) const; |
1911f103 | 448 | void umount(bool avoid_compact = false); |
9f95a23c | 449 | int prepare_new_device(int id, const bluefs_layout_t& layout); |
11fdf7f2 TL |
450 | |
451 | int log_dump(); | |
7c673cae | 452 | |
11fdf7f2 TL |
453 | void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id); |
454 | void get_devices(set<string> *ls); | |
eafe8130 TL |
455 | uint64_t get_alloc_size(int id) { |
456 | return alloc_size[id]; | |
457 | } | |
7c673cae FG |
458 | int fsck(); |
459 | ||
11fdf7f2 TL |
460 | int device_migrate_to_new( |
461 | CephContext *cct, | |
462 | const set<int>& devs_source, | |
9f95a23c TL |
463 | int dev_target, |
464 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
465 | int device_migrate_to_existing( |
466 | CephContext *cct, | |
467 | const set<int>& devs_source, | |
9f95a23c TL |
468 | int dev_target, |
469 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
470 | |
471 | uint64_t get_used(); | |
7c673cae FG |
472 | uint64_t get_total(unsigned id); |
473 | uint64_t get_free(unsigned id); | |
474 | void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...] | |
475 | void dump_perf_counters(Formatter *f); | |
476 | ||
3efd9988 FG |
477 | void dump_block_extents(ostream& out); |
478 | ||
7c673cae FG |
479 | /// get current extents that we own for given block device |
480 | int get_block_extents(unsigned id, interval_set<uint64_t> *extents); | |
481 | ||
482 | int open_for_write( | |
483 | const string& dir, | |
484 | const string& file, | |
485 | FileWriter **h, | |
486 | bool overwrite); | |
487 | ||
488 | int open_for_read( | |
489 | const string& dir, | |
490 | const string& file, | |
491 | FileReader **h, | |
492 | bool random = false); | |
493 | ||
494 | void close_writer(FileWriter *h) { | |
11fdf7f2 | 495 | std::lock_guard l(lock); |
7c673cae FG |
496 | _close_writer(h); |
497 | } | |
498 | ||
499 | int rename(const string& old_dir, const string& old_file, | |
500 | const string& new_dir, const string& new_file); | |
501 | ||
502 | int readdir(const string& dirname, vector<string> *ls); | |
503 | ||
504 | int unlink(const string& dirname, const string& filename); | |
505 | int mkdir(const string& dirname); | |
506 | int rmdir(const string& dirname); | |
d2e6a577 | 507 | bool wal_is_rotational(); |
7c673cae FG |
508 | |
509 | bool dir_exists(const string& dirname); | |
510 | int stat(const string& dirname, const string& filename, | |
511 | uint64_t *size, utime_t *mtime); | |
512 | ||
513 | int lock_file(const string& dirname, const string& filename, FileLock **p); | |
514 | int unlock_file(FileLock *l); | |
515 | ||
7c673cae FG |
516 | void compact_log(); |
517 | ||
518 | /// sync any uncommitted state to disk | |
1911f103 | 519 | void sync_metadata(bool avoid_compact); |
f6b5b4d7 TL |
520 | /// test and compact log, if necessary |
521 | void _maybe_compact_log(std::unique_lock<ceph::mutex>& l); | |
7c673cae | 522 | |
11fdf7f2 TL |
523 | void set_slow_device_expander(BlueFSDeviceExpander* a) { |
524 | slow_dev_expander = a; | |
525 | } | |
9f95a23c TL |
526 | void set_volume_selector(BlueFSVolumeSelector* s) { |
527 | vselector.reset(s); | |
528 | } | |
529 | void dump_volume_selector(ostream& sout) { | |
530 | vselector->dump(sout); | |
531 | } | |
532 | void get_vselector_paths(const std::string& base, | |
533 | BlueFSVolumeSelector::paths& res) const { | |
534 | return vselector->get_paths(base, res); | |
535 | } | |
536 | ||
11fdf7f2 TL |
537 | int add_block_device(unsigned bdev, const string& path, bool trim, |
538 | bool shared_with_bluestore=false); | |
7c673cae FG |
539 | bool bdev_support_label(unsigned id); |
540 | uint64_t get_block_device_size(unsigned bdev); | |
541 | ||
542 | /// gift more block space | |
1911f103 TL |
543 | void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len, |
544 | bool skip=false) { | |
11fdf7f2 | 545 | std::unique_lock l(lock); |
1911f103 | 546 | _add_block_extent(bdev, offset, len, skip); |
11fdf7f2 TL |
547 | int r = _flush_and_sync_log(l); |
548 | ceph_assert(r == 0); | |
549 | } | |
7c673cae FG |
550 | |
551 | /// reclaim block space | |
552 | int reclaim_blocks(unsigned bdev, uint64_t want, | |
a8e16298 | 553 | PExtentVector *extents); |
7c673cae | 554 | |
11fdf7f2 TL |
555 | // handler for discard event |
556 | void handle_discard(unsigned dev, interval_set<uint64_t>& to_release); | |
557 | ||
f6b5b4d7 TL |
558 | void flush(FileWriter *h, bool force = false) { |
559 | std::unique_lock l(lock); | |
560 | int r = _flush(h, force, l); | |
561 | ceph_assert(r == 0); | |
7c673cae FG |
562 | } |
563 | void flush_range(FileWriter *h, uint64_t offset, uint64_t length) { | |
11fdf7f2 | 564 | std::lock_guard l(lock); |
7c673cae FG |
565 | _flush_range(h, offset, length); |
566 | } | |
567 | int fsync(FileWriter *h) { | |
11fdf7f2 | 568 | std::unique_lock l(lock); |
f6b5b4d7 TL |
569 | int r = _fsync(h, l); |
570 | _maybe_compact_log(l); | |
571 | return r; | |
7c673cae FG |
572 | } |
573 | int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len, | |
574 | bufferlist *outbl, char *out) { | |
575 | // no need to hold the global lock here; we only touch h and | |
576 | // h->file, and read vs write or delete is already protected (via | |
577 | // atomics and asserts). | |
578 | return _read(h, buf, offset, len, outbl, out); | |
579 | } | |
580 | int read_random(FileReader *h, uint64_t offset, size_t len, | |
581 | char *out) { | |
582 | // no need to hold the global lock here; we only touch h and | |
583 | // h->file, and read vs write or delete is already protected (via | |
584 | // atomics and asserts). | |
585 | return _read_random(h, offset, len, out); | |
586 | } | |
587 | void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) { | |
11fdf7f2 | 588 | std::lock_guard l(lock); |
7c673cae FG |
589 | _invalidate_cache(f, offset, len); |
590 | } | |
591 | int preallocate(FileRef f, uint64_t offset, uint64_t len) { | |
11fdf7f2 | 592 | std::lock_guard l(lock); |
7c673cae FG |
593 | return _preallocate(f, offset, len); |
594 | } | |
595 | int truncate(FileWriter *h, uint64_t offset) { | |
11fdf7f2 | 596 | std::lock_guard l(lock); |
7c673cae FG |
597 | return _truncate(h, offset); |
598 | } | |
f6b5b4d7 TL |
599 | int do_replay_recovery_read(FileReader *log, |
600 | size_t log_pos, | |
601 | size_t read_offset, | |
602 | size_t read_len, | |
603 | bufferlist* bl); | |
7c673cae | 604 | |
9f95a23c TL |
605 | /// test purpose methods |
606 | void debug_inject_duplicate_gift(unsigned bdev, uint64_t offset, uint64_t len); | |
607 | const PerfCounters* get_perf_counters() const { | |
608 | return logger; | |
609 | } | |
610 | }; | |
611 | ||
612 | class OriginalVolumeSelector : public BlueFSVolumeSelector { | |
613 | uint64_t wal_total; | |
614 | uint64_t db_total; | |
615 | uint64_t slow_total; | |
616 | ||
617 | public: | |
618 | OriginalVolumeSelector( | |
619 | uint64_t _wal_total, | |
620 | uint64_t _db_total, | |
621 | uint64_t _slow_total) | |
622 | : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {} | |
623 | ||
f6b5b4d7 TL |
624 | void* get_hint_for_log() const override; |
625 | void* get_hint_by_dir(const std::string& dirname) const override; | |
9f95a23c TL |
626 | |
627 | void add_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
628 | // do nothing | |
629 | return; | |
630 | } | |
631 | void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
632 | // do nothing | |
633 | return; | |
634 | } | |
635 | void add_usage(void* hint, uint64_t fsize) override { | |
636 | // do nothing | |
637 | return; | |
638 | } | |
639 | void sub_usage(void* hint, uint64_t fsize) override { | |
640 | // do nothing | |
641 | return; | |
642 | } | |
643 | ||
644 | uint8_t select_prefer_bdev(void* hint) override; | |
645 | void get_paths(const std::string& base, paths& res) const override; | |
646 | void dump(ostream& sout) override; | |
7c673cae FG |
647 | }; |
648 | ||
649 | #endif |