]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | #ifndef CEPH_OS_BLUESTORE_BLUEFS_H | |
4 | #define CEPH_OS_BLUESTORE_BLUEFS_H | |
5 | ||
6 | #include <atomic> | |
7 | #include <mutex> | |
cd265ab1 | 8 | #include <limits> |
7c673cae FG |
9 | |
10 | #include "bluefs_types.h" | |
f67539c2 | 11 | #include "blk/BlockDevice.h" |
7c673cae | 12 | |
9f95a23c TL |
13 | #include "common/RefCountedObj.h" |
14 | #include "common/ceph_context.h" | |
15 | #include "global/global_context.h" | |
16 | #include "include/common_fwd.h" | |
7c673cae | 17 | |
9f95a23c TL |
18 | #include "boost/intrusive/list.hpp" |
19 | #include "boost/dynamic_bitset.hpp" | |
7c673cae FG |
20 | |
21 | class Allocator; | |
22 | ||
// BlueFS perf counter ids.  Only l_bluefs_first has an explicit value;
// every following id is assigned sequentially, so the order of the
// entries below determines their numeric values.
enum {
  l_bluefs_first = 732600,

  // *_total_bytes / *_used_bytes pairs
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,

  // file / log counters
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,

  // write counters
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,

  // max_bytes_* counters
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,

  // read counters
  l_bluefs_read_random_count,
  l_bluefs_read_random_bytes,
  l_bluefs_read_random_disk_count,
  l_bluefs_read_random_disk_bytes,
  l_bluefs_read_random_buffer_count,
  l_bluefs_read_random_buffer_bytes,
  l_bluefs_read_count,
  l_bluefs_read_bytes,
  l_bluefs_read_prefetch_count,
  l_bluefs_read_prefetch_bytes,
  l_bluefs_read_zeros_candidate,
  l_bluefs_read_zeros_errors,

  l_bluefs_last,
};
58 | ||
9f95a23c TL |
59 | class BlueFSVolumeSelector { |
60 | public: | |
61 | typedef std::vector<std::pair<std::string, uint64_t>> paths; | |
62 | ||
63 | virtual ~BlueFSVolumeSelector() { | |
64 | } | |
f6b5b4d7 TL |
65 | virtual void* get_hint_for_log() const = 0; |
66 | virtual void* get_hint_by_dir(const std::string& dirname) const = 0; | |
9f95a23c TL |
67 | |
68 | virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
69 | virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; | |
70 | virtual void add_usage(void* file_hint, uint64_t fsize) = 0; | |
71 | virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; | |
72 | virtual uint8_t select_prefer_bdev(void* hint) = 0; | |
73 | virtual void get_paths(const std::string& base, paths& res) const = 0; | |
f67539c2 TL |
74 | virtual void dump(std::ostream& sout) = 0; |
75 | }; | |
76 | ||
77 | struct bluefs_shared_alloc_context_t { | |
78 | bool need_init = false; | |
79 | Allocator* a = nullptr; | |
80 | ||
81 | std::atomic<uint64_t> bluefs_used = 0; | |
82 | ||
83 | void set(Allocator* _a) { | |
84 | a = _a; | |
85 | need_init = true; | |
86 | bluefs_used = 0; | |
87 | } | |
88 | void reset() { | |
89 | a = nullptr; | |
90 | } | |
11fdf7f2 TL |
91 | }; |
92 | ||
7c673cae FG |
93 | class BlueFS { |
94 | public: | |
95 | CephContext* cct; | |
11fdf7f2 | 96 | static constexpr unsigned MAX_BDEV = 5; |
7c673cae FG |
97 | static constexpr unsigned BDEV_WAL = 0; |
98 | static constexpr unsigned BDEV_DB = 1; | |
99 | static constexpr unsigned BDEV_SLOW = 2; | |
11fdf7f2 TL |
100 | static constexpr unsigned BDEV_NEWWAL = 3; |
101 | static constexpr unsigned BDEV_NEWDB = 4; | |
7c673cae FG |
102 | |
// writer kinds; stored in FileWriter::writer_type
enum {
  WRITER_UNKNOWN = 0,
  WRITER_WAL = 1,
  WRITER_SST = 2,
};
108 | ||
109 | struct File : public RefCountedObject { | |
110 | MEMPOOL_CLASS_HELPERS(); | |
111 | ||
112 | bluefs_fnode_t fnode; | |
113 | int refs; | |
114 | uint64_t dirty_seq; | |
115 | bool locked; | |
116 | bool deleted; | |
117 | boost::intrusive::list_member_hook<> dirty_item; | |
118 | ||
119 | std::atomic_int num_readers, num_writers; | |
120 | std::atomic_int num_reading; | |
121 | ||
9f95a23c TL |
122 | void* vselector_hint = nullptr; |
123 | ||
124 | private: | |
125 | FRIEND_MAKE_REF(File); | |
7c673cae | 126 | File() |
9f95a23c | 127 | : |
7c673cae FG |
128 | refs(0), |
129 | dirty_seq(0), | |
130 | locked(false), | |
131 | deleted(false), | |
132 | num_readers(0), | |
133 | num_writers(0), | |
9f95a23c TL |
134 | num_reading(0), |
135 | vselector_hint(nullptr) | |
7c673cae FG |
136 | {} |
137 | ~File() override { | |
11fdf7f2 TL |
138 | ceph_assert(num_readers.load() == 0); |
139 | ceph_assert(num_writers.load() == 0); | |
140 | ceph_assert(num_reading.load() == 0); | |
141 | ceph_assert(!locked); | |
7c673cae | 142 | } |
7c673cae | 143 | }; |
9f95a23c | 144 | using FileRef = ceph::ref_t<File>; |
7c673cae FG |
145 | |
146 | typedef boost::intrusive::list< | |
147 | File, | |
148 | boost::intrusive::member_hook< | |
149 | File, | |
150 | boost::intrusive::list_member_hook<>, | |
151 | &File::dirty_item> > dirty_file_list_t; | |
152 | ||
153 | struct Dir : public RefCountedObject { | |
154 | MEMPOOL_CLASS_HELPERS(); | |
155 | ||
f67539c2 | 156 | mempool::bluefs::map<std::string,FileRef> file_map; |
7c673cae | 157 | |
9f95a23c TL |
158 | private: |
159 | FRIEND_MAKE_REF(Dir); | |
160 | Dir() = default; | |
7c673cae | 161 | }; |
9f95a23c | 162 | using DirRef = ceph::ref_t<Dir>; |
7c673cae FG |
163 | |
164 | struct FileWriter { | |
165 | MEMPOOL_CLASS_HELPERS(); | |
166 | ||
167 | FileRef file; | |
9f95a23c | 168 | uint64_t pos = 0; ///< start offset for buffer |
f67539c2 TL |
169 | private: |
170 | ceph::buffer::list buffer; ///< new data to write (at end of file) | |
171 | ceph::buffer::list tail_block; ///< existing partial block at end of file, if any | |
172 | public: | |
173 | unsigned get_buffer_length() const { | |
174 | return buffer.length(); | |
175 | } | |
176 | ceph::bufferlist flush_buffer( | |
177 | CephContext* cct, | |
178 | const bool partial, | |
179 | const unsigned length, | |
180 | const bluefs_super_t& super); | |
181 | ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only | |
182 | public: | |
7c673cae | 183 | int writer_type = 0; ///< WRITER_* |
11fdf7f2 | 184 | int write_hint = WRITE_LIFE_NOT_SET; |
7c673cae | 185 | |
11fdf7f2 | 186 | ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock"); |
7c673cae | 187 | std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev |
11fdf7f2 | 188 | std::array<bool, MAX_BDEV> dirty_devs; |
7c673cae FG |
189 | |
190 | FileWriter(FileRef f) | |
9f95a23c | 191 | : file(std::move(f)), |
f67539c2 TL |
192 | buffer_appender(buffer.get_page_aligned_appender( |
193 | g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) { | |
7c673cae FG |
194 | ++file->num_writers; |
195 | iocv.fill(nullptr); | |
11fdf7f2 | 196 | dirty_devs.fill(false); |
9f95a23c | 197 | if (file->fnode.ino == 1) { |
11fdf7f2 TL |
198 | write_hint = WRITE_LIFE_MEDIUM; |
199 | } | |
7c673cae FG |
200 | } |
201 | // NOTE: caller must call BlueFS::close_writer() | |
202 | ~FileWriter() { | |
203 | --file->num_writers; | |
204 | } | |
205 | ||
206 | // note: BlueRocksEnv uses this append exclusively, so it's safe | |
207 | // to use buffer_appender exclusively here (e.g., it's notion of | |
208 | // offset will remain accurate). | |
209 | void append(const char *buf, size_t len) { | |
f67539c2 | 210 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 211 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
212 | buffer_appender.append(buf, len); |
213 | } | |
214 | ||
215 | // note: used internally only, for ino 1 or 0. | |
cd265ab1 | 216 | void append(ceph::buffer::list& bl) { |
f67539c2 | 217 | uint64_t l0 = get_buffer_length(); |
cd265ab1 | 218 | ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max()); |
7c673cae FG |
219 | buffer.claim_append(bl); |
220 | } | |
221 | ||
f67539c2 TL |
222 | void append_zero(size_t len) { |
223 | uint64_t l0 = get_buffer_length(); | |
224 | ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max()); | |
225 | buffer_appender.append_zero(len); | |
226 | } | |
227 | ||
7c673cae | 228 | uint64_t get_effective_write_pos() { |
7c673cae FG |
229 | return pos + buffer.length(); |
230 | } | |
231 | }; | |
232 | ||
233 | struct FileReaderBuffer { | |
234 | MEMPOOL_CLASS_HELPERS(); | |
235 | ||
9f95a23c | 236 | uint64_t bl_off = 0; ///< prefetch buffer logical offset |
f67539c2 | 237 | ceph::buffer::list bl; ///< prefetch buffer |
9f95a23c | 238 | uint64_t pos = 0; ///< current logical offset |
7c673cae FG |
239 | uint64_t max_prefetch; ///< max allowed prefetch |
240 | ||
241 | explicit FileReaderBuffer(uint64_t mpf) | |
9f95a23c | 242 | : max_prefetch(mpf) {} |
7c673cae | 243 | |
9f95a23c | 244 | uint64_t get_buf_end() const { |
7c673cae FG |
245 | return bl_off + bl.length(); |
246 | } | |
9f95a23c | 247 | uint64_t get_buf_remaining(uint64_t p) const { |
7c673cae FG |
248 | if (p >= bl_off && p < bl_off + bl.length()) |
249 | return bl_off + bl.length() - p; | |
250 | return 0; | |
251 | } | |
252 | ||
253 | void skip(size_t n) { | |
254 | pos += n; | |
255 | } | |
f67539c2 TL |
256 | |
257 | // For the sake of simplicity, we invalidate completed rather than | |
258 | // for the provided extent | |
259 | void invalidate_cache(uint64_t offset, uint64_t length) { | |
260 | if (offset >= bl_off && offset < get_buf_end()) { | |
261 | bl.clear(); | |
262 | bl_off = 0; | |
263 | } | |
7c673cae FG |
264 | } |
265 | }; | |
266 | ||
267 | struct FileReader { | |
268 | MEMPOOL_CLASS_HELPERS(); | |
269 | ||
270 | FileRef file; | |
271 | FileReaderBuffer buf; | |
272 | bool random; | |
273 | bool ignore_eof; ///< used when reading our log file | |
274 | ||
494da23a TL |
275 | ceph::shared_mutex lock { |
276 | ceph::make_shared_mutex(std::string(), false, false, false) | |
277 | }; | |
278 | ||
279 | ||
7c673cae FG |
280 | FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) |
281 | : file(f), | |
282 | buf(mpf), | |
283 | random(rand), | |
284 | ignore_eof(ie) { | |
285 | ++file->num_readers; | |
286 | } | |
287 | ~FileReader() { | |
288 | --file->num_readers; | |
289 | } | |
290 | }; | |
291 | ||
292 | struct FileLock { | |
293 | MEMPOOL_CLASS_HELPERS(); | |
294 | ||
295 | FileRef file; | |
9f95a23c | 296 | explicit FileLock(FileRef f) : file(std::move(f)) {} |
7c673cae FG |
297 | }; |
298 | ||
299 | private: | |
11fdf7f2 | 300 | ceph::mutex lock = ceph::make_mutex("BlueFS::lock"); |
7c673cae FG |
301 | |
302 | PerfCounters *logger = nullptr; | |
303 | ||
11fdf7f2 TL |
304 | uint64_t max_bytes[MAX_BDEV] = {0}; |
305 | uint64_t max_bytes_pcounters[MAX_BDEV] = { | |
306 | l_bluefs_max_bytes_wal, | |
307 | l_bluefs_max_bytes_db, | |
308 | l_bluefs_max_bytes_slow, | |
309 | }; | |
310 | ||
7c673cae | 311 | // cache |
f67539c2 | 312 | mempool::bluefs::map<std::string, DirRef> dir_map; ///< dirname -> Dir |
7c673cae FG |
313 | mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File |
314 | ||
315 | // map of dirty files, files of same dirty_seq are grouped into list. | |
f67539c2 | 316 | std::map<uint64_t, dirty_file_list_t> dirty_files; |
7c673cae FG |
317 | |
318 | bluefs_super_t super; ///< latest superblock (as last written) | |
319 | uint64_t ino_last = 0; ///< last assigned ino (this one is in use) | |
320 | uint64_t log_seq = 0; ///< last used log seq (by current pending log_t) | |
321 | uint64_t log_seq_stable = 0; ///< last stable/synced log seq | |
322 | FileWriter *log_writer = 0; ///< writer for the log | |
323 | bluefs_transaction_t log_t; ///< pending, unwritten log transaction | |
324 | bool log_flushing = false; ///< true while flushing the log | |
11fdf7f2 | 325 | ceph::condition_variable log_cond; |
7c673cae FG |
326 | |
327 | uint64_t new_log_jump_to = 0; | |
328 | uint64_t old_log_jump_to = 0; | |
329 | FileRef new_log = nullptr; | |
330 | FileWriter *new_log_writer = nullptr; | |
331 | ||
332 | /* | |
333 | * There are up to 3 block devices (the BDEV_NEWWAL and BDEV_NEWDB slots counted by MAX_BDEV are not listed here): | |
334 | * | |
335 | * BDEV_DB db/ - the primary db device | |
336 | * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL | |
337 | * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills | |
338 | */ | |
f67539c2 TL |
339 | std::vector<BlockDevice*> bdev; ///< block devices we can use |
340 | std::vector<IOContext*> ioc; ///< IOContexts for bdevs | |
341 | std::vector<uint64_t> block_reserved; ///< starting reserve extent per device | |
342 | std::vector<Allocator*> alloc; ///< allocators for bdevs | |
343 | std::vector<uint64_t> alloc_size; ///< alloc size for each device | |
344 | std::vector<interval_set<uint64_t>> pending_release; ///< extents to release | |
345 | //std::vector<interval_set<uint64_t>> block_unused_too_granular; | |
7c673cae | 346 | |
11fdf7f2 TL |
347 | BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev |
348 | ||
9f95a23c | 349 | std::unique_ptr<BlueFSVolumeSelector> vselector; |
11fdf7f2 | 350 | |
f67539c2 TL |
351 | bluefs_shared_alloc_context_t* shared_alloc = nullptr; |
352 | unsigned shared_alloc_id = unsigned(-1); | |
353 | inline bool is_shared_alloc(unsigned id) const { | |
354 | return id == shared_alloc_id; | |
355 | } | |
356 | ||
eafe8130 TL |
357 | class SocketHook; |
358 | SocketHook* asok_hook = nullptr; | |
cd265ab1 TL |
359 | // used to trigger zeros into read (debug / verify) |
360 | std::atomic<uint64_t> inject_read_zeros{0}; | |
eafe8130 | 361 | |
7c673cae FG |
362 | void _init_logger(); |
363 | void _shutdown_logger(); | |
364 | void _update_logger_stats(); | |
365 | ||
366 | void _init_alloc(); | |
367 | void _stop_alloc(); | |
368 | ||
f67539c2 TL |
369 | void _pad_bl(ceph::buffer::list& bl); ///< pad ceph::buffer::list to block size w/ zeros |
370 | ||
371 | uint64_t _get_used(unsigned id) const; | |
372 | uint64_t _get_total(unsigned id) const; | |
373 | ||
7c673cae FG |
374 | |
375 | FileRef _get_file(uint64_t ino); | |
376 | void _drop_link(FileRef f); | |
377 | ||
1911f103 TL |
378 | unsigned _get_slow_device_id() { |
379 | return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; | |
380 | } | |
eafe8130 | 381 | const char* get_device_name(unsigned id); |
7c673cae | 382 | int _allocate(uint8_t bdev, uint64_t len, |
94b18763 | 383 | bluefs_fnode_t* node); |
11fdf7f2 TL |
384 | int _allocate_without_fallback(uint8_t id, uint64_t len, |
385 | PExtentVector* extents); | |
386 | ||
7c673cae | 387 | int _flush_range(FileWriter *h, uint64_t offset, uint64_t length); |
f67539c2 | 388 | int _flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l); |
f6b5b4d7 | 389 | int _flush(FileWriter *h, bool force, bool *flushed = nullptr); |
11fdf7f2 | 390 | int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l); |
7c673cae | 391 | |
11fdf7f2 | 392 | #ifdef HAVE_LIBAIO |
f67539c2 | 393 | void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls); |
7c673cae | 394 | void wait_for_aio(FileWriter *h); // safe to call without a lock |
11fdf7f2 | 395 | #endif |
7c673cae | 396 | |
11fdf7f2 | 397 | int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l, |
7c673cae FG |
398 | uint64_t want_seq = 0, |
399 | uint64_t jump_to = 0); | |
400 | uint64_t _estimate_log_size(); | |
401 | bool _should_compact_log(); | |
11fdf7f2 TL |
402 | |
// bit flags; passed as `flags` to _compact_log_dump_metadata() and
// _rewrite_log_and_layout_sync()
enum {
  REMOVE_DB      = 1 << 0,
  REMOVE_WAL     = 1 << 1,
  RENAME_SLOW2DB = 1 << 2,
  RENAME_DB2SLOW = 1 << 3,
};
409 | void _compact_log_dump_metadata(bluefs_transaction_t *t, | |
410 | int flags); | |
7c673cae | 411 | void _compact_log_sync(); |
11fdf7f2 TL |
412 | void _compact_log_async(std::unique_lock<ceph::mutex>& l); |
413 | ||
9f95a23c TL |
414 | void _rewrite_log_and_layout_sync(bool allocate_with_fallback, |
415 | int super_dev, | |
416 | int log_dev, | |
417 | int new_log_dev, | |
418 | int flags, | |
419 | std::optional<bluefs_layout_t> layout); | |
7c673cae FG |
420 | |
421 | //void _aio_finish(void *priv); | |
422 | ||
423 | void _flush_bdev_safely(FileWriter *h); | |
424 | void flush_bdev(); // this is safe to call without a lock | |
11fdf7f2 | 425 | void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock |
7c673cae FG |
426 | |
427 | int _preallocate(FileRef f, uint64_t off, uint64_t len); | |
428 | int _truncate(FileWriter *h, uint64_t off); | |
429 | ||
adb31ebb | 430 | int64_t _read( |
7c673cae | 431 | FileReader *h, ///< [in] read from here |
7c673cae FG |
432 | uint64_t offset, ///< [in] offset |
433 | size_t len, ///< [in] this many bytes | |
f67539c2 | 434 | ceph::buffer::list *outbl, ///< [out] optional: reference the result here |
7c673cae | 435 | char *out); ///< [out] optional: or copy it here |
adb31ebb | 436 | int64_t _read_random( |
7c673cae FG |
437 | FileReader *h, ///< [in] read from here |
438 | uint64_t offset, ///< [in] offset | |
9f95a23c | 439 | uint64_t len, ///< [in] this many bytes |
7c673cae FG |
440 | char *out); ///< [out] optional: or copy it here |
441 | ||
442 | void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length); | |
443 | ||
444 | int _open_super(); | |
11fdf7f2 | 445 | int _write_super(int dev); |
9f95a23c TL |
446 | int _check_new_allocations(const bluefs_fnode_t& fnode, |
447 | size_t dev_count, | |
9f95a23c TL |
448 | boost::dynamic_bitset<uint64_t>* used_blocks); |
449 | int _verify_alloc_granularity( | |
450 | __u8 id, uint64_t offset, uint64_t length, | |
451 | const char *op); | |
11fdf7f2 | 452 | int _replay(bool noop, bool to_stdout = false); ///< replay journal |
7c673cae FG |
453 | |
454 | FileWriter *_create_writer(FileRef f); | |
455 | void _close_writer(FileWriter *h); | |
456 | ||
457 | // always put the super in the second 4k block. FIXME should this be | |
458 | // block size independent? | |
/// byte offset of the superblock (the second 4k block)
unsigned get_super_offset() {
  constexpr unsigned super_offset = 4096;
  return super_offset;
}
/// byte length of the superblock
unsigned get_super_length() {
  constexpr unsigned super_length = 4096;
  return super_length;
}
465 | ||
466 | public: | |
467 | BlueFS(CephContext* cct); | |
468 | ~BlueFS(); | |
469 | ||
470 | // the super is always stored on bdev 0 | |
9f95a23c | 471 | int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout); |
7c673cae | 472 | int mount(); |
9f95a23c | 473 | int maybe_verify_layout(const bluefs_layout_t& layout) const; |
1911f103 | 474 | void umount(bool avoid_compact = false); |
9f95a23c | 475 | int prepare_new_device(int id, const bluefs_layout_t& layout); |
11fdf7f2 TL |
476 | |
477 | int log_dump(); | |
7c673cae | 478 | |
f67539c2 TL |
479 | void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id); |
480 | void get_devices(std::set<std::string> *ls); | |
eafe8130 TL |
481 | uint64_t get_alloc_size(int id) { |
482 | return alloc_size[id]; | |
483 | } | |
7c673cae FG |
484 | int fsck(); |
485 | ||
11fdf7f2 TL |
486 | int device_migrate_to_new( |
487 | CephContext *cct, | |
f67539c2 | 488 | const std::set<int>& devs_source, |
9f95a23c TL |
489 | int dev_target, |
490 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
491 | int device_migrate_to_existing( |
492 | CephContext *cct, | |
f67539c2 | 493 | const std::set<int>& devs_source, |
9f95a23c TL |
494 | int dev_target, |
495 | const bluefs_layout_t& layout); | |
11fdf7f2 TL |
496 | |
497 | uint64_t get_used(); | |
7c673cae FG |
498 | uint64_t get_total(unsigned id); |
499 | uint64_t get_free(unsigned id); | |
f67539c2 TL |
500 | uint64_t get_used(unsigned id); |
501 | void dump_perf_counters(ceph::Formatter *f); | |
7c673cae | 502 | |
f67539c2 | 503 | void dump_block_extents(std::ostream& out); |
3efd9988 | 504 | |
7c673cae FG |
505 | /// get current extents that we own for given block device |
506 | int get_block_extents(unsigned id, interval_set<uint64_t> *extents); | |
507 | ||
508 | int open_for_write( | |
f67539c2 TL |
509 | const std::string& dir, |
510 | const std::string& file, | |
7c673cae FG |
511 | FileWriter **h, |
512 | bool overwrite); | |
513 | ||
514 | int open_for_read( | |
f67539c2 TL |
515 | const std::string& dir, |
516 | const std::string& file, | |
7c673cae FG |
517 | FileReader **h, |
518 | bool random = false); | |
519 | ||
520 | void close_writer(FileWriter *h) { | |
11fdf7f2 | 521 | std::lock_guard l(lock); |
7c673cae FG |
522 | _close_writer(h); |
523 | } | |
524 | ||
f67539c2 TL |
525 | int rename(const std::string& old_dir, const std::string& old_file, |
526 | const std::string& new_dir, const std::string& new_file); | |
7c673cae | 527 | |
f67539c2 | 528 | int readdir(const std::string& dirname, std::vector<std::string> *ls); |
7c673cae | 529 | |
f67539c2 TL |
530 | int unlink(const std::string& dirname, const std::string& filename); |
531 | int mkdir(const std::string& dirname); | |
532 | int rmdir(const std::string& dirname); | |
d2e6a577 | 533 | bool wal_is_rotational(); |
7c673cae | 534 | |
f67539c2 TL |
535 | bool dir_exists(const std::string& dirname); |
536 | int stat(const std::string& dirname, const std::string& filename, | |
7c673cae FG |
537 | uint64_t *size, utime_t *mtime); |
538 | ||
f67539c2 | 539 | int lock_file(const std::string& dirname, const std::string& filename, FileLock **p); |
7c673cae FG |
540 | int unlock_file(FileLock *l); |
541 | ||
7c673cae FG |
542 | void compact_log(); |
543 | ||
544 | /// sync any uncommitted state to disk | |
1911f103 | 545 | void sync_metadata(bool avoid_compact); |
f6b5b4d7 TL |
546 | /// test and compact log, if necessary |
547 | void _maybe_compact_log(std::unique_lock<ceph::mutex>& l); | |
7c673cae | 548 | |
9f95a23c TL |
549 | void set_volume_selector(BlueFSVolumeSelector* s) { |
550 | vselector.reset(s); | |
551 | } | |
f67539c2 | 552 | void dump_volume_selector(std::ostream& sout) { |
9f95a23c TL |
553 | vselector->dump(sout); |
554 | } | |
555 | void get_vselector_paths(const std::string& base, | |
556 | BlueFSVolumeSelector::paths& res) const { | |
557 | return vselector->get_paths(base, res); | |
558 | } | |
559 | ||
f67539c2 TL |
560 | int add_block_device(unsigned bdev, const std::string& path, bool trim, |
561 | uint64_t reserved, | |
562 | bluefs_shared_alloc_context_t* _shared_alloc = nullptr); | |
7c673cae | 563 | bool bdev_support_label(unsigned id); |
f67539c2 | 564 | uint64_t get_block_device_size(unsigned bdev) const; |
7c673cae | 565 | |
11fdf7f2 TL |
566 | // handler for discard event |
567 | void handle_discard(unsigned dev, interval_set<uint64_t>& to_release); | |
568 | ||
f6b5b4d7 TL |
569 | void flush(FileWriter *h, bool force = false) { |
570 | std::unique_lock l(lock); | |
571 | int r = _flush(h, force, l); | |
572 | ceph_assert(r == 0); | |
7c673cae | 573 | } |
cd265ab1 TL |
574 | |
575 | void append_try_flush(FileWriter *h, const char* buf, size_t len) { | |
576 | size_t max_size = 1ull << 30; // cap to 1GB | |
577 | while (len > 0) { | |
578 | bool need_flush = true; | |
f67539c2 | 579 | auto l0 = h->get_buffer_length(); |
cd265ab1 TL |
580 | if (l0 < max_size) { |
581 | size_t l = std::min(len, max_size - l0); | |
582 | h->append(buf, l); | |
583 | buf += l; | |
584 | len -= l; | |
f67539c2 | 585 | need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size; |
cd265ab1 TL |
586 | } |
587 | if (need_flush) { | |
588 | flush(h, true); | |
589 | // make sure we've made any progress with flush hence the | |
590 | // loop doesn't iterate forever | |
f67539c2 | 591 | ceph_assert(h->get_buffer_length() < max_size); |
cd265ab1 TL |
592 | } |
593 | } | |
594 | } | |
7c673cae | 595 | void flush_range(FileWriter *h, uint64_t offset, uint64_t length) { |
11fdf7f2 | 596 | std::lock_guard l(lock); |
7c673cae FG |
597 | _flush_range(h, offset, length); |
598 | } | |
599 | int fsync(FileWriter *h) { | |
11fdf7f2 | 600 | std::unique_lock l(lock); |
f6b5b4d7 TL |
601 | int r = _fsync(h, l); |
602 | _maybe_compact_log(l); | |
603 | return r; | |
7c673cae | 604 | } |
f67539c2 TL |
605 | int64_t read(FileReader *h, uint64_t offset, size_t len, |
606 | ceph::buffer::list *outbl, char *out) { | |
7c673cae FG |
607 | // no need to hold the global lock here; we only touch h and |
608 | // h->file, and read vs write or delete is already protected (via | |
609 | // atomics and asserts). | |
f67539c2 | 610 | return _read(h, offset, len, outbl, out); |
7c673cae | 611 | } |
adb31ebb | 612 | int64_t read_random(FileReader *h, uint64_t offset, size_t len, |
7c673cae FG |
613 | char *out) { |
614 | // no need to hold the global lock here; we only touch h and | |
615 | // h->file, and read vs write or delete is already protected (via | |
616 | // atomics and asserts). | |
617 | return _read_random(h, offset, len, out); | |
618 | } | |
619 | void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) { | |
11fdf7f2 | 620 | std::lock_guard l(lock); |
7c673cae FG |
621 | _invalidate_cache(f, offset, len); |
622 | } | |
623 | int preallocate(FileRef f, uint64_t offset, uint64_t len) { | |
11fdf7f2 | 624 | std::lock_guard l(lock); |
7c673cae FG |
625 | return _preallocate(f, offset, len); |
626 | } | |
627 | int truncate(FileWriter *h, uint64_t offset) { | |
11fdf7f2 | 628 | std::lock_guard l(lock); |
7c673cae FG |
629 | return _truncate(h, offset); |
630 | } | |
f6b5b4d7 TL |
631 | int do_replay_recovery_read(FileReader *log, |
632 | size_t log_pos, | |
633 | size_t read_offset, | |
634 | size_t read_len, | |
635 | bufferlist* bl); | |
7c673cae | 636 | |
f67539c2 TL |
637 | size_t probe_alloc_avail(int dev, uint64_t alloc_size); |
638 | ||
9f95a23c | 639 | /// test purpose methods |
9f95a23c TL |
640 | const PerfCounters* get_perf_counters() const { |
641 | return logger; | |
642 | } | |
cd265ab1 TL |
643 | |
644 | private: | |
645 | // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...) | |
646 | // They are used for checking if read values are all 0, and reread if so. | |
647 | int read(uint8_t ndev, uint64_t off, uint64_t len, | |
648 | ceph::buffer::list *pbl, IOContext *ioc, bool buffered); | |
649 | int read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered); | |
9f95a23c TL |
650 | }; |
651 | ||
652 | class OriginalVolumeSelector : public BlueFSVolumeSelector { | |
653 | uint64_t wal_total; | |
654 | uint64_t db_total; | |
655 | uint64_t slow_total; | |
656 | ||
657 | public: | |
658 | OriginalVolumeSelector( | |
659 | uint64_t _wal_total, | |
660 | uint64_t _db_total, | |
661 | uint64_t _slow_total) | |
662 | : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {} | |
663 | ||
f6b5b4d7 TL |
664 | void* get_hint_for_log() const override; |
665 | void* get_hint_by_dir(const std::string& dirname) const override; | |
9f95a23c TL |
666 | |
667 | void add_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
668 | // do nothing | |
669 | return; | |
670 | } | |
671 | void sub_usage(void* hint, const bluefs_fnode_t& fnode) override { | |
672 | // do nothing | |
673 | return; | |
674 | } | |
675 | void add_usage(void* hint, uint64_t fsize) override { | |
676 | // do nothing | |
677 | return; | |
678 | } | |
679 | void sub_usage(void* hint, uint64_t fsize) override { | |
680 | // do nothing | |
681 | return; | |
682 | } | |
683 | ||
684 | uint8_t select_prefer_bdev(void* hint) override; | |
685 | void get_paths(const std::string& base, paths& res) const override; | |
f67539c2 TL |
686 | void dump(std::ostream& sout) override; |
687 | }; | |
688 | ||
689 | class FitToFastVolumeSelector : public OriginalVolumeSelector { | |
690 | public: | |
691 | FitToFastVolumeSelector( | |
692 | uint64_t _wal_total, | |
693 | uint64_t _db_total, | |
694 | uint64_t _slow_total) | |
695 | : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {} | |
696 | ||
697 | void get_paths(const std::string& base, paths& res) const override; | |
7c673cae FG |
698 | }; |
699 | ||
700 | #endif |