]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / os / bluestore / BlueFS.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
4#define CEPH_OS_BLUESTORE_BLUEFS_H
5
6#include <atomic>
7#include <mutex>
8
9#include "bluefs_types.h"
10#include "common/RefCountedObj.h"
11#include "BlockDevice.h"
12
13#include "boost/intrusive/list.hpp"
14#include <boost/intrusive_ptr.hpp>
15
16class PerfCounters;
17
18class Allocator;
19
/// Perf-counter indices for BlueFS.
///
/// Only l_bluefs_first carries an explicit value; every following
/// enumerator is the previous one plus one.  Entries therefore must not
/// be reordered or removed without accounting for the shift in values.
enum {
  l_bluefs_first = 732600,
  l_bluefs_gift_bytes,
  l_bluefs_reclaim_bytes,
  l_bluefs_db_total_bytes,
  l_bluefs_db_used_bytes,
  l_bluefs_wal_total_bytes,
  l_bluefs_wal_used_bytes,
  l_bluefs_slow_total_bytes,
  l_bluefs_slow_used_bytes,
  l_bluefs_num_files,
  l_bluefs_log_bytes,
  l_bluefs_log_compactions,
  l_bluefs_logged_bytes,
  l_bluefs_files_written_wal,
  l_bluefs_files_written_sst,
  l_bluefs_bytes_written_wal,
  l_bluefs_bytes_written_sst,
  l_bluefs_bytes_written_slow,
  l_bluefs_max_bytes_wal,
  l_bluefs_max_bytes_db,
  l_bluefs_max_bytes_slow,
  l_bluefs_last,
};
44
11fdf7f2
TL
45class BlueFSDeviceExpander {
46protected:
47 ~BlueFSDeviceExpander() {}
48public:
49 virtual uint64_t get_recommended_expansion_delta(uint64_t bluefs_free,
50 uint64_t bluefs_total) = 0;
51 virtual int allocate_freespace(
52 uint64_t min_size,
53 uint64_t size,
54 PExtentVector& extents) = 0;
55};
56
7c673cae
FG
57class BlueFS {
58public:
59 CephContext* cct;
11fdf7f2 60 static constexpr unsigned MAX_BDEV = 5;
7c673cae
FG
61 static constexpr unsigned BDEV_WAL = 0;
62 static constexpr unsigned BDEV_DB = 1;
63 static constexpr unsigned BDEV_SLOW = 2;
11fdf7f2
TL
64 static constexpr unsigned BDEV_NEWWAL = 3;
65 static constexpr unsigned BDEV_NEWDB = 4;
7c673cae
FG
66
/// Writer classification stored in FileWriter::writer_type.
/// Values are implicit (0, 1, 2); FileWriter defaults writer_type to 0,
/// i.e. WRITER_UNKNOWN.
enum {
  WRITER_UNKNOWN,
  WRITER_WAL,
  WRITER_SST,
};
72
73 struct File : public RefCountedObject {
74 MEMPOOL_CLASS_HELPERS();
75
76 bluefs_fnode_t fnode;
77 int refs;
78 uint64_t dirty_seq;
79 bool locked;
80 bool deleted;
81 boost::intrusive::list_member_hook<> dirty_item;
82
83 std::atomic_int num_readers, num_writers;
84 std::atomic_int num_reading;
85
86 File()
87 : RefCountedObject(NULL, 0),
88 refs(0),
89 dirty_seq(0),
90 locked(false),
91 deleted(false),
92 num_readers(0),
93 num_writers(0),
94 num_reading(0)
95 {}
96 ~File() override {
11fdf7f2
TL
97 ceph_assert(num_readers.load() == 0);
98 ceph_assert(num_writers.load() == 0);
99 ceph_assert(num_reading.load() == 0);
100 ceph_assert(!locked);
7c673cae
FG
101 }
102
103 friend void intrusive_ptr_add_ref(File *f) {
104 f->get();
105 }
106 friend void intrusive_ptr_release(File *f) {
107 f->put();
108 }
109 };
110 typedef boost::intrusive_ptr<File> FileRef;
111
112 typedef boost::intrusive::list<
113 File,
114 boost::intrusive::member_hook<
115 File,
116 boost::intrusive::list_member_hook<>,
117 &File::dirty_item> > dirty_file_list_t;
118
119 struct Dir : public RefCountedObject {
120 MEMPOOL_CLASS_HELPERS();
121
122 mempool::bluefs::map<string,FileRef> file_map;
123
124 Dir() : RefCountedObject(NULL, 0) {}
125
126 friend void intrusive_ptr_add_ref(Dir *d) {
127 d->get();
128 }
129 friend void intrusive_ptr_release(Dir *d) {
130 d->put();
131 }
132 };
133 typedef boost::intrusive_ptr<Dir> DirRef;
134
135 struct FileWriter {
136 MEMPOOL_CLASS_HELPERS();
137
138 FileRef file;
139 uint64_t pos; ///< start offset for buffer
140 bufferlist buffer; ///< new data to write (at end of file)
141 bufferlist tail_block; ///< existing partial block at end of file, if any
142 bufferlist::page_aligned_appender buffer_appender; //< for const char* only
143 int writer_type = 0; ///< WRITER_*
11fdf7f2 144 int write_hint = WRITE_LIFE_NOT_SET;
7c673cae 145
11fdf7f2 146 ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
7c673cae 147 std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
11fdf7f2 148 std::array<bool, MAX_BDEV> dirty_devs;
7c673cae
FG
149
150 FileWriter(FileRef f)
151 : file(f),
152 pos(0),
153 buffer_appender(buffer.get_page_aligned_appender(
11fdf7f2 154 g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
7c673cae
FG
155 ++file->num_writers;
156 iocv.fill(nullptr);
11fdf7f2
TL
157 dirty_devs.fill(false);
158 if (f->fnode.ino == 1) {
159 write_hint = WRITE_LIFE_MEDIUM;
160 }
7c673cae
FG
161 }
162 // NOTE: caller must call BlueFS::close_writer()
163 ~FileWriter() {
164 --file->num_writers;
165 }
166
167 // note: BlueRocksEnv uses this append exclusively, so it's safe
168 // to use buffer_appender exclusively here (e.g., it's notion of
169 // offset will remain accurate).
170 void append(const char *buf, size_t len) {
171 buffer_appender.append(buf, len);
172 }
173
174 // note: used internally only, for ino 1 or 0.
175 void append(bufferlist& bl) {
176 buffer.claim_append(bl);
177 }
178
179 uint64_t get_effective_write_pos() {
180 buffer_appender.flush();
181 return pos + buffer.length();
182 }
183 };
184
185 struct FileReaderBuffer {
186 MEMPOOL_CLASS_HELPERS();
187
188 uint64_t bl_off; ///< prefetch buffer logical offset
189 bufferlist bl; ///< prefetch buffer
190 uint64_t pos; ///< current logical offset
191 uint64_t max_prefetch; ///< max allowed prefetch
192
193 explicit FileReaderBuffer(uint64_t mpf)
194 : bl_off(0),
195 pos(0),
196 max_prefetch(mpf) {}
197
198 uint64_t get_buf_end() {
199 return bl_off + bl.length();
200 }
201 uint64_t get_buf_remaining(uint64_t p) {
202 if (p >= bl_off && p < bl_off + bl.length())
203 return bl_off + bl.length() - p;
204 return 0;
205 }
206
207 void skip(size_t n) {
208 pos += n;
209 }
210 void seek(uint64_t offset) {
211 pos = offset;
212 }
213 };
214
215 struct FileReader {
216 MEMPOOL_CLASS_HELPERS();
217
218 FileRef file;
219 FileReaderBuffer buf;
220 bool random;
221 bool ignore_eof; ///< used when reading our log file
222
223 FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
224 : file(f),
225 buf(mpf),
226 random(rand),
227 ignore_eof(ie) {
228 ++file->num_readers;
229 }
230 ~FileReader() {
231 --file->num_readers;
232 }
233 };
234
235 struct FileLock {
236 MEMPOOL_CLASS_HELPERS();
237
238 FileRef file;
239 explicit FileLock(FileRef f) : file(f) {}
240 };
241
242private:
11fdf7f2 243 ceph::mutex lock = ceph::make_mutex("BlueFS::lock");
7c673cae
FG
244
245 PerfCounters *logger = nullptr;
246
11fdf7f2
TL
247 uint64_t max_bytes[MAX_BDEV] = {0};
248 uint64_t max_bytes_pcounters[MAX_BDEV] = {
249 l_bluefs_max_bytes_wal,
250 l_bluefs_max_bytes_db,
251 l_bluefs_max_bytes_slow,
252 };
253
7c673cae
FG
254 // cache
255 mempool::bluefs::map<string, DirRef> dir_map; ///< dirname -> Dir
256 mempool::bluefs::unordered_map<uint64_t,FileRef> file_map; ///< ino -> File
257
258 // map of dirty files, files of same dirty_seq are grouped into list.
259 map<uint64_t, dirty_file_list_t> dirty_files;
260
261 bluefs_super_t super; ///< latest superblock (as last written)
262 uint64_t ino_last = 0; ///< last assigned ino (this one is in use)
263 uint64_t log_seq = 0; ///< last used log seq (by current pending log_t)
264 uint64_t log_seq_stable = 0; ///< last stable/synced log seq
265 FileWriter *log_writer = 0; ///< writer for the log
266 bluefs_transaction_t log_t; ///< pending, unwritten log transaction
267 bool log_flushing = false; ///< true while flushing the log
11fdf7f2 268 ceph::condition_variable log_cond;
7c673cae
FG
269
270 uint64_t new_log_jump_to = 0;
271 uint64_t old_log_jump_to = 0;
272 FileRef new_log = nullptr;
273 FileWriter *new_log_writer = nullptr;
274
275 /*
276 * There are up to 3 regular block devices (BDEV_NEWWAL and BDEV_NEWDB are extra slots counted in MAX_BDEV):
277 *
278 * BDEV_DB db/ - the primary db device
279 * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
280 * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
281 */
282 vector<BlockDevice*> bdev; ///< block devices we can use
283 vector<IOContext*> ioc; ///< IOContexts for bdevs
284 vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own
7c673cae
FG
285 vector<Allocator*> alloc; ///< allocators for bdevs
286 vector<interval_set<uint64_t>> pending_release; ///< extents to release
287
11fdf7f2
TL
288 BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
289
290 BlueFSDeviceExpander* slow_dev_expander = nullptr;
291
7c673cae
FG
292 void _init_logger();
293 void _shutdown_logger();
294 void _update_logger_stats();
295
296 void _init_alloc();
297 void _stop_alloc();
298
299 void _pad_bl(bufferlist& bl); ///< pad bufferlist to block size w/ zeros
300
301 FileRef _get_file(uint64_t ino);
302 void _drop_link(FileRef f);
303
11fdf7f2
TL
304 int _get_slow_device_id() { return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB; }
305 int _expand_slow_device(uint64_t min_size, PExtentVector& extents);
7c673cae 306 int _allocate(uint8_t bdev, uint64_t len,
94b18763 307 bluefs_fnode_t* node);
11fdf7f2
TL
308 int _allocate_without_fallback(uint8_t id, uint64_t len,
309 PExtentVector* extents);
310
7c673cae
FG
311 int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
312 int _flush(FileWriter *h, bool force);
11fdf7f2 313 int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l);
7c673cae 314
11fdf7f2 315#ifdef HAVE_LIBAIO
7c673cae
FG
316 void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
317 void wait_for_aio(FileWriter *h); // safe to call without a lock
11fdf7f2 318#endif
7c673cae 319
11fdf7f2 320 int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
7c673cae
FG
321 uint64_t want_seq = 0,
322 uint64_t jump_to = 0);
323 uint64_t _estimate_log_size();
324 bool _should_compact_log();
11fdf7f2
TL
325
/// Bit flags accepted by _compact_log_dump_metadata and
/// _rewrite_log_sync (their `flags` parameter).  Each value is a
/// distinct power of two so they can be OR-ed together.
enum {
  REMOVE_DB = 1,
  REMOVE_WAL = 2,
  RENAME_SLOW2DB = 4,
  RENAME_DB2SLOW = 8,
};
332 void _compact_log_dump_metadata(bluefs_transaction_t *t,
333 int flags);
7c673cae 334 void _compact_log_sync();
11fdf7f2
TL
335 void _compact_log_async(std::unique_lock<ceph::mutex>& l);
336
337 void _rewrite_log_sync(bool allocate_with_fallback,
338 int super_dev,
339 int log_dev,
340 int new_log_dev,
341 int flags);
7c673cae
FG
342
343 //void _aio_finish(void *priv);
344
345 void _flush_bdev_safely(FileWriter *h);
346 void flush_bdev(); // this is safe to call without a lock
11fdf7f2 347 void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock
7c673cae
FG
348
349 int _preallocate(FileRef f, uint64_t off, uint64_t len);
350 int _truncate(FileWriter *h, uint64_t off);
351
352 int _read(
353 FileReader *h, ///< [in] read from here
354 FileReaderBuffer *buf, ///< [in] reader state
355 uint64_t offset, ///< [in] offset
356 size_t len, ///< [in] this many bytes
357 bufferlist *outbl, ///< [out] optional: reference the result here
358 char *out); ///< [out] optional: or copy it here
359 int _read_random(
360 FileReader *h, ///< [in] read from here
361 uint64_t offset, ///< [in] offset
362 size_t len, ///< [in] this many bytes
363 char *out); ///< [out] optional: or copy it here
364
365 void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);
366
367 int _open_super();
11fdf7f2
TL
368 int _write_super(int dev);
369 int _replay(bool noop, bool to_stdout = false); ///< replay journal
7c673cae
FG
370
371 FileWriter *_create_writer(FileRef f);
372 void _close_writer(FileWriter *h);
373
374 // always put the super in the second 4k block. FIXME should this be
375 // block size independent?
/// Byte offset of the superblock: fixed at 4096, i.e. the second 4k block.
unsigned get_super_offset() {
  return 4096;
}
/// Size in bytes reserved for the superblock: fixed at 4096.
unsigned get_super_length() {
  return 4096;
}
382
11fdf7f2
TL
383 void _add_block_extent(unsigned bdev, uint64_t offset, uint64_t len);
384
7c673cae
FG
385public:
386 BlueFS(CephContext* cct);
387 ~BlueFS();
388
389 // the super is always stored on bdev 0
390 int mkfs(uuid_d osd_uuid);
391 int mount();
392 void umount();
11fdf7f2
TL
393 int prepare_new_device(int id);
394
395 int log_dump();
7c673cae 396
11fdf7f2
TL
397 void collect_metadata(map<string,string> *pm, unsigned skip_bdev_id);
398 void get_devices(set<string> *ls);
7c673cae
FG
399 int fsck();
400
11fdf7f2
TL
401 int device_migrate_to_new(
402 CephContext *cct,
403 const set<int>& devs_source,
404 int dev_target);
405 int device_migrate_to_existing(
406 CephContext *cct,
407 const set<int>& devs_source,
408 int dev_target);
409
410 uint64_t get_used();
7c673cae
FG
411 uint64_t get_total(unsigned id);
412 uint64_t get_free(unsigned id);
413 void get_usage(vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...]
414 void dump_perf_counters(Formatter *f);
415
3efd9988
FG
416 void dump_block_extents(ostream& out);
417
7c673cae
FG
418 /// get current extents that we own for given block device
419 int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
420
421 int open_for_write(
422 const string& dir,
423 const string& file,
424 FileWriter **h,
425 bool overwrite);
426
427 int open_for_read(
428 const string& dir,
429 const string& file,
430 FileReader **h,
431 bool random = false);
432
433 void close_writer(FileWriter *h) {
11fdf7f2 434 std::lock_guard l(lock);
7c673cae
FG
435 _close_writer(h);
436 }
437
438 int rename(const string& old_dir, const string& old_file,
439 const string& new_dir, const string& new_file);
440
441 int readdir(const string& dirname, vector<string> *ls);
442
443 int unlink(const string& dirname, const string& filename);
444 int mkdir(const string& dirname);
445 int rmdir(const string& dirname);
d2e6a577 446 bool wal_is_rotational();
7c673cae
FG
447
448 bool dir_exists(const string& dirname);
449 int stat(const string& dirname, const string& filename,
450 uint64_t *size, utime_t *mtime);
451
452 int lock_file(const string& dirname, const string& filename, FileLock **p);
453 int unlock_file(FileLock *l);
454
455 void flush_log();
456 void compact_log();
457
458 /// sync any uncommitted state to disk
459 void sync_metadata();
460
11fdf7f2
TL
461 void set_slow_device_expander(BlueFSDeviceExpander* a) {
462 slow_dev_expander = a;
463 }
464 int add_block_device(unsigned bdev, const string& path, bool trim,
465 bool shared_with_bluestore=false);
7c673cae
FG
466 bool bdev_support_label(unsigned id);
467 uint64_t get_block_device_size(unsigned bdev);
468
469 /// gift more block space
11fdf7f2
TL
470 void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len) {
471 std::unique_lock l(lock);
472 _add_block_extent(bdev, offset, len);
473 int r = _flush_and_sync_log(l);
474 ceph_assert(r == 0);
475 }
7c673cae
FG
476
477 /// reclaim block space
478 int reclaim_blocks(unsigned bdev, uint64_t want,
a8e16298 479 PExtentVector *extents);
7c673cae 480
11fdf7f2
TL
481 // handler for discard event
482 void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
483
7c673cae 484 void flush(FileWriter *h) {
11fdf7f2 485 std::lock_guard l(lock);
7c673cae
FG
486 _flush(h, false);
487 }
488 void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
11fdf7f2 489 std::lock_guard l(lock);
7c673cae
FG
490 _flush_range(h, offset, length);
491 }
492 int fsync(FileWriter *h) {
11fdf7f2 493 std::unique_lock l(lock);
7c673cae
FG
494 return _fsync(h, l);
495 }
496 int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len,
497 bufferlist *outbl, char *out) {
498 // no need to hold the global lock here; we only touch h and
499 // h->file, and read vs write or delete is already protected (via
500 // atomics and asserts).
501 return _read(h, buf, offset, len, outbl, out);
502 }
503 int read_random(FileReader *h, uint64_t offset, size_t len,
504 char *out) {
505 // no need to hold the global lock here; we only touch h and
506 // h->file, and read vs write or delete is already protected (via
507 // atomics and asserts).
508 return _read_random(h, offset, len, out);
509 }
510 void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
11fdf7f2 511 std::lock_guard l(lock);
7c673cae
FG
512 _invalidate_cache(f, offset, len);
513 }
514 int preallocate(FileRef f, uint64_t offset, uint64_t len) {
11fdf7f2 515 std::lock_guard l(lock);
7c673cae
FG
516 return _preallocate(f, offset, len);
517 }
518 int truncate(FileWriter *h, uint64_t offset) {
11fdf7f2 519 std::lock_guard l(lock);
7c673cae
FG
520 return _truncate(h, offset);
521 }
522
523};
524
525#endif