]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueRocksEnv.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / os / bluestore / BlueRocksEnv.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "BlueRocksEnv.h"
5#include "BlueFS.h"
6#include "include/stringify.h"
7#include "kv/RocksDBStore.h"
11fdf7f2 8#include "string.h"
7c673cae 9
20effc67
TL
10using std::string_view;
11
b3b6e05e
TL
12namespace {
13
7c673cae
FG
14rocksdb::Status err_to_status(int r)
15{
16 switch (r) {
17 case 0:
18 return rocksdb::Status::OK();
19 case -ENOENT:
20 return rocksdb::Status::NotFound(rocksdb::Status::kNone);
21 case -EINVAL:
22 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
23 case -EIO:
11fdf7f2 24 case -EEXIST:
7c673cae 25 return rocksdb::Status::IOError(rocksdb::Status::kNone);
11fdf7f2
TL
26 case -ENOLCK:
27 return rocksdb::Status::IOError(strerror(r));
7c673cae
FG
28 default:
29 // FIXME :(
11fdf7f2 30 ceph_abort_msg("unrecognized error code");
7c673cae
FG
31 return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
32 }
33}
34
b3b6e05e
TL
// Split a path into its directory part and its final component.
//
// The split happens at the last '/'.  Any run of '/' immediately preceding
// that separator is dropped from the directory part, so "a//b" yields
// {"a", "b"}.  A path without any '/' yields an empty directory and the
// whole input as the file name (previously this case relied on assert(),
// which is compiled out under NDEBUG and then read out of bounds).
//
// Both returned views point into `fn`; they are only valid while `fn` is
// alive and unmodified.
std::pair<std::string_view, std::string_view>
split(const std::string &fn)
{
  const std::size_t last_slash = fn.rfind('/');
  if (last_slash == std::string::npos) {
    return {std::string_view(fn.data(), 0),
            std::string_view(fn.data(), fn.size())};
  }

  const std::size_t file_begin = last_slash + 1;

  // Trim redundant separators from the end of the directory part.
  std::size_t dir_len = last_slash;
  while (dir_len > 0 && fn[dir_len - 1] == '/') {
    --dir_len;
  }

  return {std::string_view(fn.data(), dir_len),
          std::string_view(fn.data() + file_begin, fn.size() - file_begin)};
}
47
48}
49
7c673cae
FG
50// A file abstraction for reading sequentially through a file
51class BlueRocksSequentialFile : public rocksdb::SequentialFile {
52 BlueFS *fs;
53 BlueFS::FileReader *h;
54 public:
55 BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
56 ~BlueRocksSequentialFile() override {
57 delete h;
58 }
59
60 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
61 // written by this routine. Sets "*result" to the data that was
62 // read (including if fewer than "n" bytes were successfully read).
63 // May set "*result" to point at data in "scratch[0..n-1]", so
64 // "scratch[0..n-1]" must be live when "*result" is used.
65 // If an error was encountered, returns a non-OK status.
66 //
67 // REQUIRES: External synchronization
68 rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
f67539c2 69 int64_t r = fs->read(h, h->buf.pos, n, NULL, scratch);
11fdf7f2 70 ceph_assert(r >= 0);
7c673cae
FG
71 *result = rocksdb::Slice(scratch, r);
72 return rocksdb::Status::OK();
73 }
74
75 // Skip "n" bytes from the file. This is guaranteed to be no
76 // slower that reading the same data, but may be faster.
77 //
78 // If end of file is reached, skipping will stop at the end of the
79 // file, and Skip will return OK.
80 //
81 // REQUIRES: External synchronization
82 rocksdb::Status Skip(uint64_t n) override {
83 h->buf.skip(n);
84 return rocksdb::Status::OK();
85 }
86
87 // Remove any kind of caching of data from the offset to offset+length
88 // of this file. If the length is 0, then it refers to the end of file.
89 // If the system is not caching the file contents, then this is a noop.
90 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
f67539c2 91 h->buf.invalidate_cache(offset, length);
7c673cae
FG
92 fs->invalidate_cache(h->file, offset, length);
93 return rocksdb::Status::OK();
94 }
95};
96
97// A file abstraction for randomly reading the contents of a file.
98class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
99 BlueFS *fs;
100 BlueFS::FileReader *h;
101 public:
102 BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
103 ~BlueRocksRandomAccessFile() override {
104 delete h;
105 }
106
107 // Read up to "n" bytes from the file starting at "offset".
108 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
109 // to the data that was read (including if fewer than "n" bytes were
110 // successfully read). May set "*result" to point at data in
111 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
112 // "*result" is used. If an error was encountered, returns a non-OK
113 // status.
114 //
115 // Safe for concurrent use by multiple threads.
116 rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
117 char* scratch) const override {
adb31ebb 118 int64_t r = fs->read_random(h, offset, n, scratch);
11fdf7f2 119 ceph_assert(r >= 0);
7c673cae
FG
120 *result = rocksdb::Slice(scratch, r);
121 return rocksdb::Status::OK();
122 }
123
7c673cae
FG
124 // Tries to get an unique ID for this file that will be the same each time
125 // the file is opened (and will stay the same while the file is open).
126 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
127 // ID can be created this function returns the length of the ID and places it
128 // in "id"; otherwise, this function returns 0, in which case "id"
129 // may not have been modified.
130 //
131 // This function guarantees, for IDs from a given environment, two unique ids
132 // cannot be made equal to eachother by adding arbitrary bytes to one of
133 // them. That is, no unique ID is the prefix of another.
134 //
135 // This function guarantees that the returned ID will not be interpretable as
136 // a single varint.
137 //
138 // Note: these IDs are only valid for the duration of the process.
139 size_t GetUniqueId(char* id, size_t max_size) const override {
140 return snprintf(id, max_size, "%016llx",
141 (unsigned long long)h->file->fnode.ino);
142 };
143
494da23a
TL
144 // Readahead the file starting from offset by n bytes for caching.
145 rocksdb::Status Prefetch(uint64_t offset, size_t n) override {
f67539c2 146 fs->read(h, offset, n, nullptr, nullptr);
494da23a
TL
147 return rocksdb::Status::OK();
148 }
149
7c673cae
FG
150 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
151
152 void Hint(AccessPattern pattern) override {
153 if (pattern == RANDOM)
154 h->buf.max_prefetch = 4096;
155 else if (pattern == SEQUENTIAL)
156 h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
157 }
158
f67539c2
TL
159 bool use_direct_io() const override {
160 return !fs->cct->_conf->bluefs_buffered_io;
161 }
162
7c673cae
FG
163 // Remove any kind of caching of data from the offset to offset+length
164 // of this file. If the length is 0, then it refers to the end of file.
165 // If the system is not caching the file contents, then this is a noop.
166 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
f67539c2 167 h->buf.invalidate_cache(offset, length);
7c673cae
FG
168 fs->invalidate_cache(h->file, offset, length);
169 return rocksdb::Status::OK();
170 }
171};
172
173
174// A file abstraction for sequential writing. The implementation
175// must provide buffering since callers may append small fragments
176// at a time to the file.
177class BlueRocksWritableFile : public rocksdb::WritableFile {
178 BlueFS *fs;
179 BlueFS::FileWriter *h;
180 public:
181 BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
182 ~BlueRocksWritableFile() override {
183 fs->close_writer(h);
184 }
185
186 // Indicates if the class makes use of unbuffered I/O
187 /*bool UseOSBuffer() const {
188 return true;
189 }*/
190
191 // This is needed when you want to allocate
192 // AlignedBuffer for use with file I/O classes
193 // Used for unbuffered file I/O when UseOSBuffer() returns false
194 /*size_t GetRequiredBufferAlignment() const {
195 return c_DefaultPageSize;
196 }*/
197
198 rocksdb::Status Append(const rocksdb::Slice& data) override {
cd265ab1 199 fs->append_try_flush(h, data.data(), data.size());
7c673cae
FG
200 return rocksdb::Status::OK();
201 }
202
203 // Positioned write for unbuffered access default forward
204 // to simple append as most of the tests are buffered by default
205 rocksdb::Status PositionedAppend(
206 const rocksdb::Slice& /* data */,
207 uint64_t /* offset */) override {
208 return rocksdb::Status::NotSupported();
209 }
210
211 // Truncate is necessary to trim the file to the correct size
212 // before closing. It is not always possible to keep track of the file
213 // size due to whole pages writes. The behavior is undefined if called
214 // with other writes to follow.
215 rocksdb::Status Truncate(uint64_t size) override {
216 // we mirror the posix env, which does nothing here; instead, it
217 // truncates to the final size on close. whatever!
218 return rocksdb::Status::OK();
219 //int r = fs->truncate(h, size);
220 // return err_to_status(r);
221 }
222
223 rocksdb::Status Close() override {
522d829b 224 fs->fsync(h);
7c673cae
FG
225
226 // mimic posix env, here. shrug.
227 size_t block_size;
228 size_t last_allocated_block;
229 GetPreallocationStatus(&block_size, &last_allocated_block);
230 if (last_allocated_block > 0) {
231 int r = fs->truncate(h, h->pos);
232 if (r < 0)
233 return err_to_status(r);
234 }
235
236 return rocksdb::Status::OK();
237 }
238
239 rocksdb::Status Flush() override {
240 fs->flush(h);
241 return rocksdb::Status::OK();
242 }
243
244 rocksdb::Status Sync() override { // sync data
245 fs->fsync(h);
246 return rocksdb::Status::OK();
247 }
248
249 // true if Sync() and Fsync() are safe to call concurrently with Append()
250 // and Flush().
251 bool IsSyncThreadSafe() const override {
252 return true;
253 }
254
255 // Indicates the upper layers if the current WritableFile implementation
256 // uses direct IO.
257 bool UseDirectIO() const {
258 return false;
259 }
260
11fdf7f2
TL
261 void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override {
262 h->write_hint = (const int)hint;
263 }
264
7c673cae
FG
265 /*
266 * Get the size of valid data in the file.
267 */
268 uint64_t GetFileSize() override {
f67539c2 269 return h->file->fnode.size + h->get_buffer_length();;
7c673cae
FG
270 }
271
272 // For documentation, refer to RandomAccessFile::GetUniqueId()
273 size_t GetUniqueId(char* id, size_t max_size) const override {
274 return snprintf(id, max_size, "%016llx",
275 (unsigned long long)h->file->fnode.ino);
276 }
277
278 // Remove any kind of caching of data from the offset to offset+length
279 // of this file. If the length is 0, then it refers to the end of file.
280 // If the system is not caching the file contents, then this is a noop.
281 // This call has no effect on dirty pages in the cache.
282 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
f67539c2 283 fs->fsync(h);
7c673cae
FG
284 fs->invalidate_cache(h->file, offset, length);
285 return rocksdb::Status::OK();
286 }
287
7c673cae
FG
288 // Sync a file range with disk.
289 // offset is the starting byte of the file range to be synchronized.
290 // nbytes specifies the length of the range to be synchronized.
291 // This asks the OS to initiate flushing the cached data to disk,
292 // without waiting for completion.
1e59de90 293 rocksdb::Status RangeSync(uint64_t offset, uint64_t nbytes) override {
7c673cae
FG
294 // round down to page boundaries
295 int partial = offset & 4095;
296 offset -= partial;
297 nbytes += partial;
298 nbytes &= ~4095;
299 if (nbytes)
300 fs->flush_range(h, offset, nbytes);
301 return rocksdb::Status::OK();
302 }
303
304 protected:
7c673cae
FG
305 /*
306 * Pre-allocate space for a file.
307 */
1e59de90 308 rocksdb::Status Allocate(uint64_t offset, uint64_t len) override {
7c673cae
FG
309 int r = fs->preallocate(h->file, offset, len);
310 return err_to_status(r);
311 }
312};
313
314
315// Directory object represents collection of files and implements
316// filesystem operations that can be executed on directories.
317class BlueRocksDirectory : public rocksdb::Directory {
318 BlueFS *fs;
319 public:
320 explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
321
322 // Fsync directory. Can be called concurrently from multiple threads.
323 rocksdb::Status Fsync() override {
324 // it is sufficient to flush the log.
1911f103 325 fs->sync_metadata(false);
7c673cae
FG
326 return rocksdb::Status::OK();
327 }
328};
329
330// Identifies a locked file.
331class BlueRocksFileLock : public rocksdb::FileLock {
332 public:
333 BlueFS *fs;
334 BlueFS::FileLock *lock;
335 BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
336 ~BlueRocksFileLock() override {
337 }
338};
339
340
341// --------------------
342// --- BlueRocksEnv ---
343// --------------------
344
345BlueRocksEnv::BlueRocksEnv(BlueFS *f)
346 : EnvWrapper(Env::Default()), // forward most of it to POSIX
347 fs(f)
348{
349
350}
351
352rocksdb::Status BlueRocksEnv::NewSequentialFile(
353 const std::string& fname,
354 std::unique_ptr<rocksdb::SequentialFile>* result,
355 const rocksdb::EnvOptions& options)
356{
357 if (fname[0] == '/')
358 return target()->NewSequentialFile(fname, result, options);
b3b6e05e 359 auto [dir, file] = split(fname);
7c673cae
FG
360 BlueFS::FileReader *h;
361 int r = fs->open_for_read(dir, file, &h, false);
362 if (r < 0)
363 return err_to_status(r);
364 result->reset(new BlueRocksSequentialFile(fs, h));
365 return rocksdb::Status::OK();
366}
367
368rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
369 const std::string& fname,
370 std::unique_ptr<rocksdb::RandomAccessFile>* result,
371 const rocksdb::EnvOptions& options)
372{
b3b6e05e 373 auto [dir, file] = split(fname);
7c673cae
FG
374 BlueFS::FileReader *h;
375 int r = fs->open_for_read(dir, file, &h, true);
376 if (r < 0)
377 return err_to_status(r);
378 result->reset(new BlueRocksRandomAccessFile(fs, h));
379 return rocksdb::Status::OK();
380}
381
382rocksdb::Status BlueRocksEnv::NewWritableFile(
383 const std::string& fname,
384 std::unique_ptr<rocksdb::WritableFile>* result,
385 const rocksdb::EnvOptions& options)
386{
b3b6e05e 387 auto [dir, file] = split(fname);
7c673cae
FG
388 BlueFS::FileWriter *h;
389 int r = fs->open_for_write(dir, file, &h, false);
390 if (r < 0)
391 return err_to_status(r);
392 result->reset(new BlueRocksWritableFile(fs, h));
393 return rocksdb::Status::OK();
394}
395
396rocksdb::Status BlueRocksEnv::ReuseWritableFile(
397 const std::string& new_fname,
398 const std::string& old_fname,
399 std::unique_ptr<rocksdb::WritableFile>* result,
400 const rocksdb::EnvOptions& options)
401{
b3b6e05e
TL
402 auto [old_dir, old_file] = split(old_fname);
403 auto [new_dir, new_file] = split(new_fname);
7c673cae
FG
404
405 int r = fs->rename(old_dir, old_file, new_dir, new_file);
406 if (r < 0)
407 return err_to_status(r);
408
409 BlueFS::FileWriter *h;
410 r = fs->open_for_write(new_dir, new_file, &h, true);
411 if (r < 0)
412 return err_to_status(r);
413 result->reset(new BlueRocksWritableFile(fs, h));
20effc67 414 fs->sync_metadata(false);
7c673cae
FG
415 return rocksdb::Status::OK();
416}
417
418rocksdb::Status BlueRocksEnv::NewDirectory(
419 const std::string& name,
420 std::unique_ptr<rocksdb::Directory>* result)
421{
422 if (!fs->dir_exists(name))
11fdf7f2 423 return rocksdb::Status::NotFound(name, strerror(ENOENT));
7c673cae
FG
424 result->reset(new BlueRocksDirectory(fs));
425 return rocksdb::Status::OK();
426}
427
428rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
429{
430 if (fname[0] == '/')
431 return target()->FileExists(fname);
b3b6e05e 432 auto [dir, file] = split(fname);
7c673cae
FG
433 if (fs->stat(dir, file, NULL, NULL) == 0)
434 return rocksdb::Status::OK();
435 return err_to_status(-ENOENT);
436}
437
438rocksdb::Status BlueRocksEnv::GetChildren(
439 const std::string& dir,
440 std::vector<std::string>* result)
441{
d2e6a577 442 result->clear();
7c673cae
FG
443 int r = fs->readdir(dir, result);
444 if (r < 0)
11fdf7f2 445 return rocksdb::Status::NotFound(dir, strerror(ENOENT));// return err_to_status(r);
7c673cae
FG
446 return rocksdb::Status::OK();
447}
448
449rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
450{
b3b6e05e 451 auto [dir, file] = split(fname);
7c673cae
FG
452 int r = fs->unlink(dir, file);
453 if (r < 0)
454 return err_to_status(r);
20effc67 455 fs->sync_metadata(false);
7c673cae
FG
456 return rocksdb::Status::OK();
457}
458
459rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
460{
461 int r = fs->mkdir(dirname);
462 if (r < 0)
463 return err_to_status(r);
464 return rocksdb::Status::OK();
465}
466
467rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
468{
469 int r = fs->mkdir(dirname);
470 if (r < 0 && r != -EEXIST)
471 return err_to_status(r);
472 return rocksdb::Status::OK();
473}
474
475rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
476{
477 int r = fs->rmdir(dirname);
478 if (r < 0)
479 return err_to_status(r);
480 return rocksdb::Status::OK();
481}
482
483rocksdb::Status BlueRocksEnv::GetFileSize(
484 const std::string& fname,
485 uint64_t* file_size)
486{
b3b6e05e 487 auto [dir, file] = split(fname);
7c673cae
FG
488 int r = fs->stat(dir, file, file_size, NULL);
489 if (r < 0)
490 return err_to_status(r);
491 return rocksdb::Status::OK();
492}
493
494rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
495 uint64_t* file_mtime)
496{
b3b6e05e 497 auto [dir, file] = split(fname);
7c673cae
FG
498 utime_t mtime;
499 int r = fs->stat(dir, file, NULL, &mtime);
500 if (r < 0)
501 return err_to_status(r);
502 *file_mtime = mtime.sec();
503 return rocksdb::Status::OK();
504}
505
506rocksdb::Status BlueRocksEnv::RenameFile(
507 const std::string& src,
508 const std::string& target)
509{
b3b6e05e
TL
510 auto [old_dir, old_file] = split(src);
511 auto [new_dir, new_file] = split(target);
7c673cae
FG
512
513 int r = fs->rename(old_dir, old_file, new_dir, new_file);
514 if (r < 0)
515 return err_to_status(r);
20effc67 516 fs->sync_metadata(false);
7c673cae
FG
517 return rocksdb::Status::OK();
518}
519
520rocksdb::Status BlueRocksEnv::LinkFile(
521 const std::string& src,
522 const std::string& target)
523{
524 ceph_abort();
525}
526
11fdf7f2
TL
527rocksdb::Status BlueRocksEnv::AreFilesSame(
528 const std::string& first,
529 const std::string& second, bool* res)
530{
531 for (auto& path : {first, second}) {
532 if (fs->dir_exists(path)) {
533 continue;
534 }
b3b6e05e 535 auto [dir, file] = split(path);
11fdf7f2
TL
536 int r = fs->stat(dir, file, nullptr, nullptr);
537 if (!r) {
538 continue;
539 } else if (r == -ENOENT) {
540 return rocksdb::Status::NotFound("AreFilesSame", path);
541 } else {
542 return err_to_status(r);
543 }
544 }
545 *res = (first == second);
546 return rocksdb::Status::OK();
547}
548
7c673cae
FG
549rocksdb::Status BlueRocksEnv::LockFile(
550 const std::string& fname,
551 rocksdb::FileLock** lock)
552{
b3b6e05e 553 auto [dir, file] = split(fname);
7c673cae
FG
554 BlueFS::FileLock *l = NULL;
555 int r = fs->lock_file(dir, file, &l);
556 if (r < 0)
557 return err_to_status(r);
558 *lock = new BlueRocksFileLock(fs, l);
559 return rocksdb::Status::OK();
560}
561
562rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
563{
564 BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
565 int r = fs->unlock_file(l->lock);
566 if (r < 0)
567 return err_to_status(r);
568 delete lock;
11fdf7f2 569 lock = nullptr;
7c673cae
FG
570 return rocksdb::Status::OK();
571}
572
573rocksdb::Status BlueRocksEnv::GetAbsolutePath(
574 const std::string& db_path,
575 std::string* output_path)
576{
577 // this is a lie...
578 *output_path = "/" + db_path;
579 return rocksdb::Status::OK();
580}
581
582rocksdb::Status BlueRocksEnv::NewLogger(
583 const std::string& fname,
584 std::shared_ptr<rocksdb::Logger>* result)
585{
586 // ignore the filename :)
587 result->reset(create_rocksdb_ceph_logger());
588 return rocksdb::Status::OK();
589}
590
591rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
592{
593 static int foo = 0;
594 *path = "temp_" + stringify(++foo);
595 return rocksdb::Status::OK();
596}