1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "BlueRocksEnv.h"
6 #include "include/stringify.h"
7 #include "kv/RocksDBStore.h"
// Translate an integer error code `r` into a rocksdb::Status.
// The return paths visible here produce OK, NotFound, InvalidArgument and
// IOError (either with no sub-code or carrying the strerror(r) text).
// NOTE(review): the conditions selecting each return are not visible here;
// presumably r is 0 or a negative errno value -- confirm against callers.
12 rocksdb::Status
err_to_status(int r
)
16 return rocksdb::Status::OK();
18 return rocksdb::Status::NotFound(rocksdb::Status::kNone
);
20 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone
);
23 return rocksdb::Status::IOError(rocksdb::Status::kNone
);
// This IOError variant keeps the textual description of r.
25 return rocksdb::Status::IOError(strerror(r
));
// Anything else is treated as fatal: the process is aborted with a message.
// The NotSupported return simply follows the abort call.
28 ceph_abort_msg("unrecognized error code");
29 return rocksdb::Status::NotSupported(rocksdb::Status::kNone
);
// Split the path `fn` at its last slash into a {directory, file name} pair.
// Both results are string_views into the storage of `fn`, so `fn` must
// outlive them. Asserts that `fn` contains at least one slash.
33 std::pair
<std::string_view
, std::string_view
>
34 split(const std::string
&fn
)
36 size_t slash
= fn
.rfind('/');
37 assert(slash
!= fn
.npos
);
// The file name starts right after the last slash.
38 size_t file_begin
= slash
+ 1;
// Loops while the character just before `slash` is also a slash.
// NOTE(review): presumably this trims repeated separators from the end of
// the directory part; the loop body is not visible here -- confirm.
39 while (slash
&& fn
[slash
- 1] == '/')
// first = fn[0, slash), second = fn[file_begin, end)
41 return {string_view(fn
.data(), slash
),
42 string_view(fn
.data() + file_begin
,
43 fn
.size() - file_begin
)};
48 // A file abstraction for reading sequentially through a file
// Implements rocksdb::SequentialFile on top of a BlueFS::FileReader handle;
// the operations below are forwarded to the BlueFS instance `fs`.
49 class BlueRocksSequentialFile
: public rocksdb::SequentialFile
{
// Reader handle of the open file; passed to the BlueFS calls below.
51 BlueFS::FileReader
*h
;
// Remembers the BlueFS instance and the reader handle to operate on.
53 BlueRocksSequentialFile(BlueFS
*fs
, BlueFS::FileReader
*h
) : fs(fs
), h(h
) {}
54 ~BlueRocksSequentialFile() override
{
58 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
59 // written by this routine. Sets "*result" to the data that was
60 // read (including if fewer than "n" bytes were successfully read).
61 // May set "*result" to point at data in "scratch[0..n-1]", so
62 // "scratch[0..n-1]" must be live when "*result" is used.
63 // If an error was encountered, returns a non-OK status.
65 // REQUIRES: External synchronization
// Implementation: reads via fs->read() starting at the current position of
// the reader buffer (h->buf.pos) into scratch, then points *result at
// scratch with length r.
// NOTE(review): r is signed; confirm a negative return is rejected before it
// is used as the Slice length.
66 rocksdb::Status
Read(size_t n
, rocksdb::Slice
* result
, char* scratch
) override
{
67 int64_t r
= fs
->read(h
, h
->buf
.pos
, n
, NULL
, scratch
);
69 *result
= rocksdb::Slice(scratch
, r
);
70 return rocksdb::Status::OK();
73 // Skip "n" bytes from the file. This is guaranteed to be no
74 // slower than reading the same data, but may be faster.
76 // If end of file is reached, skipping will stop at the end of the
77 // file, and Skip will return OK.
79 // REQUIRES: External synchronization
80 rocksdb::Status
Skip(uint64_t n
) override
{
82 return rocksdb::Status::OK();
85 // Remove any kind of caching of data from the offset to offset+length
86 // of this file. If the length is 0, then it refers to the end of file.
87 // If the system is not caching the file contents, then this is a noop.
// Implementation: invalidates the range in the reader buffer (h->buf) and
// then asks BlueFS to invalidate it for the underlying file (h->file).
88 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
89 h
->buf
.invalidate_cache(offset
, length
);
90 fs
->invalidate_cache(h
->file
, offset
, length
);
91 return rocksdb::Status::OK();
95 // A file abstraction for randomly reading the contents of a file.
// Implements rocksdb::RandomAccessFile on top of a BlueFS::FileReader handle;
// the operations below are forwarded to the BlueFS instance `fs`.
96 class BlueRocksRandomAccessFile
: public rocksdb::RandomAccessFile
{
// Reader handle of the open file; passed to the BlueFS calls below.
98 BlueFS::FileReader
*h
;
// Remembers the BlueFS instance and the reader handle to operate on.
100 BlueRocksRandomAccessFile(BlueFS
*fs
, BlueFS::FileReader
*h
) : fs(fs
), h(h
) {}
101 ~BlueRocksRandomAccessFile() override
{
105 // Read up to "n" bytes from the file starting at "offset".
106 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
107 // to the data that was read (including if fewer than "n" bytes were
108 // successfully read). May set "*result" to point at data in
109 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
110 // "*result" is used. If an error was encountered, returns a non-OK
113 // Safe for concurrent use by multiple threads.
// Implementation: delegates to fs->read_random(), which fills scratch; the
// result Slice is scratch with length r.
// NOTE(review): r is signed; confirm a negative return is rejected before it
// is used as the Slice length.
114 rocksdb::Status
Read(uint64_t offset
, size_t n
, rocksdb::Slice
* result
,
115 char* scratch
) const override
{
116 int64_t r
= fs
->read_random(h
, offset
, n
, scratch
);
118 *result
= rocksdb::Slice(scratch
, r
);
119 return rocksdb::Status::OK();
122 // Tries to get a unique ID for this file that will be the same each time
123 // the file is opened (and will stay the same while the file is open).
124 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
125 // ID can be created this function returns the length of the ID and places it
126 // in "id"; otherwise, this function returns 0, in which case "id"
127 // may not have been modified.
129 // This function guarantees, for IDs from a given environment, two unique ids
130 // cannot be made equal to each other by adding arbitrary bytes to one of
131 // them. That is, no unique ID is the prefix of another.
133 // This function guarantees that the returned ID will not be interpretable as
136 // Note: these IDs are only valid for the duration of the process.
// Implementation: the ID is the inode number (fnode.ino) printed as 16
// zero-padded lowercase hex digits.
// NOTE(review): snprintf returns the length it wanted to write, not the
// length stored, so with max_size < 17 the returned value can exceed the
// bytes actually placed in id -- confirm callers always pass a buffer that
// is large enough, or return 0 in that case.
137 size_t GetUniqueId(char* id
, size_t max_size
) const override
{
138 return snprintf(id
, max_size
, "%016llx",
139 (unsigned long long)h
->file
->fnode
.ino
);
142 // Readahead the file starting from offset by n bytes for caching.
// Implementation: issues fs->read() with both output pointers null.
// NOTE(review): presumably this only populates the reader buffer; the return
// value of the read is not inspected -- confirm against BlueFS::read.
143 rocksdb::Status
Prefetch(uint64_t offset
, size_t n
) override
{
144 fs
->read(h
, offset
, n
, nullptr, nullptr);
145 return rocksdb::Status::OK();
148 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
// RANDOM sets the reader prefetch limit (h->buf.max_prefetch) to 4096;
// SEQUENTIAL sets it to the configured bluefs_max_prefetch value.
150 void Hint(AccessPattern pattern
) override
{
151 if (pattern
== RANDOM
)
152 h
->buf
.max_prefetch
= 4096;
153 else if (pattern
== SEQUENTIAL
)
154 h
->buf
.max_prefetch
= fs
->cct
->_conf
->bluefs_max_prefetch
;
// Reports direct I/O exactly when the bluefs_buffered_io option is off.
157 bool use_direct_io() const override
{
158 return !fs
->cct
->_conf
->bluefs_buffered_io
;
161 // Remove any kind of caching of data from the offset to offset+length
162 // of this file. If the length is 0, then it refers to the end of file.
163 // If the system is not caching the file contents, then this is a noop.
// Implementation: invalidates the range in the reader buffer (h->buf) and
// then asks BlueFS to invalidate it for the underlying file (h->file).
164 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
165 h
->buf
.invalidate_cache(offset
, length
);
166 fs
->invalidate_cache(h
->file
, offset
, length
);
167 return rocksdb::Status::OK();
172 // A file abstraction for sequential writing. The implementation
173 // must provide buffering since callers may append small fragments
174 // at a time to the file.
// Implements rocksdb::WritableFile on top of a BlueFS::FileWriter handle;
// the operations below are forwarded to the BlueFS instance `fs`.
175 class BlueRocksWritableFile
: public rocksdb::WritableFile
{
// Writer handle of the open file; passed to the BlueFS calls below.
177 BlueFS::FileWriter
*h
;
// Remembers the BlueFS instance and the writer handle to operate on.
179 BlueRocksWritableFile(BlueFS
*fs
, BlueFS::FileWriter
*h
) : fs(fs
), h(h
) {}
180 ~BlueRocksWritableFile() override
{
184 // Indicates if the class makes use of unbuffered I/O
185 /*bool UseOSBuffer() const {
189 // This is needed when you want to allocate
190 // AlignedBuffer for use with file I/O classes
191 // Used for unbuffered file I/O when UseOSBuffer() returns false
192 /*size_t GetRequiredBufferAlignment() const {
193 return c_DefaultPageSize;
196 rocksdb::Status
Append(const rocksdb::Slice
& data
) override
{
197 fs
->append_try_flush(h
, data
.data(), data
.size());
198 return rocksdb::Status::OK();
201 // Positioned write for unbuffered access default forward
202 // to simple append as most of the tests are buffered by default
203 rocksdb::Status
PositionedAppend(
204 const rocksdb::Slice
& /* data */,
205 uint64_t /* offset */) override
{
206 return rocksdb::Status::NotSupported();
209 // Truncate is necessary to trim the file to the correct size
210 // before closing. It is not always possible to keep track of the file
211 // size due to whole pages writes. The behavior is undefined if called
212 // with other writes to follow.
213 rocksdb::Status
Truncate(uint64_t size
) override
{
214 // we mirror the posix env, which does nothing here; instead, it
215 // truncates to the final size on close. whatever!
216 return rocksdb::Status::OK();
217 //int r = fs->truncate(h, size);
218 // return err_to_status(r);
// Close: queries the preallocation status and, when a last allocated block
// is reported, truncates the file to the current writer position (h->pos).
// NOTE(review): the condition guarding the err_to_status return is not
// visible here -- confirm it only fires on a failed truncate.
221 rocksdb::Status
Close() override
{
224 // mimic posix env, here. shrug.
226 size_t last_allocated_block
;
227 GetPreallocationStatus(&block_size
, &last_allocated_block
);
228 if (last_allocated_block
> 0) {
229 int r
= fs
->truncate(h
, h
->pos
);
231 return err_to_status(r
);
234 return rocksdb::Status::OK();
237 rocksdb::Status
Flush() override
{
239 return rocksdb::Status::OK();
242 rocksdb::Status
Sync() override
{ // sync data
244 return rocksdb::Status::OK();
247 // true if Sync() and Fsync() are safe to call concurrently with Append()
249 bool IsSyncThreadSafe() const override
{
253 // Indicates the upper layers if the current WritableFile implementation
255 bool UseDirectIO() const {
// Stores the RocksDB write lifetime hint on the writer handle as an int.
// NOTE(review): the C-style (const int) cast could be a static_cast<int>.
259 void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint
) override
{
260 h
->write_hint
= (const int)hint
;
264 * Get the size of valid data in the file.
// Returns the size recorded in the fnode plus h->get_buffer_length().
// NOTE(review): presumably the second term is data appended but not yet
// flushed -- confirm. The doubled semicolon below is a harmless typo.
266 uint64_t GetFileSize() override
{
267 return h
->file
->fnode
.size
+ h
->get_buffer_length();;
270 // For documentation, refer to RandomAccessFile::GetUniqueId()
// The ID is the inode number (fnode.ino) as 16 zero-padded hex digits.
// NOTE(review): snprintf returns the length it wanted to write, so with
// max_size < 17 the returned value can exceed the bytes stored in id.
271 size_t GetUniqueId(char* id
, size_t max_size
) const override
{
272 return snprintf(id
, max_size
, "%016llx",
273 (unsigned long long)h
->file
->fnode
.ino
);
276 // Remove any kind of caching of data from the offset to offset+length
277 // of this file. If the length is 0, then it refers to the end of file.
278 // If the system is not caching the file contents, then this is a noop.
279 // This call has no effect on dirty pages in the cache.
280 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
282 fs
->invalidate_cache(h
->file
, offset
, length
);
283 return rocksdb::Status::OK();
286 using rocksdb::WritableFile::RangeSync
;
287 // Sync a file range with disk.
288 // offset is the starting byte of the file range to be synchronized.
289 // nbytes specifies the length of the range to be synchronized.
290 // This asks the OS to initiate flushing the cached data to disk,
291 // without waiting for completion.
292 // Default implementation does nothing.
// Implementation: forwards the range to fs->flush_range().
293 rocksdb::Status
RangeSync(off_t offset
, off_t nbytes
) {
294 // round down to page boundaries
// partial is the position of offset inside its 4096-byte page.
295 int partial
= offset
& 4095;
300 fs
->flush_range(h
, offset
, nbytes
);
301 return rocksdb::Status::OK();
305 using rocksdb::WritableFile::Allocate
;
307 * Pre-allocate space for a file.
// Forwards to fs->preallocate() on the underlying file and converts the
// integer result with err_to_status().
309 rocksdb::Status
Allocate(off_t offset
, off_t len
) {
310 int r
= fs
->preallocate(h
->file
, offset
, len
);
311 return err_to_status(r
);
316 // Directory object represents collection of files and implements
317 // filesystem operations that can be executed on directories.
// Implements rocksdb::Directory; it keeps only the BlueFS instance, no
// per-directory state is stored by the constructor.
318 class BlueRocksDirectory
: public rocksdb::Directory
{
321 explicit BlueRocksDirectory(BlueFS
*f
) : fs(f
) {}
323 // Fsync directory. Can be called concurrently from multiple threads.
// Implementation: calls fs->sync_metadata(false) and always reports OK.
324 rocksdb::Status
Fsync() override
{
325 // it is sufficient to flush the log.
326 fs
->sync_metadata(false);
327 return rocksdb::Status::OK();
331 // Identifies a locked file.
// Implements rocksdb::FileLock by pairing the BlueFS instance with the
// BlueFS::FileLock obtained from it.
332 class BlueRocksFileLock
: public rocksdb::FileLock
{
// The underlying BlueFS lock; read by BlueRocksEnv::UnlockFile.
335 BlueFS::FileLock
*lock
;
336 BlueRocksFileLock(BlueFS
*fs
, BlueFS::FileLock
*l
) : fs(fs
), lock(l
) { }
337 ~BlueRocksFileLock() override
{
342 // --------------------
343 // --- BlueRocksEnv ---
344 // --------------------
// Builds the environment around the BlueFS instance `f`.
// The EnvWrapper base wraps Env::Default(), so anything this class does not
// handle itself is forwarded to the default environment.
346 BlueRocksEnv::BlueRocksEnv(BlueFS
*f
)
347 : EnvWrapper(Env::Default()), // forward most of it to POSIX
// Open `fname` for sequential reading and store the new file in *result.
// One path hands the request, including `options`, to the wrapped target()
// environment. The other splits fname into directory and file name, opens it
// with fs->open_for_read(dir, file, &h, false) and wraps the reader handle
// in a BlueRocksSequentialFile. Open errors go through err_to_status().
// NOTE(review): the condition selecting the target() path and the check on r
// are not visible here -- confirm.
353 rocksdb::Status
BlueRocksEnv::NewSequentialFile(
354 const std::string
& fname
,
355 std::unique_ptr
<rocksdb::SequentialFile
>* result
,
356 const rocksdb::EnvOptions
& options
)
359 return target()->NewSequentialFile(fname
, result
, options
);
360 auto [dir
, file
] = split(fname
);
361 BlueFS::FileReader
*h
;
362 int r
= fs
->open_for_read(dir
, file
, &h
, false);
364 return err_to_status(r
);
365 result
->reset(new BlueRocksSequentialFile(fs
, h
));
366 return rocksdb::Status::OK();
// Open `fname` for random-access reading and store the new file in *result.
// Splits fname into directory and file name, opens it with
// fs->open_for_read(dir, file, &h, true) and wraps the reader handle in a
// BlueRocksRandomAccessFile. Open errors go through err_to_status().
// `options` is not used by the visible code.
// NOTE(review): the last open_for_read argument is true here and false in
// NewSequentialFile; presumably it selects random mode -- confirm.
369 rocksdb::Status
BlueRocksEnv::NewRandomAccessFile(
370 const std::string
& fname
,
371 std::unique_ptr
<rocksdb::RandomAccessFile
>* result
,
372 const rocksdb::EnvOptions
& options
)
374 auto [dir
, file
] = split(fname
);
375 BlueFS::FileReader
*h
;
376 int r
= fs
->open_for_read(dir
, file
, &h
, true);
378 return err_to_status(r
);
379 result
->reset(new BlueRocksRandomAccessFile(fs
, h
));
380 return rocksdb::Status::OK();
// Open `fname` for writing and store the new file in *result.
// Splits fname into directory and file name, opens it with
// fs->open_for_write(dir, file, &h, false) and wraps the writer handle in a
// BlueRocksWritableFile. Open errors go through err_to_status().
// `options` is not used by the visible code.
383 rocksdb::Status
BlueRocksEnv::NewWritableFile(
384 const std::string
& fname
,
385 std::unique_ptr
<rocksdb::WritableFile
>* result
,
386 const rocksdb::EnvOptions
& options
)
388 auto [dir
, file
] = split(fname
);
389 BlueFS::FileWriter
*h
;
390 int r
= fs
->open_for_write(dir
, file
, &h
, false);
392 return err_to_status(r
);
393 result
->reset(new BlueRocksWritableFile(fs
, h
));
394 return rocksdb::Status::OK();
// Reuse an existing file under a new name and open it for writing.
// Step 1: rename old_fname to new_fname inside BlueFS.
// Step 2: open the new name with fs->open_for_write(..., true) and wrap the
// writer handle in a BlueRocksWritableFile stored in *result.
// Errors from either step go through err_to_status().
// NOTE(review): the last open_for_write argument is true here and false in
// NewWritableFile; presumably it requests overwrite -- confirm.
397 rocksdb::Status
BlueRocksEnv::ReuseWritableFile(
398 const std::string
& new_fname
,
399 const std::string
& old_fname
,
400 std::unique_ptr
<rocksdb::WritableFile
>* result
,
401 const rocksdb::EnvOptions
& options
)
403 auto [old_dir
, old_file
] = split(old_fname
);
404 auto [new_dir
, new_file
] = split(new_fname
);
406 int r
= fs
->rename(old_dir
, old_file
, new_dir
, new_file
);
408 return err_to_status(r
);
410 BlueFS::FileWriter
*h
;
411 r
= fs
->open_for_write(new_dir
, new_file
, &h
, true);
413 return err_to_status(r
);
414 result
->reset(new BlueRocksWritableFile(fs
, h
));
415 return rocksdb::Status::OK();
418 rocksdb::Status
BlueRocksEnv::NewDirectory(
419 const std::string
& name
,
420 std::unique_ptr
<rocksdb::Directory
>* result
)
422 if (!fs
->dir_exists(name
))
423 return rocksdb::Status::NotFound(name
, strerror(ENOENT
));
424 result
->reset(new BlueRocksDirectory(fs
));
425 return rocksdb::Status::OK();
// Report whether `fname` exists.
// One path forwards the question to the wrapped target() environment. The
// other splits fname and calls fs->stat(); a zero result means OK, and any
// other result is reported as err_to_status(-ENOENT), regardless of the
// actual stat error.
// NOTE(review): the condition selecting the target() path is not visible
// here -- confirm.
428 rocksdb::Status
BlueRocksEnv::FileExists(const std::string
& fname
)
431 return target()->FileExists(fname
);
432 auto [dir
, file
] = split(fname
);
433 if (fs
->stat(dir
, file
, NULL
, NULL
) == 0)
434 return rocksdb::Status::OK();
435 return err_to_status(-ENOENT
);
// List the entries of directory `dir` into *result via fs->readdir().
// The failure path reports NotFound with the ENOENT message instead of
// converting r with err_to_status() (that variant is commented out below).
// NOTE(review): the check on r is not visible here -- confirm.
438 rocksdb::Status
BlueRocksEnv::GetChildren(
439 const std::string
& dir
,
440 std::vector
<std::string
>* result
)
443 int r
= fs
->readdir(dir
, result
);
445 return rocksdb::Status::NotFound(dir
, strerror(ENOENT
));// return err_to_status(r);
446 return rocksdb::Status::OK();
// Delete `fname`: split it into directory and file name and call
// fs->unlink(). A failing result goes through err_to_status().
// NOTE(review): the check on r is not visible here -- confirm.
449 rocksdb::Status
BlueRocksEnv::DeleteFile(const std::string
& fname
)
451 auto [dir
, file
] = split(fname
);
452 int r
= fs
->unlink(dir
, file
);
454 return err_to_status(r
);
455 return rocksdb::Status::OK();
// Create directory `dirname` via fs->mkdir(). A failing result goes through
// err_to_status(); unlike CreateDirIfMissing, no error code is exempted in
// the visible code.
// NOTE(review): the check on r is not visible here -- confirm.
458 rocksdb::Status
BlueRocksEnv::CreateDir(const std::string
& dirname
)
460 int r
= fs
->mkdir(dirname
);
462 return err_to_status(r
);
463 return rocksdb::Status::OK();
466 rocksdb::Status
BlueRocksEnv::CreateDirIfMissing(const std::string
& dirname
)
468 int r
= fs
->mkdir(dirname
);
469 if (r
< 0 && r
!= -EEXIST
)
470 return err_to_status(r
);
471 return rocksdb::Status::OK();
// Remove directory `dirname` via fs->rmdir(). A failing result goes through
// err_to_status().
// NOTE(review): the check on r is not visible here -- confirm.
474 rocksdb::Status
BlueRocksEnv::DeleteDir(const std::string
& dirname
)
476 int r
= fs
->rmdir(dirname
);
478 return err_to_status(r
);
479 return rocksdb::Status::OK();
// Look up the size of `fname`: split it into directory and file name and let
// fs->stat() write the size through file_size (no mtime is requested).
// A failing result goes through err_to_status().
// NOTE(review): the file_size parameter declaration and the check on r are
// not visible here -- confirm.
482 rocksdb::Status
BlueRocksEnv::GetFileSize(
483 const std::string
& fname
,
486 auto [dir
, file
] = split(fname
);
487 int r
= fs
->stat(dir
, file
, file_size
, NULL
);
489 return err_to_status(r
);
490 return rocksdb::Status::OK();
// Look up the modification time of `fname` via fs->stat() (no size is
// requested) and store its whole-second part, mtime.sec(), in *file_mtime.
// A failing stat goes through err_to_status().
// NOTE(review): the declaration of mtime and the check on r are not visible
// here -- confirm.
493 rocksdb::Status
BlueRocksEnv::GetFileModificationTime(const std::string
& fname
,
494 uint64_t* file_mtime
)
496 auto [dir
, file
] = split(fname
);
498 int r
= fs
->stat(dir
, file
, NULL
, &mtime
);
500 return err_to_status(r
);
501 *file_mtime
= mtime
.sec();
502 return rocksdb::Status::OK();
// Rename `src` to `target`: both paths are split into directory and file
// name and passed to fs->rename(). A failing result goes through
// err_to_status().
// NOTE(review): the check on r is not visible here -- confirm.
505 rocksdb::Status
BlueRocksEnv::RenameFile(
506 const std::string
& src
,
507 const std::string
& target
)
509 auto [old_dir
, old_file
] = split(src
);
510 auto [new_dir
, new_file
] = split(target
);
512 int r
= fs
->rename(old_dir
, old_file
, new_dir
, new_file
);
514 return err_to_status(r
);
515 return rocksdb::Status::OK();
// Env hook taking a source path `src` and a destination path `target`.
// NOTE(review): the implementation is not visible here -- confirm whether
// linking is supported on BlueFS and what status is reported.
518 rocksdb::Status
BlueRocksEnv::LinkFile(
519 const std::string
& src
,
520 const std::string
& target
)
// Decide whether `first` and `second` refer to the same file.
// Each path is first checked: it is either a directory known to BlueFS or is
// looked up with fs->stat(). A stat result of -ENOENT yields NotFound naming
// the path; other failures go through err_to_status().
// The answer stored in *res is a plain comparison of the two path strings.
// NOTE(review): parts of the per-path branch are not visible here -- confirm
// what happens for an existing directory and for a successful stat.
525 rocksdb::Status
BlueRocksEnv::AreFilesSame(
526 const std::string
& first
,
527 const std::string
& second
, bool* res
)
529 for (auto& path
: {first
, second
}) {
530 if (fs
->dir_exists(path
)) {
533 auto [dir
, file
] = split(path
);
534 int r
= fs
->stat(dir
, file
, nullptr, nullptr);
537 } else if (r
== -ENOENT
) {
538 return rocksdb::Status::NotFound("AreFilesSame", path
);
540 return err_to_status(r
);
// Equality is decided on the path strings only.
543 *res
= (first
== second
);
544 return rocksdb::Status::OK();
// Lock `fname`: split it into directory and file name, obtain a
// BlueFS::FileLock through fs->lock_file() and hand the caller a newly
// allocated BlueRocksFileLock wrapping it through *lock.
// A failing lock_file goes through err_to_status().
// NOTE(review): the wrapper is created with new; presumably UnlockFile is
// responsible for releasing it -- confirm.
547 rocksdb::Status
BlueRocksEnv::LockFile(
548 const std::string
& fname
,
549 rocksdb::FileLock
** lock
)
551 auto [dir
, file
] = split(fname
);
552 BlueFS::FileLock
*l
= NULL
;
553 int r
= fs
->lock_file(dir
, file
, &l
);
555 return err_to_status(r
);
556 *lock
= new BlueRocksFileLock(fs
, l
);
557 return rocksdb::Status::OK();
// Release a lock: the generic FileLock is downcast with static_cast to
// BlueRocksFileLock and its BlueFS lock is passed to fs->unlock_file().
// A failing unlock goes through err_to_status().
// NOTE(review): the unchecked downcast assumes `lock` came from LockFile of
// this class; the steps between the error return and the final OK are not
// visible here -- confirm the wrapper object is freed.
560 rocksdb::Status
BlueRocksEnv::UnlockFile(rocksdb::FileLock
* lock
)
562 BlueRocksFileLock
*l
= static_cast<BlueRocksFileLock
*>(lock
);
563 int r
= fs
->unlock_file(l
->lock
);
565 return err_to_status(r
);
568 return rocksdb::Status::OK();
// Produce an absolute form of `db_path` by prepending a single slash; the
// input is not inspected or normalized. Always returns OK.
571 rocksdb::Status
BlueRocksEnv::GetAbsolutePath(
572 const std::string
& db_path
,
573 std::string
* output_path
)
576 *output_path
= "/" + db_path
;
577 return rocksdb::Status::OK();
580 rocksdb::Status
BlueRocksEnv::NewLogger(
581 const std::string
& fname
,
582 std::shared_ptr
<rocksdb::Logger
>* result
)
584 // ignore the filename :)
585 result
->reset(create_rocksdb_ceph_logger());
586 return rocksdb::Status::OK();
// Hand out a test directory name of the form temp_<N> in *path, where N is
// the value of the counter `foo` after incrementing it, so successive calls
// yield different names. Always returns OK.
// NOTE(review): `foo` is declared outside the visible lines and the
// increment is not visibly synchronized -- confirm this is never called
// concurrently.
589 rocksdb::Status
BlueRocksEnv::GetTestDirectory(std::string
* path
)
592 *path
= "temp_" + stringify(++foo
);
593 return rocksdb::Status::OK();