1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "BlueRocksEnv.h"
6 #include "include/stringify.h"
7 #include "kv/RocksDBStore.h"
10 rocksdb::Status
err_to_status(int r
)
14 return rocksdb::Status::OK();
16 return rocksdb::Status::NotFound(rocksdb::Status::kNone
);
18 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone
);
21 return rocksdb::Status::IOError(rocksdb::Status::kNone
);
23 return rocksdb::Status::IOError(strerror(r
));
26 ceph_abort_msg("unrecognized error code");
27 return rocksdb::Status::NotSupported(rocksdb::Status::kNone
);
31 // A file abstraction for reading sequentially through a file
32 class BlueRocksSequentialFile
: public rocksdb::SequentialFile
{
34 BlueFS::FileReader
*h
;
36 BlueRocksSequentialFile(BlueFS
*fs
, BlueFS::FileReader
*h
) : fs(fs
), h(h
) {}
37 ~BlueRocksSequentialFile() override
{
41 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
42 // written by this routine. Sets "*result" to the data that was
43 // read (including if fewer than "n" bytes were successfully read).
44 // May set "*result" to point at data in "scratch[0..n-1]", so
45 // "scratch[0..n-1]" must be live when "*result" is used.
46 // If an error was encountered, returns a non-OK status.
48 // REQUIRES: External synchronization
49 rocksdb::Status
Read(size_t n
, rocksdb::Slice
* result
, char* scratch
) override
{
50 int r
= fs
->read(h
, &h
->buf
, h
->buf
.pos
, n
, NULL
, scratch
);
52 *result
= rocksdb::Slice(scratch
, r
);
53 return rocksdb::Status::OK();
56 // Skip "n" bytes from the file. This is guaranteed to be no
57 // slower than reading the same data, but may be faster.
59 // If end of file is reached, skipping will stop at the end of the
60 // file, and Skip will return OK.
62 // REQUIRES: External synchronization
63 rocksdb::Status
Skip(uint64_t n
) override
{
65 return rocksdb::Status::OK();
68 // Remove any kind of caching of data from the offset to offset+length
69 // of this file. If the length is 0, then it refers to the end of file.
70 // If the system is not caching the file contents, then this is a noop.
71 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
72 fs
->invalidate_cache(h
->file
, offset
, length
);
73 return rocksdb::Status::OK();
77 // A file abstraction for randomly reading the contents of a file.
78 class BlueRocksRandomAccessFile
: public rocksdb::RandomAccessFile
{
80 BlueFS::FileReader
*h
;
82 BlueRocksRandomAccessFile(BlueFS
*fs
, BlueFS::FileReader
*h
) : fs(fs
), h(h
) {}
83 ~BlueRocksRandomAccessFile() override
{
87 // Read up to "n" bytes from the file starting at "offset".
88 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
89 // to the data that was read (including if fewer than "n" bytes were
90 // successfully read). May set "*result" to point at data in
91 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
92 // "*result" is used. If an error was encountered, returns a non-OK
95 // Safe for concurrent use by multiple threads.
96 rocksdb::Status
Read(uint64_t offset
, size_t n
, rocksdb::Slice
* result
,
97 char* scratch
) const override
{
98 int r
= fs
->read_random(h
, offset
, n
, scratch
);
100 *result
= rocksdb::Slice(scratch
, r
);
101 return rocksdb::Status::OK();
104 // Tries to get an unique ID for this file that will be the same each time
105 // the file is opened (and will stay the same while the file is open).
106 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
107 // ID can be created this function returns the length of the ID and places it
108 // in "id"; otherwise, this function returns 0, in which case "id"
109 // may not have been modified.
111 // This function guarantees, for IDs from a given environment, two unique ids
112 // cannot be made equal to eachother by adding arbitrary bytes to one of
113 // them. That is, no unique ID is the prefix of another.
115 // This function guarantees that the returned ID will not be interpretable as
118 // Note: these IDs are only valid for the duration of the process.
119 size_t GetUniqueId(char* id
, size_t max_size
) const override
{
120 return snprintf(id
, max_size
, "%016llx",
121 (unsigned long long)h
->file
->fnode
.ino
);
124 // Readahead the file starting from offset by n bytes for caching.
125 rocksdb::Status
Prefetch(uint64_t offset
, size_t n
) override
{
126 fs
->read(h
, &h
->buf
, offset
, n
, nullptr, nullptr);
127 return rocksdb::Status::OK();
130 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
132 void Hint(AccessPattern pattern
) override
{
133 if (pattern
== RANDOM
)
134 h
->buf
.max_prefetch
= 4096;
135 else if (pattern
== SEQUENTIAL
)
136 h
->buf
.max_prefetch
= fs
->cct
->_conf
->bluefs_max_prefetch
;
139 // Remove any kind of caching of data from the offset to offset+length
140 // of this file. If the length is 0, then it refers to the end of file.
141 // If the system is not caching the file contents, then this is a noop.
142 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
143 fs
->invalidate_cache(h
->file
, offset
, length
);
144 return rocksdb::Status::OK();
149 // A file abstraction for sequential writing. The implementation
150 // must provide buffering since callers may append small fragments
151 // at a time to the file.
152 class BlueRocksWritableFile
: public rocksdb::WritableFile
{
154 BlueFS::FileWriter
*h
;
156 BlueRocksWritableFile(BlueFS
*fs
, BlueFS::FileWriter
*h
) : fs(fs
), h(h
) {}
157 ~BlueRocksWritableFile() override
{
161 // Indicates if the class makes use of unbuffered I/O
162 /*bool UseOSBuffer() const {
166 // This is needed when you want to allocate
167 // AlignedBuffer for use with file I/O classes
168 // Used for unbuffered file I/O when UseOSBuffer() returns false
169 /*size_t GetRequiredBufferAlignment() const {
170 return c_DefaultPageSize;
173 rocksdb::Status
Append(const rocksdb::Slice
& data
) override
{
174 h
->append(data
.data(), data
.size());
175 return rocksdb::Status::OK();
178 // Positioned write for unbuffered access default forward
179 // to simple append as most of the tests are buffered by default
180 rocksdb::Status
PositionedAppend(
181 const rocksdb::Slice
& /* data */,
182 uint64_t /* offset */) override
{
183 return rocksdb::Status::NotSupported();
186 // Truncate is necessary to trim the file to the correct size
187 // before closing. It is not always possible to keep track of the file
188 // size due to whole pages writes. The behavior is undefined if called
189 // with other writes to follow.
190 rocksdb::Status
Truncate(uint64_t size
) override
{
191 // we mirror the posix env, which does nothing here; instead, it
192 // truncates to the final size on close. whatever!
193 return rocksdb::Status::OK();
194 //int r = fs->truncate(h, size);
195 // return err_to_status(r);
198 rocksdb::Status
Close() override
{
201 // mimic posix env, here. shrug.
203 size_t last_allocated_block
;
204 GetPreallocationStatus(&block_size
, &last_allocated_block
);
205 if (last_allocated_block
> 0) {
206 int r
= fs
->truncate(h
, h
->pos
);
208 return err_to_status(r
);
211 return rocksdb::Status::OK();
214 rocksdb::Status
Flush() override
{
216 return rocksdb::Status::OK();
219 rocksdb::Status
Sync() override
{ // sync data
221 return rocksdb::Status::OK();
224 // true if Sync() and Fsync() are safe to call concurrently with Append()
226 bool IsSyncThreadSafe() const override
{
230 // Indicates the upper layers if the current WritableFile implementation
232 bool UseDirectIO() const {
236 void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint
) override
{
237 h
->write_hint
= (const int)hint
;
241 * Get the size of valid data in the file.
243 uint64_t GetFileSize() override
{
244 return h
->file
->fnode
.size
+ h
->buffer
.length();;
247 // For documentation, refer to RandomAccessFile::GetUniqueId()
248 size_t GetUniqueId(char* id
, size_t max_size
) const override
{
249 return snprintf(id
, max_size
, "%016llx",
250 (unsigned long long)h
->file
->fnode
.ino
);
253 // Remove any kind of caching of data from the offset to offset+length
254 // of this file. If the length is 0, then it refers to the end of file.
255 // If the system is not caching the file contents, then this is a noop.
256 // This call has no effect on dirty pages in the cache.
257 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
258 fs
->invalidate_cache(h
->file
, offset
, length
);
259 return rocksdb::Status::OK();
262 using rocksdb::WritableFile::RangeSync
;
263 // Sync a file range with disk.
264 // offset is the starting byte of the file range to be synchronized.
265 // nbytes specifies the length of the range to be synchronized.
266 // This asks the OS to initiate flushing the cached data to disk,
267 // without waiting for completion.
268 // Default implementation does nothing.
269 rocksdb::Status
RangeSync(off_t offset
, off_t nbytes
) {
270 // round down to page boundaries
271 int partial
= offset
& 4095;
276 fs
->flush_range(h
, offset
, nbytes
);
277 return rocksdb::Status::OK();
281 using rocksdb::WritableFile::Allocate
;
283 * Pre-allocate space for a file.
285 rocksdb::Status
Allocate(off_t offset
, off_t len
) {
286 int r
= fs
->preallocate(h
->file
, offset
, len
);
287 return err_to_status(r
);
292 // Directory object represents collection of files and implements
293 // filesystem operations that can be executed on directories.
294 class BlueRocksDirectory
: public rocksdb::Directory
{
297 explicit BlueRocksDirectory(BlueFS
*f
) : fs(f
) {}
299 // Fsync directory. Can be called concurrently from multiple threads.
300 rocksdb::Status
Fsync() override
{
301 // it is sufficient to flush the log.
302 fs
->sync_metadata(false);
303 return rocksdb::Status::OK();
307 // Identifies a locked file.
308 class BlueRocksFileLock
: public rocksdb::FileLock
{
311 BlueFS::FileLock
*lock
;
312 BlueRocksFileLock(BlueFS
*fs
, BlueFS::FileLock
*l
) : fs(fs
), lock(l
) { }
313 ~BlueRocksFileLock() override
{
318 // --------------------
319 // --- BlueRocksEnv ---
320 // --------------------
322 BlueRocksEnv::BlueRocksEnv(BlueFS
*f
)
323 : EnvWrapper(Env::Default()), // forward most of it to POSIX
329 rocksdb::Status
BlueRocksEnv::NewSequentialFile(
330 const std::string
& fname
,
331 std::unique_ptr
<rocksdb::SequentialFile
>* result
,
332 const rocksdb::EnvOptions
& options
)
335 return target()->NewSequentialFile(fname
, result
, options
);
336 std::string dir
, file
;
337 split(fname
, &dir
, &file
);
338 BlueFS::FileReader
*h
;
339 int r
= fs
->open_for_read(dir
, file
, &h
, false);
341 return err_to_status(r
);
342 result
->reset(new BlueRocksSequentialFile(fs
, h
));
343 return rocksdb::Status::OK();
346 rocksdb::Status
BlueRocksEnv::NewRandomAccessFile(
347 const std::string
& fname
,
348 std::unique_ptr
<rocksdb::RandomAccessFile
>* result
,
349 const rocksdb::EnvOptions
& options
)
351 std::string dir
, file
;
352 split(fname
, &dir
, &file
);
353 BlueFS::FileReader
*h
;
354 int r
= fs
->open_for_read(dir
, file
, &h
, true);
356 return err_to_status(r
);
357 result
->reset(new BlueRocksRandomAccessFile(fs
, h
));
358 return rocksdb::Status::OK();
361 rocksdb::Status
BlueRocksEnv::NewWritableFile(
362 const std::string
& fname
,
363 std::unique_ptr
<rocksdb::WritableFile
>* result
,
364 const rocksdb::EnvOptions
& options
)
366 std::string dir
, file
;
367 split(fname
, &dir
, &file
);
368 BlueFS::FileWriter
*h
;
369 int r
= fs
->open_for_write(dir
, file
, &h
, false);
371 return err_to_status(r
);
372 result
->reset(new BlueRocksWritableFile(fs
, h
));
373 return rocksdb::Status::OK();
376 rocksdb::Status
BlueRocksEnv::ReuseWritableFile(
377 const std::string
& new_fname
,
378 const std::string
& old_fname
,
379 std::unique_ptr
<rocksdb::WritableFile
>* result
,
380 const rocksdb::EnvOptions
& options
)
382 std::string old_dir
, old_file
;
383 split(old_fname
, &old_dir
, &old_file
);
384 std::string new_dir
, new_file
;
385 split(new_fname
, &new_dir
, &new_file
);
387 int r
= fs
->rename(old_dir
, old_file
, new_dir
, new_file
);
389 return err_to_status(r
);
391 BlueFS::FileWriter
*h
;
392 r
= fs
->open_for_write(new_dir
, new_file
, &h
, true);
394 return err_to_status(r
);
395 result
->reset(new BlueRocksWritableFile(fs
, h
));
396 return rocksdb::Status::OK();
399 rocksdb::Status
BlueRocksEnv::NewDirectory(
400 const std::string
& name
,
401 std::unique_ptr
<rocksdb::Directory
>* result
)
403 if (!fs
->dir_exists(name
))
404 return rocksdb::Status::NotFound(name
, strerror(ENOENT
));
405 result
->reset(new BlueRocksDirectory(fs
));
406 return rocksdb::Status::OK();
409 rocksdb::Status
BlueRocksEnv::FileExists(const std::string
& fname
)
412 return target()->FileExists(fname
);
413 std::string dir
, file
;
414 split(fname
, &dir
, &file
);
415 if (fs
->stat(dir
, file
, NULL
, NULL
) == 0)
416 return rocksdb::Status::OK();
417 return err_to_status(-ENOENT
);
420 rocksdb::Status
BlueRocksEnv::GetChildren(
421 const std::string
& dir
,
422 std::vector
<std::string
>* result
)
425 int r
= fs
->readdir(dir
, result
);
427 return rocksdb::Status::NotFound(dir
, strerror(ENOENT
));// return err_to_status(r);
428 return rocksdb::Status::OK();
431 rocksdb::Status
BlueRocksEnv::DeleteFile(const std::string
& fname
)
433 std::string dir
, file
;
434 split(fname
, &dir
, &file
);
435 int r
= fs
->unlink(dir
, file
);
437 return err_to_status(r
);
438 return rocksdb::Status::OK();
441 rocksdb::Status
BlueRocksEnv::CreateDir(const std::string
& dirname
)
443 int r
= fs
->mkdir(dirname
);
445 return err_to_status(r
);
446 return rocksdb::Status::OK();
449 rocksdb::Status
BlueRocksEnv::CreateDirIfMissing(const std::string
& dirname
)
451 int r
= fs
->mkdir(dirname
);
452 if (r
< 0 && r
!= -EEXIST
)
453 return err_to_status(r
);
454 return rocksdb::Status::OK();
457 rocksdb::Status
BlueRocksEnv::DeleteDir(const std::string
& dirname
)
459 int r
= fs
->rmdir(dirname
);
461 return err_to_status(r
);
462 return rocksdb::Status::OK();
465 rocksdb::Status
BlueRocksEnv::GetFileSize(
466 const std::string
& fname
,
469 std::string dir
, file
;
470 split(fname
, &dir
, &file
);
471 int r
= fs
->stat(dir
, file
, file_size
, NULL
);
473 return err_to_status(r
);
474 return rocksdb::Status::OK();
477 rocksdb::Status
BlueRocksEnv::GetFileModificationTime(const std::string
& fname
,
478 uint64_t* file_mtime
)
480 std::string dir
, file
;
481 split(fname
, &dir
, &file
);
483 int r
= fs
->stat(dir
, file
, NULL
, &mtime
);
485 return err_to_status(r
);
486 *file_mtime
= mtime
.sec();
487 return rocksdb::Status::OK();
490 rocksdb::Status
BlueRocksEnv::RenameFile(
491 const std::string
& src
,
492 const std::string
& target
)
494 std::string old_dir
, old_file
;
495 split(src
, &old_dir
, &old_file
);
496 std::string new_dir
, new_file
;
497 split(target
, &new_dir
, &new_file
);
499 int r
= fs
->rename(old_dir
, old_file
, new_dir
, new_file
);
501 return err_to_status(r
);
502 return rocksdb::Status::OK();
505 rocksdb::Status
BlueRocksEnv::LinkFile(
506 const std::string
& src
,
507 const std::string
& target
)
512 rocksdb::Status
BlueRocksEnv::AreFilesSame(
513 const std::string
& first
,
514 const std::string
& second
, bool* res
)
516 for (auto& path
: {first
, second
}) {
517 if (fs
->dir_exists(path
)) {
520 std::string dir
, file
;
521 split(path
, &dir
, &file
);
522 int r
= fs
->stat(dir
, file
, nullptr, nullptr);
525 } else if (r
== -ENOENT
) {
526 return rocksdb::Status::NotFound("AreFilesSame", path
);
528 return err_to_status(r
);
531 *res
= (first
== second
);
532 return rocksdb::Status::OK();
535 rocksdb::Status
BlueRocksEnv::LockFile(
536 const std::string
& fname
,
537 rocksdb::FileLock
** lock
)
539 std::string dir
, file
;
540 split(fname
, &dir
, &file
);
541 BlueFS::FileLock
*l
= NULL
;
542 int r
= fs
->lock_file(dir
, file
, &l
);
544 return err_to_status(r
);
545 *lock
= new BlueRocksFileLock(fs
, l
);
546 return rocksdb::Status::OK();
549 rocksdb::Status
BlueRocksEnv::UnlockFile(rocksdb::FileLock
* lock
)
551 BlueRocksFileLock
*l
= static_cast<BlueRocksFileLock
*>(lock
);
552 int r
= fs
->unlock_file(l
->lock
);
554 return err_to_status(r
);
557 return rocksdb::Status::OK();
560 rocksdb::Status
BlueRocksEnv::GetAbsolutePath(
561 const std::string
& db_path
,
562 std::string
* output_path
)
565 *output_path
= "/" + db_path
;
566 return rocksdb::Status::OK();
569 rocksdb::Status
BlueRocksEnv::NewLogger(
570 const std::string
& fname
,
571 std::shared_ptr
<rocksdb::Logger
>* result
)
573 // ignore the filename :)
574 result
->reset(create_rocksdb_ceph_logger());
575 return rocksdb::Status::OK();
578 rocksdb::Status
BlueRocksEnv::GetTestDirectory(std::string
* path
)
581 *path
= "temp_" + stringify(++foo
);
582 return rocksdb::Status::OK();