1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "BlueRocksEnv.h"
6 #include "include/stringify.h"
7 #include "kv/RocksDBStore.h"
// Translates an integer result code `r` into a rocksdb::Status.
//
// The visible return statements produce, in order: OK, NotFound,
// InvalidArgument, IOError, and -- after an assert whose condition
// (0 == "string literal") cannot hold -- NotSupported as the fallback.
//
// NOTE(review): the statements that select between these returns (and the
// enclosing braces) are not present in this copy of the file. Presumably
// this is a switch over 0 and negative errno values -- confirm against the
// upstream source before relying on any particular code-to-status mapping.
9 rocksdb::Status
err_to_status(int r
)
13 return rocksdb::Status::OK();
15 return rocksdb::Status::NotFound(rocksdb::Status::kNone
);
17 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone
);
19 return rocksdb::Status::IOError(rocksdb::Status::kNone
);
// Reached only for a code that none of the earlier branches handled.
22 assert(0 == "unrecognized error code");
23 return rocksdb::Status::NotSupported(rocksdb::Status::kNone
);
27 // A file abstraction for reading sequentially through a file
//
// Implements rocksdb::SequentialFile on top of a BlueFS::FileReader handle.
// NOTE(review): this copy of the file is missing lines (the declaration of
// the `fs` member, access specifiers and closing braces are not visible);
// the comments below describe only the statements that are present.
28 class BlueRocksSequentialFile
: public rocksdb::SequentialFile
{
// Reader handle supplied to the constructor; Read() and InvalidateCache()
// operate through it.
30 BlueFS::FileReader
*h
;
// Stores the BlueFS instance and the reader handle for later calls.
32 BlueRocksSequentialFile(BlueFS
*fs
, BlueFS::FileReader
*h
) : fs(fs
), h(h
) {}
// NOTE(review): the destructor body is not visible here, so whether it
// releases `h` cannot be confirmed from this view.
33 ~BlueRocksSequentialFile() override
{
37 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
38 // written by this routine. Sets "*result" to the data that was
39 // read (including if fewer than "n" bytes were successfully read).
40 // May set "*result" to point at data in "scratch[0..n-1]", so
41 // "scratch[0..n-1]" must be live when "*result" is used.
42 // If an error was encountered, returns a non-OK status.
44 // REQUIRES: External synchronization
//
// Implementation: calls fs->read() starting at the handle's current buffer
// position (h->buf.pos), directing the data into `scratch`, then points
// *result at `scratch` with length r. The visible code always returns OK.
// NOTE(review): no check of r is visible between the read and the Slice
// construction; presumably a negative r would become a very large length --
// confirm whether a check exists upstream.
45 rocksdb::Status
Read(size_t n
, rocksdb::Slice
* result
, char* scratch
) override
{
46 int r
= fs
->read(h
, &h
->buf
, h
->buf
.pos
, n
, NULL
, scratch
);
48 *result
= rocksdb::Slice(scratch
, r
);
49 return rocksdb::Status::OK();
52 // Skip "n" bytes from the file. This is guaranteed to be no
53 // slower than reading the same data, but may be faster.
55 // If end of file is reached, skipping will stop at the end of the
56 // file, and Skip will return OK.
58 // REQUIRES: External synchronization
//
// NOTE(review): only the OK return is visible; the statement that advances
// the read position by n is not present in this copy of the file.
59 rocksdb::Status
Skip(uint64_t n
) override
{
61 return rocksdb::Status::OK();
64 // Remove any kind of caching of data from the offset to offset+length
65 // of this file. If the length is 0, then it refers to the end of file.
66 // If the system is not caching the file contents, then this is a noop.
//
// Implementation: forwards to fs->invalidate_cache() for the file behind
// the reader handle and always returns OK.
67 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
68 fs
->invalidate_cache(h
->file
, offset
, length
);
69 return rocksdb::Status::OK();
73 // A file abstraction for randomly reading the contents of a file.
//
// Implements rocksdb::RandomAccessFile on top of a BlueFS::FileReader handle.
// NOTE(review): this copy of the file is missing lines (the declaration of
// the `fs` member, access specifiers and closing braces are not visible);
// the comments below describe only the statements that are present.
74 class BlueRocksRandomAccessFile
: public rocksdb::RandomAccessFile
{
// Reader handle supplied to the constructor; every method below works
// through it.
76 BlueFS::FileReader
*h
;
// Stores the BlueFS instance and the reader handle for later calls.
78 BlueRocksRandomAccessFile(BlueFS
*fs
, BlueFS::FileReader
*h
) : fs(fs
), h(h
) {}
// NOTE(review): the destructor body is not visible here, so whether it
// releases `h` cannot be confirmed from this view.
79 ~BlueRocksRandomAccessFile() override
{
83 // Read up to "n" bytes from the file starting at "offset".
84 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
85 // to the data that was read (including if fewer than "n" bytes were
86 // successfully read). May set "*result" to point at data in
87 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
88 // "*result" is used. If an error was encountered, returns a non-OK
91 // Safe for concurrent use by multiple threads.
//
// Implementation: calls fs->read_random(h, offset, n, scratch) and points
// *result at `scratch` with length r. The visible code always returns OK.
// NOTE(review): no check of r is visible before it is used as the Slice
// length -- confirm whether one exists upstream.
92 rocksdb::Status
Read(uint64_t offset
, size_t n
, rocksdb::Slice
* result
,
93 char* scratch
) const override
{
94 int r
= fs
->read_random(h
, offset
, n
, scratch
);
96 *result
= rocksdb::Slice(scratch
, r
);
97 return rocksdb::Status::OK();
100 // Tries to get an unique ID for this file that will be the same each time
101 // the file is opened (and will stay the same while the file is open).
102 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
103 // ID can be created this function returns the length of the ID and places it
104 // in "id"; otherwise, this function returns 0, in which case "id"
105 // may not have been modified.
107 // This function guarantees, for IDs from a given environment, two unique ids
108 // cannot be made equal to each other by adding arbitrary bytes to one of
109 // them. That is, no unique ID is the prefix of another.
111 // This function guarantees that the returned ID will not be interpretable as
114 // Note: these IDs are only valid for the duration of the process.
//
// Implementation: formats the file's fnode.ino as 16 zero-padded hex digits
// into `id` and returns snprintf's result.
// NOTE(review): snprintf returns the length that would have been written,
// which can exceed max_size when the buffer is too small; the contract above
// says to return 0 when no ID can be created -- confirm callers always pass
// max_size greater than 16.
115 size_t GetUniqueId(char* id
, size_t max_size
) const override
{
116 return snprintf(id
, max_size
, "%016llx",
117 (unsigned long long)h
->file
->fnode
.ino
);
120 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
//
// Adjusts the handle's prefetch limit from the access-pattern hint:
// RANDOM sets h->buf.max_prefetch to 4096, SEQUENTIAL sets it to the
// bluefs_max_prefetch configuration value. No assignment is visible for
// any other pattern.
122 void Hint(AccessPattern pattern
) override
{
123 if (pattern
== RANDOM
)
124 h
->buf
.max_prefetch
= 4096;
125 else if (pattern
== SEQUENTIAL
)
126 h
->buf
.max_prefetch
= fs
->cct
->_conf
->bluefs_max_prefetch
;
129 // Remove any kind of caching of data from the offset to offset+length
130 // of this file. If the length is 0, then it refers to the end of file.
131 // If the system is not caching the file contents, then this is a noop.
//
// Implementation: forwards to fs->invalidate_cache() for the file behind
// the reader handle and always returns OK.
132 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
133 fs
->invalidate_cache(h
->file
, offset
, length
);
134 return rocksdb::Status::OK();
139 // A file abstraction for sequential writing. The implementation
140 // must provide buffering since callers may append small fragments
141 // at a time to the file.
//
// Implements rocksdb::WritableFile on top of a BlueFS::FileWriter handle.
// NOTE(review): this copy of the file is missing lines (the declaration of
// the `fs` member, access specifiers, closing braces, the terminators of the
// two commented-out methods, and several method bodies are not visible);
// the comments below describe only the statements that are present.
142 class BlueRocksWritableFile
: public rocksdb::WritableFile
{
// Writer handle supplied to the constructor; every method below works
// through it.
144 BlueFS::FileWriter
*h
;
// Stores the BlueFS instance and the writer handle for later calls.
146 BlueRocksWritableFile(BlueFS
*fs
, BlueFS::FileWriter
*h
) : fs(fs
), h(h
) {}
// NOTE(review): the destructor body is not visible here, so what it does
// with `h` cannot be confirmed from this view.
147 ~BlueRocksWritableFile() override
{
// NOTE(review): the two block comments opened below have no visible
// terminator in this copy of the file; as written, the comment runs on
// until the end of the "data" parameter comment in PositionedAppend.
// Restore the missing terminators from the upstream source.
151 // Indicates if the class makes use of unbuffered I/O
152 /*bool UseOSBuffer() const {
156 // This is needed when you want to allocate
157 // AlignedBuffer for use with file I/O classes
158 // Used for unbuffered file I/O when UseOSBuffer() returns false
159 /*size_t GetRequiredBufferAlignment() const {
160 return c_DefaultPageSize;
// Append: hands the slice's bytes to the writer handle via h->append() and
// always returns OK; no result of the append call is inspected.
163 rocksdb::Status
Append(const rocksdb::Slice
& data
) override
{
164 h
->append(data
.data(), data
.size());
165 return rocksdb::Status::OK();
168 // Positioned write for unbuffered access default forward
169 // to simple append as most of the tests are buffered by default
//
// This implementation ignores both arguments and returns NotSupported.
170 rocksdb::Status
PositionedAppend(
171 const rocksdb::Slice
& /* data */,
172 uint64_t /* offset */) override
{
173 return rocksdb::Status::NotSupported();
176 // Truncate is necessary to trim the file to the correct size
177 // before closing. It is not always possible to keep track of the file
178 // size due to whole pages writes. The behavior is undefined if called
179 // with other writes to follow.
//
// This implementation ignores `size` and returns OK; the truncate call is
// left commented out below.
180 rocksdb::Status
Truncate(uint64_t size
) override
{
181 // we mirror the posix env, which does nothing here; instead, it
182 // truncates to the final size on close. whatever!
183 return rocksdb::Status::OK();
184 //int r = fs->truncate(h, size);
185 // return err_to_status(r);
// Close: queries GetPreallocationStatus() and, when last_allocated_block is
// greater than zero, truncates the file to the writer's current position
// (h->pos) via fs->truncate().
// NOTE(review): the declaration of block_size, any statements preceding the
// "mimic posix env" comment, and the condition guarding
// `return err_to_status(r)` are not visible in this copy -- confirm upstream.
188 rocksdb::Status
Close() override
{
191 // mimic posix env, here. shrug.
193 size_t last_allocated_block
;
194 GetPreallocationStatus(&block_size
, &last_allocated_block
);
195 if (last_allocated_block
> 0) {
196 int r
= fs
->truncate(h
, h
->pos
);
198 return err_to_status(r
);
201 return rocksdb::Status::OK();
// NOTE(review): only the OK return is visible; the statement that flushes
// the writer is not present in this copy of the file.
204 rocksdb::Status
Flush() override
{
206 return rocksdb::Status::OK();
// NOTE(review): only the OK return is visible; the statement that syncs
// the data is not present in this copy of the file.
209 rocksdb::Status
Sync() override
{ // sync data
211 return rocksdb::Status::OK();
214 // true if Sync() and Fsync() are safe to call concurrently with Append()
// NOTE(review): the body (and therefore the returned value) is not visible.
216 bool IsSyncThreadSafe() const override
{
220 // Indicates the upper layers if the current WritableFile implementation
// NOTE(review): the body (and therefore the returned value) is not visible.
222 bool UseDirectIO() const {
227 * Get the size of valid data in the file.
// Returns the size recorded in the file's fnode plus the length of the
// writer's pending buffer.
// NOTE(review): the trailing `;;` is a stray empty statement; it is harmless
// but could be removed.
229 uint64_t GetFileSize() override
{
230 return h
->file
->fnode
.size
+ h
->buffer
.length();;
233 // For documentation, refer to RandomAccessFile::GetUniqueId()
//
// Formats the file's fnode.ino as 16 zero-padded hex digits into `id` and
// returns snprintf's result.
// NOTE(review): snprintf returns the length that would have been written,
// which can exceed max_size when the buffer is too small -- confirm callers
// always pass max_size greater than 16.
234 size_t GetUniqueId(char* id
, size_t max_size
) const override
{
235 return snprintf(id
, max_size
, "%016llx",
236 (unsigned long long)h
->file
->fnode
.ino
);
239 // Remove any kind of caching of data from the offset to offset+length
240 // of this file. If the length is 0, then it refers to the end of file.
241 // If the system is not caching the file contents, then this is a noop.
242 // This call has no effect on dirty pages in the cache.
//
// Implementation: forwards to fs->invalidate_cache() for the file behind
// the writer handle and always returns OK.
243 rocksdb::Status
InvalidateCache(size_t offset
, size_t length
) override
{
244 fs
->invalidate_cache(h
->file
, offset
, length
);
245 return rocksdb::Status::OK();
248 using rocksdb::WritableFile::RangeSync
;
249 // Sync a file range with disk.
250 // offset is the starting byte of the file range to be synchronized.
251 // nbytes specifies the length of the range to be synchronized.
252 // This asks the OS to initiate flushing the cached data to disk,
253 // without waiting for completion.
254 // Default implementation does nothing.
//
// Implementation: computes the offset's remainder within a 4096-byte page,
// then calls fs->flush_range(h, offset, nbytes) and returns OK.
// NOTE(review): the statements between the `partial` computation and the
// flush_range call (presumably the ones that apply `partial` to offset and
// nbytes) are not visible in this copy -- confirm upstream.
255 rocksdb::Status
RangeSync(off_t offset
, off_t nbytes
) {
256 // round down to page boundaries
257 int partial
= offset
& 4095;
262 fs
->flush_range(h
, offset
, nbytes
);
263 return rocksdb::Status::OK();
267 using rocksdb::WritableFile::Allocate
;
269 * Pre-allocate space for a file.
// Calls fs->preallocate() on the file behind the writer handle and converts
// its integer result with err_to_status().
271 rocksdb::Status
Allocate(off_t offset
, off_t len
) {
272 int r
= fs
->preallocate(h
->file
, offset
, len
);
273 return err_to_status(r
);
278 // Directory object represents collection of files and implements
279 // filesystem operations that can be executed on directories.
//
// NOTE(review): the declaration of the `fs` member, the access specifiers
// and the closing braces are not visible in this copy of the file.
280 class BlueRocksDirectory
: public rocksdb::Directory
{
// Stores the BlueFS instance this directory object operates on.
283 explicit BlueRocksDirectory(BlueFS
*f
) : fs(f
) {}
285 // Fsync directory. Can be called concurrently from multiple threads.
//
// NOTE(review): only the OK return is visible; the statement that flushes
// the log (referred to by the comment inside the body) is not present in
// this copy of the file.
286 rocksdb::Status
Fsync() override
{
287 // it is sufficient to flush the log.
289 return rocksdb::Status::OK();
293 // Identifies a locked file.
//
// Wraps a BlueFS::FileLock so it can be handed out through the
// rocksdb::FileLock interface; BlueRocksEnv::UnlockFile() reads the `lock`
// member back out of it.
// NOTE(review): the declaration of the `fs` member, the access specifiers
// and the closing braces are not visible in this copy of the file.
294 class BlueRocksFileLock
: public rocksdb::FileLock
{
// The underlying BlueFS lock handle.
297 BlueFS::FileLock
*lock
;
// Stores the BlueFS instance and the lock handle.
298 BlueRocksFileLock(BlueFS
*fs
, BlueFS::FileLock
*l
) : fs(fs
), lock(l
) { }
// NOTE(review): the destructor body is not visible here.
299 ~BlueRocksFileLock() override
{
304 // --------------------
305 // --- BlueRocksEnv ---
306 // --------------------
//
// Constructs the environment as an EnvWrapper around Env::Default(), so
// that anything not overridden here is forwarded to the default Env.
// NOTE(review): the remainder of the initializer list (after the trailing
// comma) and the constructor body are not visible in this copy; presumably
// the `fs` member is initialised from `f` -- confirm upstream.
308 BlueRocksEnv::BlueRocksEnv(BlueFS
*f
)
309 : EnvWrapper(Env::Default()), // forward most of it to POSIX
// Opens `fname` for sequential reading and installs a
// BlueRocksSequentialFile in *result.
//
// Visible flow: a return that forwards the call to the wrapped Env
// (target()); otherwise the path is split into directory and file name,
// fs->open_for_read() is called with `false` as its last argument, its
// result may be returned through err_to_status(), and finally the new
// wrapper is stored in *result and OK is returned.
//
// NOTE(review): the conditions guarding the two early returns and the
// function's braces are not present in this copy. Presumably the first
// selects paths that belong to the wrapped Env and the second is an error
// check on r -- confirm upstream. `options` is only used by the forwarded
// call.
315 rocksdb::Status
BlueRocksEnv::NewSequentialFile(
316 const std::string
& fname
,
317 std::unique_ptr
<rocksdb::SequentialFile
>* result
,
318 const rocksdb::EnvOptions
& options
)
321 return target()->NewSequentialFile(fname
, result
, options
);
322 std::string dir
, file
;
323 split(fname
, &dir
, &file
);
324 BlueFS::FileReader
*h
;
325 int r
= fs
->open_for_read(dir
, file
, &h
, false);
327 return err_to_status(r
);
328 result
->reset(new BlueRocksSequentialFile(fs
, h
));
329 return rocksdb::Status::OK();
// Opens `fname` for random-access reading and installs a
// BlueRocksRandomAccessFile in *result.
//
// Visible flow: the path is split into directory and file name,
// fs->open_for_read() is called with `true` as its last argument (the
// sequential variant passes `false`), its result may be returned through
// err_to_status(), and finally the new wrapper is stored in *result and OK
// is returned. `options` is not used by any visible statement.
//
// NOTE(review): the condition guarding `return err_to_status(r)` and the
// function's braces are not present in this copy; presumably an error check
// on r -- confirm upstream.
332 rocksdb::Status
BlueRocksEnv::NewRandomAccessFile(
333 const std::string
& fname
,
334 std::unique_ptr
<rocksdb::RandomAccessFile
>* result
,
335 const rocksdb::EnvOptions
& options
)
337 std::string dir
, file
;
338 split(fname
, &dir
, &file
);
339 BlueFS::FileReader
*h
;
340 int r
= fs
->open_for_read(dir
, file
, &h
, true);
342 return err_to_status(r
);
343 result
->reset(new BlueRocksRandomAccessFile(fs
, h
));
344 return rocksdb::Status::OK();
// Opens `fname` for writing and installs a BlueRocksWritableFile in *result.
//
// Visible flow: the path is split into directory and file name,
// fs->open_for_write() is called with `false` as its last argument
// (ReuseWritableFile passes `true`), its result may be returned through
// err_to_status(), and finally the new wrapper is stored in *result and OK
// is returned. `options` is not used by any visible statement.
//
// NOTE(review): the condition guarding `return err_to_status(r)` and the
// function's braces are not present in this copy; presumably an error check
// on r -- confirm upstream.
347 rocksdb::Status
BlueRocksEnv::NewWritableFile(
348 const std::string
& fname
,
349 std::unique_ptr
<rocksdb::WritableFile
>* result
,
350 const rocksdb::EnvOptions
& options
)
352 std::string dir
, file
;
353 split(fname
, &dir
, &file
);
354 BlueFS::FileWriter
*h
;
355 int r
= fs
->open_for_write(dir
, file
, &h
, false);
357 return err_to_status(r
);
358 result
->reset(new BlueRocksWritableFile(fs
, h
));
359 return rocksdb::Status::OK();
// Renames `old_fname` to `new_fname`, then opens the renamed file for
// writing and installs a BlueRocksWritableFile in *result.
//
// Visible flow: both paths are split into directory and file name,
// fs->rename() moves the old entry to the new one, then fs->open_for_write()
// is called on the new name with `true` as its last argument
// (NewWritableFile passes `false`). Either result may be returned through
// err_to_status(); otherwise OK is returned. `options` is not used by any
// visible statement.
//
// NOTE(review): the conditions guarding the two `return err_to_status(r)`
// statements and the function's braces are not present in this copy;
// presumably error checks on r -- confirm upstream.
362 rocksdb::Status
BlueRocksEnv::ReuseWritableFile(
363 const std::string
& new_fname
,
364 const std::string
& old_fname
,
365 std::unique_ptr
<rocksdb::WritableFile
>* result
,
366 const rocksdb::EnvOptions
& options
)
368 std::string old_dir
, old_file
;
369 split(old_fname
, &old_dir
, &old_file
);
370 std::string new_dir
, new_file
;
371 split(new_fname
, &new_dir
, &new_file
);
373 int r
= fs
->rename(old_dir
, old_file
, new_dir
, new_file
);
375 return err_to_status(r
);
377 BlueFS::FileWriter
*h
;
378 r
= fs
->open_for_write(new_dir
, new_file
, &h
, true);
380 return err_to_status(r
);
381 result
->reset(new BlueRocksWritableFile(fs
, h
));
382 return rocksdb::Status::OK();
385 rocksdb::Status
BlueRocksEnv::NewDirectory(
386 const std::string
& name
,
387 std::unique_ptr
<rocksdb::Directory
>* result
)
389 if (!fs
->dir_exists(name
))
390 return rocksdb::Status::IOError(name
, strerror(ENOENT
));
391 result
->reset(new BlueRocksDirectory(fs
));
392 return rocksdb::Status::OK();
// Reports whether `fname` exists.
//
// Visible flow: a return that forwards the query to the wrapped Env
// (target()); otherwise the path is split into directory and file name and
// fs->stat() is called with NULL for both output arguments. A zero result
// yields OK; any other result yields err_to_status(-ENOENT), regardless of
// the value stat() actually returned.
//
// NOTE(review): the condition guarding the forwarded return and the
// function's braces are not present in this copy; presumably it selects
// paths that belong to the wrapped Env -- confirm upstream.
395 rocksdb::Status
BlueRocksEnv::FileExists(const std::string
& fname
)
398 return target()->FileExists(fname
);
399 std::string dir
, file
;
400 split(fname
, &dir
, &file
);
401 if (fs
->stat(dir
, file
, NULL
, NULL
) == 0)
402 return rocksdb::Status::OK();
403 return err_to_status(-ENOENT
);
// Lists the entries of directory `dir` into *result via fs->readdir().
//
// The visible error return always reports IOError with the ENOENT message,
// whatever value readdir() produced; the err_to_status(r) alternative is
// left commented out on the same line.
//
// NOTE(review): the condition guarding the IOError return and the
// function's braces are not present in this copy; presumably an error check
// on r -- confirm upstream.
406 rocksdb::Status
BlueRocksEnv::GetChildren(
407 const std::string
& dir
,
408 std::vector
<std::string
>* result
)
410 int r
= fs
->readdir(dir
, result
);
412 return rocksdb::Status::IOError(dir
, strerror(ENOENT
));// return err_to_status(r);
413 return rocksdb::Status::OK();
// Removes the file `fname`.
//
// Visible flow: the path is split into directory and file name, fs->unlink()
// is called, its result may be returned through err_to_status(), and
// otherwise OK is returned.
//
// NOTE(review): the condition guarding `return err_to_status(r)` and the
// function's braces are not present in this copy; presumably an error check
// on r -- confirm upstream.
416 rocksdb::Status
BlueRocksEnv::DeleteFile(const std::string
& fname
)
418 std::string dir
, file
;
419 split(fname
, &dir
, &file
);
420 int r
= fs
->unlink(dir
, file
);
422 return err_to_status(r
);
423 return rocksdb::Status::OK();
// Creates the directory `dirname` via fs->mkdir().
//
// The result may be returned through err_to_status(); otherwise OK is
// returned. Unlike CreateDirIfMissing(), no visible statement special-cases
// -EEXIST.
//
// NOTE(review): the condition guarding `return err_to_status(r)` and the
// function's braces are not present in this copy; presumably an error check
// on r -- confirm upstream.
426 rocksdb::Status
BlueRocksEnv::CreateDir(const std::string
& dirname
)
428 int r
= fs
->mkdir(dirname
);
430 return err_to_status(r
);
431 return rocksdb::Status::OK();
434 rocksdb::Status
BlueRocksEnv::CreateDirIfMissing(const std::string
& dirname
)
436 int r
= fs
->mkdir(dirname
);
437 if (r
< 0 && r
!= -EEXIST
)
438 return err_to_status(r
);
439 return rocksdb::Status::OK();
// Removes the directory `dirname` via fs->rmdir().
//
// The result may be returned through err_to_status(); otherwise OK is
// returned.
//
// NOTE(review): the condition guarding `return err_to_status(r)` and the
// function's braces are not present in this copy; presumably an error check
// on r -- confirm upstream.
442 rocksdb::Status
BlueRocksEnv::DeleteDir(const std::string
& dirname
)
444 int r
= fs
->rmdir(dirname
);
446 return err_to_status(r
);
447 return rocksdb::Status::OK();
// Looks up the size of `fname`.
//
// Visible flow: the path is split into directory and file name and
// fs->stat() is called with `file_size` as the size output and NULL for the
// time output. The result may be returned through err_to_status();
// otherwise OK is returned.
//
// NOTE(review): the parameter line declaring `file_size`, the end of the
// parameter list, the condition guarding `return err_to_status(r)` and the
// function's braces are not present in this copy -- confirm upstream.
450 rocksdb::Status
BlueRocksEnv::GetFileSize(
451 const std::string
& fname
,
454 std::string dir
, file
;
455 split(fname
, &dir
, &file
);
456 int r
= fs
->stat(dir
, file
, file_size
, NULL
);
458 return err_to_status(r
);
459 return rocksdb::Status::OK();
// Looks up the modification time of `fname` and stores its seconds
// component in *file_mtime.
//
// Visible flow: the path is split into directory and file name and
// fs->stat() is called with NULL for the size output and &mtime for the
// time output. The result may be returned through err_to_status();
// otherwise mtime.sec() is written to *file_mtime and OK is returned.
//
// NOTE(review): the declaration of `mtime`, the condition guarding
// `return err_to_status(r)` and the function's braces are not present in
// this copy -- confirm upstream.
462 rocksdb::Status
BlueRocksEnv::GetFileModificationTime(const std::string
& fname
,
463 uint64_t* file_mtime
)
465 std::string dir
, file
;
466 split(fname
, &dir
, &file
);
468 int r
= fs
->stat(dir
, file
, NULL
, &mtime
);
470 return err_to_status(r
);
471 *file_mtime
= mtime
.sec();
472 return rocksdb::Status::OK();
// Renames `src` to `target`.
//
// Visible flow: both paths are split into directory and file name and
// fs->rename() is called with the four components. The result may be
// returned through err_to_status(); otherwise OK is returned.
//
// NOTE(review): the condition guarding `return err_to_status(r)` and the
// function's braces are not present in this copy; presumably an error check
// on r -- confirm upstream. The parameter named `target` shadows the
// target() accessor used elsewhere in this class.
475 rocksdb::Status
BlueRocksEnv::RenameFile(
476 const std::string
& src
,
477 const std::string
& target
)
479 std::string old_dir
, old_file
;
480 split(src
, &old_dir
, &old_file
);
481 std::string new_dir
, new_file
;
482 split(target
, &new_dir
, &new_file
);
484 int r
= fs
->rename(old_dir
, old_file
, new_dir
, new_file
);
486 return err_to_status(r
);
487 return rocksdb::Status::OK();
// Env hook for linking `src` to `target`.
//
// NOTE(review): only the signature is visible in this copy of the file; the
// body is missing, so what this override does or returns cannot be stated
// from here -- restore it from the upstream source.
490 rocksdb::Status
BlueRocksEnv::LinkFile(
491 const std::string
& src
,
492 const std::string
& target
)
// Acquires a lock on `fname` and hands it back through *lock.
//
// Visible flow: the path is split into directory and file name,
// fs->lock_file() fills in a BlueFS::FileLock pointer, its result may be
// returned through err_to_status(), and otherwise *lock receives a newly
// allocated BlueRocksFileLock wrapping that pointer and OK is returned.
//
// NOTE(review): the condition guarding `return err_to_status(r)` and the
// function's braces are not present in this copy; presumably an error check
// on r -- confirm upstream.
497 rocksdb::Status
BlueRocksEnv::LockFile(
498 const std::string
& fname
,
499 rocksdb::FileLock
** lock
)
501 std::string dir
, file
;
502 split(fname
, &dir
, &file
);
503 BlueFS::FileLock
*l
= NULL
;
504 int r
= fs
->lock_file(dir
, file
, &l
);
506 return err_to_status(r
);
507 *lock
= new BlueRocksFileLock(fs
, l
);
508 return rocksdb::Status::OK();
// Releases a lock previously handed out as a rocksdb::FileLock.
//
// Visible flow: `lock` is downcast with static_cast to BlueRocksFileLock
// (so it must actually be one), fs->unlock_file() is called on the wrapped
// BlueFS lock, its result may be returned through err_to_status(), and
// otherwise OK is returned.
//
// NOTE(review): the condition guarding `return err_to_status(r)`, one
// statement between that return and the final OK (possibly the one that
// frees the wrapper object), and the function's braces are not present in
// this copy -- confirm upstream.
511 rocksdb::Status
BlueRocksEnv::UnlockFile(rocksdb::FileLock
* lock
)
513 BlueRocksFileLock
*l
= static_cast<BlueRocksFileLock
*>(lock
);
514 int r
= fs
->unlock_file(l
->lock
);
516 return err_to_status(r
);
518 return rocksdb::Status::OK();
// Produces an "absolute" form of `db_path` by prefixing it with "/" and
// storing the result in *output_path; the visible code always returns OK.
//
// NOTE(review): one original line before the assignment and the function's
// braces are not present in this copy -- confirm upstream that nothing else
// happens here.
521 rocksdb::Status
BlueRocksEnv::GetAbsolutePath(
522 const std::string
& db_path
,
523 std::string
* output_path
)
526 *output_path
= "/" + db_path
;
527 return rocksdb::Status::OK();
530 rocksdb::Status
BlueRocksEnv::NewLogger(
531 const std::string
& fname
,
532 std::shared_ptr
<rocksdb::Logger
>* result
)
534 // ignore the filename :)
535 result
->reset(create_rocksdb_ceph_logger());
536 return rocksdb::Status::OK();
// Writes a test directory name of the form "temp_<N>" into *path, where N
// is the value of `foo` after it has been pre-incremented, and returns OK.
//
// NOTE(review): the declaration of `foo` and the function's braces are not
// present in this copy; presumably `foo` is a counter that persists across
// calls so each call yields a new name -- confirm upstream, including
// whether concurrent callers are expected.
539 rocksdb::Status
BlueRocksEnv::GetTestDirectory(std::string
* path
)
542 *path
= "temp_" + stringify(++foo
);
543 return rocksdb::Status::OK();