]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueRocksEnv.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / os / bluestore / BlueRocksEnv.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "BlueRocksEnv.h"
5#include "BlueFS.h"
6#include "include/stringify.h"
7#include "kv/RocksDBStore.h"
11fdf7f2 8#include "string.h"
7c673cae 9
20effc67
TL
10using std::string_view;
11
b3b6e05e
TL
12namespace {
13
7c673cae
FG
14rocksdb::Status err_to_status(int r)
15{
16 switch (r) {
17 case 0:
18 return rocksdb::Status::OK();
19 case -ENOENT:
20 return rocksdb::Status::NotFound(rocksdb::Status::kNone);
21 case -EINVAL:
22 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
23 case -EIO:
11fdf7f2 24 case -EEXIST:
7c673cae 25 return rocksdb::Status::IOError(rocksdb::Status::kNone);
11fdf7f2
TL
26 case -ENOLCK:
27 return rocksdb::Status::IOError(strerror(r));
7c673cae
FG
28 default:
29 // FIXME :(
11fdf7f2 30 ceph_abort_msg("unrecognized error code");
7c673cae
FG
31 return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
32 }
33}
34
b3b6e05e
TL
// Split a "dir/file" style path at its last '/'.
//
// Returns {directory, file name}.  Both views alias fn's storage, so fn has
// to stay alive while they are in use.  A run of '/' characters directly in
// front of the final separator is not included in the directory part.
//
// Requires fn to contain at least one '/' (asserted).
std::pair<std::string_view, std::string_view>
split(const std::string &fn)
{
  const size_t last_sep = fn.rfind('/');
  assert(last_sep != std::string::npos);

  // drop any extra separators that precede the last one
  size_t dir_len = last_sep;
  while (dir_len > 0 && fn[dir_len - 1] == '/')
    --dir_len;

  const std::string_view whole(fn);
  return {whole.substr(0, dir_len), whole.substr(last_sep + 1)};
}
47
48}
49
7c673cae
FG
50// A file abstraction for reading sequentially through a file
51class BlueRocksSequentialFile : public rocksdb::SequentialFile {
52 BlueFS *fs;
53 BlueFS::FileReader *h;
54 public:
55 BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
56 ~BlueRocksSequentialFile() override {
57 delete h;
58 }
59
60 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
61 // written by this routine. Sets "*result" to the data that was
62 // read (including if fewer than "n" bytes were successfully read).
63 // May set "*result" to point at data in "scratch[0..n-1]", so
64 // "scratch[0..n-1]" must be live when "*result" is used.
65 // If an error was encountered, returns a non-OK status.
66 //
67 // REQUIRES: External synchronization
68 rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
f67539c2 69 int64_t r = fs->read(h, h->buf.pos, n, NULL, scratch);
11fdf7f2 70 ceph_assert(r >= 0);
7c673cae
FG
71 *result = rocksdb::Slice(scratch, r);
72 return rocksdb::Status::OK();
73 }
74
75 // Skip "n" bytes from the file. This is guaranteed to be no
76 // slower that reading the same data, but may be faster.
77 //
78 // If end of file is reached, skipping will stop at the end of the
79 // file, and Skip will return OK.
80 //
81 // REQUIRES: External synchronization
82 rocksdb::Status Skip(uint64_t n) override {
83 h->buf.skip(n);
84 return rocksdb::Status::OK();
85 }
86
87 // Remove any kind of caching of data from the offset to offset+length
88 // of this file. If the length is 0, then it refers to the end of file.
89 // If the system is not caching the file contents, then this is a noop.
90 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
f67539c2 91 h->buf.invalidate_cache(offset, length);
7c673cae
FG
92 fs->invalidate_cache(h->file, offset, length);
93 return rocksdb::Status::OK();
94 }
95};
96
97// A file abstraction for randomly reading the contents of a file.
98class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
99 BlueFS *fs;
100 BlueFS::FileReader *h;
101 public:
102 BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
103 ~BlueRocksRandomAccessFile() override {
104 delete h;
105 }
106
107 // Read up to "n" bytes from the file starting at "offset".
108 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
109 // to the data that was read (including if fewer than "n" bytes were
110 // successfully read). May set "*result" to point at data in
111 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
112 // "*result" is used. If an error was encountered, returns a non-OK
113 // status.
114 //
115 // Safe for concurrent use by multiple threads.
116 rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
117 char* scratch) const override {
adb31ebb 118 int64_t r = fs->read_random(h, offset, n, scratch);
11fdf7f2 119 ceph_assert(r >= 0);
7c673cae
FG
120 *result = rocksdb::Slice(scratch, r);
121 return rocksdb::Status::OK();
122 }
123
7c673cae
FG
124 // Tries to get an unique ID for this file that will be the same each time
125 // the file is opened (and will stay the same while the file is open).
126 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
127 // ID can be created this function returns the length of the ID and places it
128 // in "id"; otherwise, this function returns 0, in which case "id"
129 // may not have been modified.
130 //
131 // This function guarantees, for IDs from a given environment, two unique ids
132 // cannot be made equal to eachother by adding arbitrary bytes to one of
133 // them. That is, no unique ID is the prefix of another.
134 //
135 // This function guarantees that the returned ID will not be interpretable as
136 // a single varint.
137 //
138 // Note: these IDs are only valid for the duration of the process.
139 size_t GetUniqueId(char* id, size_t max_size) const override {
140 return snprintf(id, max_size, "%016llx",
141 (unsigned long long)h->file->fnode.ino);
142 };
143
494da23a
TL
144 // Readahead the file starting from offset by n bytes for caching.
145 rocksdb::Status Prefetch(uint64_t offset, size_t n) override {
f67539c2 146 fs->read(h, offset, n, nullptr, nullptr);
494da23a
TL
147 return rocksdb::Status::OK();
148 }
149
7c673cae
FG
150 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
151
152 void Hint(AccessPattern pattern) override {
153 if (pattern == RANDOM)
154 h->buf.max_prefetch = 4096;
155 else if (pattern == SEQUENTIAL)
156 h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
157 }
158
f67539c2
TL
159 bool use_direct_io() const override {
160 return !fs->cct->_conf->bluefs_buffered_io;
161 }
162
7c673cae
FG
163 // Remove any kind of caching of data from the offset to offset+length
164 // of this file. If the length is 0, then it refers to the end of file.
165 // If the system is not caching the file contents, then this is a noop.
166 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
f67539c2 167 h->buf.invalidate_cache(offset, length);
7c673cae
FG
168 fs->invalidate_cache(h->file, offset, length);
169 return rocksdb::Status::OK();
170 }
171};
172
173
174// A file abstraction for sequential writing. The implementation
175// must provide buffering since callers may append small fragments
176// at a time to the file.
177class BlueRocksWritableFile : public rocksdb::WritableFile {
178 BlueFS *fs;
179 BlueFS::FileWriter *h;
180 public:
181 BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
182 ~BlueRocksWritableFile() override {
183 fs->close_writer(h);
184 }
185
186 // Indicates if the class makes use of unbuffered I/O
187 /*bool UseOSBuffer() const {
188 return true;
189 }*/
190
191 // This is needed when you want to allocate
192 // AlignedBuffer for use with file I/O classes
193 // Used for unbuffered file I/O when UseOSBuffer() returns false
194 /*size_t GetRequiredBufferAlignment() const {
195 return c_DefaultPageSize;
196 }*/
197
198 rocksdb::Status Append(const rocksdb::Slice& data) override {
cd265ab1 199 fs->append_try_flush(h, data.data(), data.size());
7c673cae
FG
200 return rocksdb::Status::OK();
201 }
202
203 // Positioned write for unbuffered access default forward
204 // to simple append as most of the tests are buffered by default
205 rocksdb::Status PositionedAppend(
206 const rocksdb::Slice& /* data */,
207 uint64_t /* offset */) override {
208 return rocksdb::Status::NotSupported();
209 }
210
211 // Truncate is necessary to trim the file to the correct size
212 // before closing. It is not always possible to keep track of the file
213 // size due to whole pages writes. The behavior is undefined if called
214 // with other writes to follow.
215 rocksdb::Status Truncate(uint64_t size) override {
216 // we mirror the posix env, which does nothing here; instead, it
217 // truncates to the final size on close. whatever!
218 return rocksdb::Status::OK();
219 //int r = fs->truncate(h, size);
220 // return err_to_status(r);
221 }
222
223 rocksdb::Status Close() override {
522d829b 224 fs->fsync(h);
7c673cae
FG
225
226 // mimic posix env, here. shrug.
227 size_t block_size;
228 size_t last_allocated_block;
229 GetPreallocationStatus(&block_size, &last_allocated_block);
230 if (last_allocated_block > 0) {
231 int r = fs->truncate(h, h->pos);
232 if (r < 0)
233 return err_to_status(r);
234 }
235
236 return rocksdb::Status::OK();
237 }
238
239 rocksdb::Status Flush() override {
240 fs->flush(h);
241 return rocksdb::Status::OK();
242 }
243
244 rocksdb::Status Sync() override { // sync data
245 fs->fsync(h);
246 return rocksdb::Status::OK();
247 }
248
249 // true if Sync() and Fsync() are safe to call concurrently with Append()
250 // and Flush().
251 bool IsSyncThreadSafe() const override {
252 return true;
253 }
254
255 // Indicates the upper layers if the current WritableFile implementation
256 // uses direct IO.
257 bool UseDirectIO() const {
258 return false;
259 }
260
11fdf7f2
TL
261 void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override {
262 h->write_hint = (const int)hint;
263 }
264
7c673cae
FG
265 /*
266 * Get the size of valid data in the file.
267 */
268 uint64_t GetFileSize() override {
f67539c2 269 return h->file->fnode.size + h->get_buffer_length();;
7c673cae
FG
270 }
271
272 // For documentation, refer to RandomAccessFile::GetUniqueId()
273 size_t GetUniqueId(char* id, size_t max_size) const override {
274 return snprintf(id, max_size, "%016llx",
275 (unsigned long long)h->file->fnode.ino);
276 }
277
278 // Remove any kind of caching of data from the offset to offset+length
279 // of this file. If the length is 0, then it refers to the end of file.
280 // If the system is not caching the file contents, then this is a noop.
281 // This call has no effect on dirty pages in the cache.
282 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
f67539c2 283 fs->fsync(h);
7c673cae
FG
284 fs->invalidate_cache(h->file, offset, length);
285 return rocksdb::Status::OK();
286 }
287
288 using rocksdb::WritableFile::RangeSync;
289 // Sync a file range with disk.
290 // offset is the starting byte of the file range to be synchronized.
291 // nbytes specifies the length of the range to be synchronized.
292 // This asks the OS to initiate flushing the cached data to disk,
293 // without waiting for completion.
294 // Default implementation does nothing.
295 rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
296 // round down to page boundaries
297 int partial = offset & 4095;
298 offset -= partial;
299 nbytes += partial;
300 nbytes &= ~4095;
301 if (nbytes)
302 fs->flush_range(h, offset, nbytes);
303 return rocksdb::Status::OK();
304 }
305
306 protected:
307 using rocksdb::WritableFile::Allocate;
308 /*
309 * Pre-allocate space for a file.
310 */
311 rocksdb::Status Allocate(off_t offset, off_t len) {
312 int r = fs->preallocate(h->file, offset, len);
313 return err_to_status(r);
314 }
315};
316
317
318// Directory object represents collection of files and implements
319// filesystem operations that can be executed on directories.
320class BlueRocksDirectory : public rocksdb::Directory {
321 BlueFS *fs;
322 public:
323 explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
324
325 // Fsync directory. Can be called concurrently from multiple threads.
326 rocksdb::Status Fsync() override {
327 // it is sufficient to flush the log.
1911f103 328 fs->sync_metadata(false);
7c673cae
FG
329 return rocksdb::Status::OK();
330 }
331};
332
333// Identifies a locked file.
334class BlueRocksFileLock : public rocksdb::FileLock {
335 public:
336 BlueFS *fs;
337 BlueFS::FileLock *lock;
338 BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
339 ~BlueRocksFileLock() override {
340 }
341};
342
343
344// --------------------
345// --- BlueRocksEnv ---
346// --------------------
347
348BlueRocksEnv::BlueRocksEnv(BlueFS *f)
349 : EnvWrapper(Env::Default()), // forward most of it to POSIX
350 fs(f)
351{
352
353}
354
355rocksdb::Status BlueRocksEnv::NewSequentialFile(
356 const std::string& fname,
357 std::unique_ptr<rocksdb::SequentialFile>* result,
358 const rocksdb::EnvOptions& options)
359{
360 if (fname[0] == '/')
361 return target()->NewSequentialFile(fname, result, options);
b3b6e05e 362 auto [dir, file] = split(fname);
7c673cae
FG
363 BlueFS::FileReader *h;
364 int r = fs->open_for_read(dir, file, &h, false);
365 if (r < 0)
366 return err_to_status(r);
367 result->reset(new BlueRocksSequentialFile(fs, h));
368 return rocksdb::Status::OK();
369}
370
371rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
372 const std::string& fname,
373 std::unique_ptr<rocksdb::RandomAccessFile>* result,
374 const rocksdb::EnvOptions& options)
375{
b3b6e05e 376 auto [dir, file] = split(fname);
7c673cae
FG
377 BlueFS::FileReader *h;
378 int r = fs->open_for_read(dir, file, &h, true);
379 if (r < 0)
380 return err_to_status(r);
381 result->reset(new BlueRocksRandomAccessFile(fs, h));
382 return rocksdb::Status::OK();
383}
384
385rocksdb::Status BlueRocksEnv::NewWritableFile(
386 const std::string& fname,
387 std::unique_ptr<rocksdb::WritableFile>* result,
388 const rocksdb::EnvOptions& options)
389{
b3b6e05e 390 auto [dir, file] = split(fname);
7c673cae
FG
391 BlueFS::FileWriter *h;
392 int r = fs->open_for_write(dir, file, &h, false);
393 if (r < 0)
394 return err_to_status(r);
395 result->reset(new BlueRocksWritableFile(fs, h));
396 return rocksdb::Status::OK();
397}
398
399rocksdb::Status BlueRocksEnv::ReuseWritableFile(
400 const std::string& new_fname,
401 const std::string& old_fname,
402 std::unique_ptr<rocksdb::WritableFile>* result,
403 const rocksdb::EnvOptions& options)
404{
b3b6e05e
TL
405 auto [old_dir, old_file] = split(old_fname);
406 auto [new_dir, new_file] = split(new_fname);
7c673cae
FG
407
408 int r = fs->rename(old_dir, old_file, new_dir, new_file);
409 if (r < 0)
410 return err_to_status(r);
411
412 BlueFS::FileWriter *h;
413 r = fs->open_for_write(new_dir, new_file, &h, true);
414 if (r < 0)
415 return err_to_status(r);
416 result->reset(new BlueRocksWritableFile(fs, h));
20effc67 417 fs->sync_metadata(false);
7c673cae
FG
418 return rocksdb::Status::OK();
419}
420
421rocksdb::Status BlueRocksEnv::NewDirectory(
422 const std::string& name,
423 std::unique_ptr<rocksdb::Directory>* result)
424{
425 if (!fs->dir_exists(name))
11fdf7f2 426 return rocksdb::Status::NotFound(name, strerror(ENOENT));
7c673cae
FG
427 result->reset(new BlueRocksDirectory(fs));
428 return rocksdb::Status::OK();
429}
430
431rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
432{
433 if (fname[0] == '/')
434 return target()->FileExists(fname);
b3b6e05e 435 auto [dir, file] = split(fname);
7c673cae
FG
436 if (fs->stat(dir, file, NULL, NULL) == 0)
437 return rocksdb::Status::OK();
438 return err_to_status(-ENOENT);
439}
440
441rocksdb::Status BlueRocksEnv::GetChildren(
442 const std::string& dir,
443 std::vector<std::string>* result)
444{
d2e6a577 445 result->clear();
7c673cae
FG
446 int r = fs->readdir(dir, result);
447 if (r < 0)
11fdf7f2 448 return rocksdb::Status::NotFound(dir, strerror(ENOENT));// return err_to_status(r);
7c673cae
FG
449 return rocksdb::Status::OK();
450}
451
452rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
453{
b3b6e05e 454 auto [dir, file] = split(fname);
7c673cae
FG
455 int r = fs->unlink(dir, file);
456 if (r < 0)
457 return err_to_status(r);
20effc67 458 fs->sync_metadata(false);
7c673cae
FG
459 return rocksdb::Status::OK();
460}
461
462rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
463{
464 int r = fs->mkdir(dirname);
465 if (r < 0)
466 return err_to_status(r);
467 return rocksdb::Status::OK();
468}
469
470rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
471{
472 int r = fs->mkdir(dirname);
473 if (r < 0 && r != -EEXIST)
474 return err_to_status(r);
475 return rocksdb::Status::OK();
476}
477
478rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
479{
480 int r = fs->rmdir(dirname);
481 if (r < 0)
482 return err_to_status(r);
483 return rocksdb::Status::OK();
484}
485
486rocksdb::Status BlueRocksEnv::GetFileSize(
487 const std::string& fname,
488 uint64_t* file_size)
489{
b3b6e05e 490 auto [dir, file] = split(fname);
7c673cae
FG
491 int r = fs->stat(dir, file, file_size, NULL);
492 if (r < 0)
493 return err_to_status(r);
494 return rocksdb::Status::OK();
495}
496
497rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
498 uint64_t* file_mtime)
499{
b3b6e05e 500 auto [dir, file] = split(fname);
7c673cae
FG
501 utime_t mtime;
502 int r = fs->stat(dir, file, NULL, &mtime);
503 if (r < 0)
504 return err_to_status(r);
505 *file_mtime = mtime.sec();
506 return rocksdb::Status::OK();
507}
508
509rocksdb::Status BlueRocksEnv::RenameFile(
510 const std::string& src,
511 const std::string& target)
512{
b3b6e05e
TL
513 auto [old_dir, old_file] = split(src);
514 auto [new_dir, new_file] = split(target);
7c673cae
FG
515
516 int r = fs->rename(old_dir, old_file, new_dir, new_file);
517 if (r < 0)
518 return err_to_status(r);
20effc67 519 fs->sync_metadata(false);
7c673cae
FG
520 return rocksdb::Status::OK();
521}
522
523rocksdb::Status BlueRocksEnv::LinkFile(
524 const std::string& src,
525 const std::string& target)
526{
527 ceph_abort();
528}
529
11fdf7f2
TL
530rocksdb::Status BlueRocksEnv::AreFilesSame(
531 const std::string& first,
532 const std::string& second, bool* res)
533{
534 for (auto& path : {first, second}) {
535 if (fs->dir_exists(path)) {
536 continue;
537 }
b3b6e05e 538 auto [dir, file] = split(path);
11fdf7f2
TL
539 int r = fs->stat(dir, file, nullptr, nullptr);
540 if (!r) {
541 continue;
542 } else if (r == -ENOENT) {
543 return rocksdb::Status::NotFound("AreFilesSame", path);
544 } else {
545 return err_to_status(r);
546 }
547 }
548 *res = (first == second);
549 return rocksdb::Status::OK();
550}
551
7c673cae
FG
552rocksdb::Status BlueRocksEnv::LockFile(
553 const std::string& fname,
554 rocksdb::FileLock** lock)
555{
b3b6e05e 556 auto [dir, file] = split(fname);
7c673cae
FG
557 BlueFS::FileLock *l = NULL;
558 int r = fs->lock_file(dir, file, &l);
559 if (r < 0)
560 return err_to_status(r);
561 *lock = new BlueRocksFileLock(fs, l);
562 return rocksdb::Status::OK();
563}
564
565rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
566{
567 BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
568 int r = fs->unlock_file(l->lock);
569 if (r < 0)
570 return err_to_status(r);
571 delete lock;
11fdf7f2 572 lock = nullptr;
7c673cae
FG
573 return rocksdb::Status::OK();
574}
575
576rocksdb::Status BlueRocksEnv::GetAbsolutePath(
577 const std::string& db_path,
578 std::string* output_path)
579{
580 // this is a lie...
581 *output_path = "/" + db_path;
582 return rocksdb::Status::OK();
583}
584
585rocksdb::Status BlueRocksEnv::NewLogger(
586 const std::string& fname,
587 std::shared_ptr<rocksdb::Logger>* result)
588{
589 // ignore the filename :)
590 result->reset(create_rocksdb_ceph_logger());
591 return rocksdb::Status::OK();
592}
593
594rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
595{
596 static int foo = 0;
597 *path = "temp_" + stringify(++foo);
598 return rocksdb::Status::OK();
599}