]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueRocksEnv.cc
import ceph 15.2.14
[ceph.git] / ceph / src / os / bluestore / BlueRocksEnv.cc
CommitLineData
7c673cae
FG
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
3
4#include "BlueRocksEnv.h"
5#include "BlueFS.h"
6#include "include/stringify.h"
7#include "kv/RocksDBStore.h"
11fdf7f2 8#include "string.h"
7c673cae 9
ec96510d
FG
10namespace {
11
7c673cae
FG
12rocksdb::Status err_to_status(int r)
13{
14 switch (r) {
15 case 0:
16 return rocksdb::Status::OK();
17 case -ENOENT:
18 return rocksdb::Status::NotFound(rocksdb::Status::kNone);
19 case -EINVAL:
20 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
21 case -EIO:
11fdf7f2 22 case -EEXIST:
7c673cae 23 return rocksdb::Status::IOError(rocksdb::Status::kNone);
11fdf7f2
TL
24 case -ENOLCK:
25 return rocksdb::Status::IOError(strerror(r));
7c673cae
FG
26 default:
27 // FIXME :(
11fdf7f2 28 ceph_abort_msg("unrecognized error code");
7c673cae
FG
29 return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
30 }
31}
32
ec96510d
FG
// Split a path at its last '/' into {directory, file name}.
//
// The file name is everything after the last '/'.  The directory is
// everything before it, with any run of '/' characters that immediately
// precedes the file name stripped (so "db//f" yields {"db", "f"}).
//
// Both views point into `fn`; they are only valid while `fn` is alive and
// unmodified.
//
// Precondition: `fn` contains at least one '/'.  This is asserted; if the
// assertion is compiled out, a name without '/' is returned as
// {"", fn} instead of reading out of bounds.
std::pair<std::string_view, std::string_view>
split(const std::string &fn)
{
  const size_t slash = fn.rfind('/');
  assert(slash != fn.npos);
  if (slash == std::string::npos) {
    // Only reachable when assert() is disabled: avoid indexing fn[npos - 1]
    // and constructing a view with a bogus length.
    return {std::string_view(), std::string_view(fn)};
  }
  const size_t file_begin = slash + 1;
  // Drop redundant separators at the end of the directory part.
  size_t dir_len = slash;
  while (dir_len && fn[dir_len - 1] == '/')
    --dir_len;
  return {std::string_view(fn.data(), dir_len),
	  std::string_view(fn.data() + file_begin,
			   fn.size() - file_begin)};
}
45
46}
47
7c673cae
FG
48// A file abstraction for reading sequentially through a file
49class BlueRocksSequentialFile : public rocksdb::SequentialFile {
50 BlueFS *fs;
51 BlueFS::FileReader *h;
52 public:
53 BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
54 ~BlueRocksSequentialFile() override {
55 delete h;
56 }
57
58 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
59 // written by this routine. Sets "*result" to the data that was
60 // read (including if fewer than "n" bytes were successfully read).
61 // May set "*result" to point at data in "scratch[0..n-1]", so
62 // "scratch[0..n-1]" must be live when "*result" is used.
63 // If an error was encountered, returns a non-OK status.
64 //
65 // REQUIRES: External synchronization
66 rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
adb31ebb 67 int64_t r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch);
11fdf7f2 68 ceph_assert(r >= 0);
7c673cae
FG
69 *result = rocksdb::Slice(scratch, r);
70 return rocksdb::Status::OK();
71 }
72
73 // Skip "n" bytes from the file. This is guaranteed to be no
74 // slower that reading the same data, but may be faster.
75 //
76 // If end of file is reached, skipping will stop at the end of the
77 // file, and Skip will return OK.
78 //
79 // REQUIRES: External synchronization
80 rocksdb::Status Skip(uint64_t n) override {
81 h->buf.skip(n);
82 return rocksdb::Status::OK();
83 }
84
85 // Remove any kind of caching of data from the offset to offset+length
86 // of this file. If the length is 0, then it refers to the end of file.
87 // If the system is not caching the file contents, then this is a noop.
88 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
89 fs->invalidate_cache(h->file, offset, length);
90 return rocksdb::Status::OK();
91 }
92};
93
94// A file abstraction for randomly reading the contents of a file.
95class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
96 BlueFS *fs;
97 BlueFS::FileReader *h;
98 public:
99 BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
100 ~BlueRocksRandomAccessFile() override {
101 delete h;
102 }
103
104 // Read up to "n" bytes from the file starting at "offset".
105 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
106 // to the data that was read (including if fewer than "n" bytes were
107 // successfully read). May set "*result" to point at data in
108 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
109 // "*result" is used. If an error was encountered, returns a non-OK
110 // status.
111 //
112 // Safe for concurrent use by multiple threads.
113 rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
114 char* scratch) const override {
adb31ebb 115 int64_t r = fs->read_random(h, offset, n, scratch);
11fdf7f2 116 ceph_assert(r >= 0);
7c673cae
FG
117 *result = rocksdb::Slice(scratch, r);
118 return rocksdb::Status::OK();
119 }
120
7c673cae
FG
121 // Tries to get an unique ID for this file that will be the same each time
122 // the file is opened (and will stay the same while the file is open).
123 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
124 // ID can be created this function returns the length of the ID and places it
125 // in "id"; otherwise, this function returns 0, in which case "id"
126 // may not have been modified.
127 //
128 // This function guarantees, for IDs from a given environment, two unique ids
129 // cannot be made equal to eachother by adding arbitrary bytes to one of
130 // them. That is, no unique ID is the prefix of another.
131 //
132 // This function guarantees that the returned ID will not be interpretable as
133 // a single varint.
134 //
135 // Note: these IDs are only valid for the duration of the process.
136 size_t GetUniqueId(char* id, size_t max_size) const override {
137 return snprintf(id, max_size, "%016llx",
138 (unsigned long long)h->file->fnode.ino);
139 };
140
494da23a
TL
141 // Readahead the file starting from offset by n bytes for caching.
142 rocksdb::Status Prefetch(uint64_t offset, size_t n) override {
143 fs->read(h, &h->buf, offset, n, nullptr, nullptr);
144 return rocksdb::Status::OK();
145 }
146
7c673cae
FG
147 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
148
149 void Hint(AccessPattern pattern) override {
150 if (pattern == RANDOM)
151 h->buf.max_prefetch = 4096;
152 else if (pattern == SEQUENTIAL)
153 h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
154 }
155
156 // Remove any kind of caching of data from the offset to offset+length
157 // of this file. If the length is 0, then it refers to the end of file.
158 // If the system is not caching the file contents, then this is a noop.
159 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
160 fs->invalidate_cache(h->file, offset, length);
161 return rocksdb::Status::OK();
162 }
163};
164
165
166// A file abstraction for sequential writing. The implementation
167// must provide buffering since callers may append small fragments
168// at a time to the file.
169class BlueRocksWritableFile : public rocksdb::WritableFile {
170 BlueFS *fs;
171 BlueFS::FileWriter *h;
172 public:
173 BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
174 ~BlueRocksWritableFile() override {
175 fs->close_writer(h);
176 }
177
178 // Indicates if the class makes use of unbuffered I/O
179 /*bool UseOSBuffer() const {
180 return true;
181 }*/
182
183 // This is needed when you want to allocate
184 // AlignedBuffer for use with file I/O classes
185 // Used for unbuffered file I/O when UseOSBuffer() returns false
186 /*size_t GetRequiredBufferAlignment() const {
187 return c_DefaultPageSize;
188 }*/
189
190 rocksdb::Status Append(const rocksdb::Slice& data) override {
cd265ab1 191 fs->append_try_flush(h, data.data(), data.size());
7c673cae
FG
192 return rocksdb::Status::OK();
193 }
194
195 // Positioned write for unbuffered access default forward
196 // to simple append as most of the tests are buffered by default
197 rocksdb::Status PositionedAppend(
198 const rocksdb::Slice& /* data */,
199 uint64_t /* offset */) override {
200 return rocksdb::Status::NotSupported();
201 }
202
203 // Truncate is necessary to trim the file to the correct size
204 // before closing. It is not always possible to keep track of the file
205 // size due to whole pages writes. The behavior is undefined if called
206 // with other writes to follow.
207 rocksdb::Status Truncate(uint64_t size) override {
208 // we mirror the posix env, which does nothing here; instead, it
209 // truncates to the final size on close. whatever!
210 return rocksdb::Status::OK();
211 //int r = fs->truncate(h, size);
212 // return err_to_status(r);
213 }
214
215 rocksdb::Status Close() override {
ec96510d 216 fs->fsync(h);
7c673cae
FG
217
218 // mimic posix env, here. shrug.
219 size_t block_size;
220 size_t last_allocated_block;
221 GetPreallocationStatus(&block_size, &last_allocated_block);
222 if (last_allocated_block > 0) {
223 int r = fs->truncate(h, h->pos);
224 if (r < 0)
225 return err_to_status(r);
226 }
227
228 return rocksdb::Status::OK();
229 }
230
231 rocksdb::Status Flush() override {
232 fs->flush(h);
233 return rocksdb::Status::OK();
234 }
235
236 rocksdb::Status Sync() override { // sync data
237 fs->fsync(h);
238 return rocksdb::Status::OK();
239 }
240
241 // true if Sync() and Fsync() are safe to call concurrently with Append()
242 // and Flush().
243 bool IsSyncThreadSafe() const override {
244 return true;
245 }
246
247 // Indicates the upper layers if the current WritableFile implementation
248 // uses direct IO.
249 bool UseDirectIO() const {
250 return false;
251 }
252
11fdf7f2
TL
253 void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override {
254 h->write_hint = (const int)hint;
255 }
256
7c673cae
FG
257 /*
258 * Get the size of valid data in the file.
259 */
260 uint64_t GetFileSize() override {
261 return h->file->fnode.size + h->buffer.length();;
262 }
263
264 // For documentation, refer to RandomAccessFile::GetUniqueId()
265 size_t GetUniqueId(char* id, size_t max_size) const override {
266 return snprintf(id, max_size, "%016llx",
267 (unsigned long long)h->file->fnode.ino);
268 }
269
270 // Remove any kind of caching of data from the offset to offset+length
271 // of this file. If the length is 0, then it refers to the end of file.
272 // If the system is not caching the file contents, then this is a noop.
273 // This call has no effect on dirty pages in the cache.
274 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
275 fs->invalidate_cache(h->file, offset, length);
276 return rocksdb::Status::OK();
277 }
278
279 using rocksdb::WritableFile::RangeSync;
280 // Sync a file range with disk.
281 // offset is the starting byte of the file range to be synchronized.
282 // nbytes specifies the length of the range to be synchronized.
283 // This asks the OS to initiate flushing the cached data to disk,
284 // without waiting for completion.
285 // Default implementation does nothing.
286 rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
287 // round down to page boundaries
288 int partial = offset & 4095;
289 offset -= partial;
290 nbytes += partial;
291 nbytes &= ~4095;
292 if (nbytes)
293 fs->flush_range(h, offset, nbytes);
294 return rocksdb::Status::OK();
295 }
296
297 protected:
298 using rocksdb::WritableFile::Allocate;
299 /*
300 * Pre-allocate space for a file.
301 */
302 rocksdb::Status Allocate(off_t offset, off_t len) {
303 int r = fs->preallocate(h->file, offset, len);
304 return err_to_status(r);
305 }
306};
307
308
309// Directory object represents collection of files and implements
310// filesystem operations that can be executed on directories.
311class BlueRocksDirectory : public rocksdb::Directory {
312 BlueFS *fs;
313 public:
314 explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
315
316 // Fsync directory. Can be called concurrently from multiple threads.
317 rocksdb::Status Fsync() override {
318 // it is sufficient to flush the log.
1911f103 319 fs->sync_metadata(false);
7c673cae
FG
320 return rocksdb::Status::OK();
321 }
322};
323
324// Identifies a locked file.
325class BlueRocksFileLock : public rocksdb::FileLock {
326 public:
327 BlueFS *fs;
328 BlueFS::FileLock *lock;
329 BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
330 ~BlueRocksFileLock() override {
331 }
332};
333
334
// --------------------
// --- BlueRocksEnv ---
// --------------------
338
339BlueRocksEnv::BlueRocksEnv(BlueFS *f)
340 : EnvWrapper(Env::Default()), // forward most of it to POSIX
341 fs(f)
342{
343
344}
345
346rocksdb::Status BlueRocksEnv::NewSequentialFile(
347 const std::string& fname,
348 std::unique_ptr<rocksdb::SequentialFile>* result,
349 const rocksdb::EnvOptions& options)
350{
351 if (fname[0] == '/')
352 return target()->NewSequentialFile(fname, result, options);
ec96510d 353 auto [dir, file] = split(fname);
7c673cae
FG
354 BlueFS::FileReader *h;
355 int r = fs->open_for_read(dir, file, &h, false);
356 if (r < 0)
357 return err_to_status(r);
358 result->reset(new BlueRocksSequentialFile(fs, h));
359 return rocksdb::Status::OK();
360}
361
362rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
363 const std::string& fname,
364 std::unique_ptr<rocksdb::RandomAccessFile>* result,
365 const rocksdb::EnvOptions& options)
366{
ec96510d 367 auto [dir, file] = split(fname);
7c673cae
FG
368 BlueFS::FileReader *h;
369 int r = fs->open_for_read(dir, file, &h, true);
370 if (r < 0)
371 return err_to_status(r);
372 result->reset(new BlueRocksRandomAccessFile(fs, h));
373 return rocksdb::Status::OK();
374}
375
376rocksdb::Status BlueRocksEnv::NewWritableFile(
377 const std::string& fname,
378 std::unique_ptr<rocksdb::WritableFile>* result,
379 const rocksdb::EnvOptions& options)
380{
ec96510d 381 auto [dir, file] = split(fname);
7c673cae
FG
382 BlueFS::FileWriter *h;
383 int r = fs->open_for_write(dir, file, &h, false);
384 if (r < 0)
385 return err_to_status(r);
386 result->reset(new BlueRocksWritableFile(fs, h));
387 return rocksdb::Status::OK();
388}
389
390rocksdb::Status BlueRocksEnv::ReuseWritableFile(
391 const std::string& new_fname,
392 const std::string& old_fname,
393 std::unique_ptr<rocksdb::WritableFile>* result,
394 const rocksdb::EnvOptions& options)
395{
ec96510d
FG
396 auto [old_dir, old_file] = split(old_fname);
397 auto [new_dir, new_file] = split(new_fname);
7c673cae
FG
398
399 int r = fs->rename(old_dir, old_file, new_dir, new_file);
400 if (r < 0)
401 return err_to_status(r);
402
403 BlueFS::FileWriter *h;
404 r = fs->open_for_write(new_dir, new_file, &h, true);
405 if (r < 0)
406 return err_to_status(r);
407 result->reset(new BlueRocksWritableFile(fs, h));
408 return rocksdb::Status::OK();
409}
410
411rocksdb::Status BlueRocksEnv::NewDirectory(
412 const std::string& name,
413 std::unique_ptr<rocksdb::Directory>* result)
414{
415 if (!fs->dir_exists(name))
11fdf7f2 416 return rocksdb::Status::NotFound(name, strerror(ENOENT));
7c673cae
FG
417 result->reset(new BlueRocksDirectory(fs));
418 return rocksdb::Status::OK();
419}
420
421rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
422{
423 if (fname[0] == '/')
424 return target()->FileExists(fname);
ec96510d 425 auto [dir, file] = split(fname);
7c673cae
FG
426 if (fs->stat(dir, file, NULL, NULL) == 0)
427 return rocksdb::Status::OK();
428 return err_to_status(-ENOENT);
429}
430
431rocksdb::Status BlueRocksEnv::GetChildren(
432 const std::string& dir,
433 std::vector<std::string>* result)
434{
d2e6a577 435 result->clear();
7c673cae
FG
436 int r = fs->readdir(dir, result);
437 if (r < 0)
11fdf7f2 438 return rocksdb::Status::NotFound(dir, strerror(ENOENT));// return err_to_status(r);
7c673cae
FG
439 return rocksdb::Status::OK();
440}
441
442rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
443{
ec96510d 444 auto [dir, file] = split(fname);
7c673cae
FG
445 int r = fs->unlink(dir, file);
446 if (r < 0)
447 return err_to_status(r);
448 return rocksdb::Status::OK();
449}
450
451rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
452{
453 int r = fs->mkdir(dirname);
454 if (r < 0)
455 return err_to_status(r);
456 return rocksdb::Status::OK();
457}
458
459rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
460{
461 int r = fs->mkdir(dirname);
462 if (r < 0 && r != -EEXIST)
463 return err_to_status(r);
464 return rocksdb::Status::OK();
465}
466
467rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
468{
469 int r = fs->rmdir(dirname);
470 if (r < 0)
471 return err_to_status(r);
472 return rocksdb::Status::OK();
473}
474
475rocksdb::Status BlueRocksEnv::GetFileSize(
476 const std::string& fname,
477 uint64_t* file_size)
478{
ec96510d 479 auto [dir, file] = split(fname);
7c673cae
FG
480 int r = fs->stat(dir, file, file_size, NULL);
481 if (r < 0)
482 return err_to_status(r);
483 return rocksdb::Status::OK();
484}
485
486rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
487 uint64_t* file_mtime)
488{
ec96510d 489 auto [dir, file] = split(fname);
7c673cae
FG
490 utime_t mtime;
491 int r = fs->stat(dir, file, NULL, &mtime);
492 if (r < 0)
493 return err_to_status(r);
494 *file_mtime = mtime.sec();
495 return rocksdb::Status::OK();
496}
497
498rocksdb::Status BlueRocksEnv::RenameFile(
499 const std::string& src,
500 const std::string& target)
501{
ec96510d
FG
502 auto [old_dir, old_file] = split(src);
503 auto [new_dir, new_file] = split(target);
7c673cae
FG
504
505 int r = fs->rename(old_dir, old_file, new_dir, new_file);
506 if (r < 0)
507 return err_to_status(r);
508 return rocksdb::Status::OK();
509}
510
511rocksdb::Status BlueRocksEnv::LinkFile(
512 const std::string& src,
513 const std::string& target)
514{
515 ceph_abort();
516}
517
11fdf7f2
TL
518rocksdb::Status BlueRocksEnv::AreFilesSame(
519 const std::string& first,
520 const std::string& second, bool* res)
521{
522 for (auto& path : {first, second}) {
523 if (fs->dir_exists(path)) {
524 continue;
525 }
ec96510d 526 auto [dir, file] = split(path);
11fdf7f2
TL
527 int r = fs->stat(dir, file, nullptr, nullptr);
528 if (!r) {
529 continue;
530 } else if (r == -ENOENT) {
531 return rocksdb::Status::NotFound("AreFilesSame", path);
532 } else {
533 return err_to_status(r);
534 }
535 }
536 *res = (first == second);
537 return rocksdb::Status::OK();
538}
539
7c673cae
FG
540rocksdb::Status BlueRocksEnv::LockFile(
541 const std::string& fname,
542 rocksdb::FileLock** lock)
543{
ec96510d 544 auto [dir, file] = split(fname);
7c673cae
FG
545 BlueFS::FileLock *l = NULL;
546 int r = fs->lock_file(dir, file, &l);
547 if (r < 0)
548 return err_to_status(r);
549 *lock = new BlueRocksFileLock(fs, l);
550 return rocksdb::Status::OK();
551}
552
553rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
554{
555 BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
556 int r = fs->unlock_file(l->lock);
557 if (r < 0)
558 return err_to_status(r);
559 delete lock;
11fdf7f2 560 lock = nullptr;
7c673cae
FG
561 return rocksdb::Status::OK();
562}
563
564rocksdb::Status BlueRocksEnv::GetAbsolutePath(
565 const std::string& db_path,
566 std::string* output_path)
567{
568 // this is a lie...
569 *output_path = "/" + db_path;
570 return rocksdb::Status::OK();
571}
572
573rocksdb::Status BlueRocksEnv::NewLogger(
574 const std::string& fname,
575 std::shared_ptr<rocksdb::Logger>* result)
576{
577 // ignore the filename :)
578 result->reset(create_rocksdb_ceph_logger());
579 return rocksdb::Status::OK();
580}
581
582rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
583{
584 static int foo = 0;
585 *path = "temp_" + stringify(++foo);
586 return rocksdb::Status::OK();
587}