]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueRocksEnv.cc
import ceph 16.2.6
[ceph.git] / ceph / src / os / bluestore / BlueRocksEnv.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "BlueRocksEnv.h"
5 #include "BlueFS.h"
6 #include "include/stringify.h"
7 #include "kv/RocksDBStore.h"
8 #include "string.h"
9
10 namespace {
11
12 rocksdb::Status err_to_status(int r)
13 {
14 switch (r) {
15 case 0:
16 return rocksdb::Status::OK();
17 case -ENOENT:
18 return rocksdb::Status::NotFound(rocksdb::Status::kNone);
19 case -EINVAL:
20 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
21 case -EIO:
22 case -EEXIST:
23 return rocksdb::Status::IOError(rocksdb::Status::kNone);
24 case -ENOLCK:
25 return rocksdb::Status::IOError(strerror(r));
26 default:
27 // FIXME :(
28 ceph_abort_msg("unrecognized error code");
29 return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
30 }
31 }
32
// Split a path at its last '/' into a (directory, file name) pair.
//
// The directory part is everything before the last '/', with any run of
// additional '/' characters directly in front of it stripped as well
// ("a//b" -> {"a", "b"}).  The file part is everything after the last '/'.
//
// Both returned views point into fn's storage, so fn must outlive them.
//
// A path without any '/' trips the assertion in builds that keep
// assertions enabled.  When assertions are compiled out, it is reported as
// an empty directory plus the whole string as the file name instead of
// indexing fn out of range.
std::pair<std::string_view, std::string_view>
split(const std::string &fn)
{
  size_t slash = fn.rfind('/');
  assert(slash != fn.npos);
  if (slash == fn.npos) {
    // Defined fallback for NDEBUG builds: no directory component.
    return {std::string_view(fn.data(), 0),
            std::string_view(fn.data(), fn.size())};
  }
  const size_t file_begin = slash + 1;
  // Drop redundant separators between the directory and the file name.
  while (slash && fn[slash - 1] == '/')
    --slash;
  return {std::string_view(fn.data(), slash),
          std::string_view(fn.data() + file_begin,
                           fn.size() - file_begin)};
}
45
46 }
47
48 // A file abstraction for reading sequentially through a file
49 class BlueRocksSequentialFile : public rocksdb::SequentialFile {
50 BlueFS *fs;
51 BlueFS::FileReader *h;
52 public:
53 BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
54 ~BlueRocksSequentialFile() override {
55 delete h;
56 }
57
58 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
59 // written by this routine. Sets "*result" to the data that was
60 // read (including if fewer than "n" bytes were successfully read).
61 // May set "*result" to point at data in "scratch[0..n-1]", so
62 // "scratch[0..n-1]" must be live when "*result" is used.
63 // If an error was encountered, returns a non-OK status.
64 //
65 // REQUIRES: External synchronization
66 rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
67 int64_t r = fs->read(h, h->buf.pos, n, NULL, scratch);
68 ceph_assert(r >= 0);
69 *result = rocksdb::Slice(scratch, r);
70 return rocksdb::Status::OK();
71 }
72
73 // Skip "n" bytes from the file. This is guaranteed to be no
74 // slower that reading the same data, but may be faster.
75 //
76 // If end of file is reached, skipping will stop at the end of the
77 // file, and Skip will return OK.
78 //
79 // REQUIRES: External synchronization
80 rocksdb::Status Skip(uint64_t n) override {
81 h->buf.skip(n);
82 return rocksdb::Status::OK();
83 }
84
85 // Remove any kind of caching of data from the offset to offset+length
86 // of this file. If the length is 0, then it refers to the end of file.
87 // If the system is not caching the file contents, then this is a noop.
88 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
89 h->buf.invalidate_cache(offset, length);
90 fs->invalidate_cache(h->file, offset, length);
91 return rocksdb::Status::OK();
92 }
93 };
94
95 // A file abstraction for randomly reading the contents of a file.
96 class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
97 BlueFS *fs;
98 BlueFS::FileReader *h;
99 public:
100 BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
101 ~BlueRocksRandomAccessFile() override {
102 delete h;
103 }
104
105 // Read up to "n" bytes from the file starting at "offset".
106 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
107 // to the data that was read (including if fewer than "n" bytes were
108 // successfully read). May set "*result" to point at data in
109 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
110 // "*result" is used. If an error was encountered, returns a non-OK
111 // status.
112 //
113 // Safe for concurrent use by multiple threads.
114 rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
115 char* scratch) const override {
116 int64_t r = fs->read_random(h, offset, n, scratch);
117 ceph_assert(r >= 0);
118 *result = rocksdb::Slice(scratch, r);
119 return rocksdb::Status::OK();
120 }
121
122 // Tries to get an unique ID for this file that will be the same each time
123 // the file is opened (and will stay the same while the file is open).
124 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
125 // ID can be created this function returns the length of the ID and places it
126 // in "id"; otherwise, this function returns 0, in which case "id"
127 // may not have been modified.
128 //
129 // This function guarantees, for IDs from a given environment, two unique ids
130 // cannot be made equal to eachother by adding arbitrary bytes to one of
131 // them. That is, no unique ID is the prefix of another.
132 //
133 // This function guarantees that the returned ID will not be interpretable as
134 // a single varint.
135 //
136 // Note: these IDs are only valid for the duration of the process.
137 size_t GetUniqueId(char* id, size_t max_size) const override {
138 return snprintf(id, max_size, "%016llx",
139 (unsigned long long)h->file->fnode.ino);
140 };
141
142 // Readahead the file starting from offset by n bytes for caching.
143 rocksdb::Status Prefetch(uint64_t offset, size_t n) override {
144 fs->read(h, offset, n, nullptr, nullptr);
145 return rocksdb::Status::OK();
146 }
147
148 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
149
150 void Hint(AccessPattern pattern) override {
151 if (pattern == RANDOM)
152 h->buf.max_prefetch = 4096;
153 else if (pattern == SEQUENTIAL)
154 h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
155 }
156
157 bool use_direct_io() const override {
158 return !fs->cct->_conf->bluefs_buffered_io;
159 }
160
161 // Remove any kind of caching of data from the offset to offset+length
162 // of this file. If the length is 0, then it refers to the end of file.
163 // If the system is not caching the file contents, then this is a noop.
164 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
165 h->buf.invalidate_cache(offset, length);
166 fs->invalidate_cache(h->file, offset, length);
167 return rocksdb::Status::OK();
168 }
169 };
170
171
172 // A file abstraction for sequential writing. The implementation
173 // must provide buffering since callers may append small fragments
174 // at a time to the file.
175 class BlueRocksWritableFile : public rocksdb::WritableFile {
176 BlueFS *fs;
177 BlueFS::FileWriter *h;
178 public:
179 BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
180 ~BlueRocksWritableFile() override {
181 fs->close_writer(h);
182 }
183
184 // Indicates if the class makes use of unbuffered I/O
185 /*bool UseOSBuffer() const {
186 return true;
187 }*/
188
189 // This is needed when you want to allocate
190 // AlignedBuffer for use with file I/O classes
191 // Used for unbuffered file I/O when UseOSBuffer() returns false
192 /*size_t GetRequiredBufferAlignment() const {
193 return c_DefaultPageSize;
194 }*/
195
196 rocksdb::Status Append(const rocksdb::Slice& data) override {
197 fs->append_try_flush(h, data.data(), data.size());
198 return rocksdb::Status::OK();
199 }
200
201 // Positioned write for unbuffered access default forward
202 // to simple append as most of the tests are buffered by default
203 rocksdb::Status PositionedAppend(
204 const rocksdb::Slice& /* data */,
205 uint64_t /* offset */) override {
206 return rocksdb::Status::NotSupported();
207 }
208
209 // Truncate is necessary to trim the file to the correct size
210 // before closing. It is not always possible to keep track of the file
211 // size due to whole pages writes. The behavior is undefined if called
212 // with other writes to follow.
213 rocksdb::Status Truncate(uint64_t size) override {
214 // we mirror the posix env, which does nothing here; instead, it
215 // truncates to the final size on close. whatever!
216 return rocksdb::Status::OK();
217 //int r = fs->truncate(h, size);
218 // return err_to_status(r);
219 }
220
221 rocksdb::Status Close() override {
222 fs->fsync(h);
223
224 // mimic posix env, here. shrug.
225 size_t block_size;
226 size_t last_allocated_block;
227 GetPreallocationStatus(&block_size, &last_allocated_block);
228 if (last_allocated_block > 0) {
229 int r = fs->truncate(h, h->pos);
230 if (r < 0)
231 return err_to_status(r);
232 }
233
234 return rocksdb::Status::OK();
235 }
236
237 rocksdb::Status Flush() override {
238 fs->flush(h);
239 return rocksdb::Status::OK();
240 }
241
242 rocksdb::Status Sync() override { // sync data
243 fs->fsync(h);
244 return rocksdb::Status::OK();
245 }
246
247 // true if Sync() and Fsync() are safe to call concurrently with Append()
248 // and Flush().
249 bool IsSyncThreadSafe() const override {
250 return true;
251 }
252
253 // Indicates the upper layers if the current WritableFile implementation
254 // uses direct IO.
255 bool UseDirectIO() const {
256 return false;
257 }
258
259 void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override {
260 h->write_hint = (const int)hint;
261 }
262
263 /*
264 * Get the size of valid data in the file.
265 */
266 uint64_t GetFileSize() override {
267 return h->file->fnode.size + h->get_buffer_length();;
268 }
269
270 // For documentation, refer to RandomAccessFile::GetUniqueId()
271 size_t GetUniqueId(char* id, size_t max_size) const override {
272 return snprintf(id, max_size, "%016llx",
273 (unsigned long long)h->file->fnode.ino);
274 }
275
276 // Remove any kind of caching of data from the offset to offset+length
277 // of this file. If the length is 0, then it refers to the end of file.
278 // If the system is not caching the file contents, then this is a noop.
279 // This call has no effect on dirty pages in the cache.
280 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
281 fs->fsync(h);
282 fs->invalidate_cache(h->file, offset, length);
283 return rocksdb::Status::OK();
284 }
285
286 using rocksdb::WritableFile::RangeSync;
287 // Sync a file range with disk.
288 // offset is the starting byte of the file range to be synchronized.
289 // nbytes specifies the length of the range to be synchronized.
290 // This asks the OS to initiate flushing the cached data to disk,
291 // without waiting for completion.
292 // Default implementation does nothing.
293 rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
294 // round down to page boundaries
295 int partial = offset & 4095;
296 offset -= partial;
297 nbytes += partial;
298 nbytes &= ~4095;
299 if (nbytes)
300 fs->flush_range(h, offset, nbytes);
301 return rocksdb::Status::OK();
302 }
303
304 protected:
305 using rocksdb::WritableFile::Allocate;
306 /*
307 * Pre-allocate space for a file.
308 */
309 rocksdb::Status Allocate(off_t offset, off_t len) {
310 int r = fs->preallocate(h->file, offset, len);
311 return err_to_status(r);
312 }
313 };
314
315
316 // Directory object represents collection of files and implements
317 // filesystem operations that can be executed on directories.
318 class BlueRocksDirectory : public rocksdb::Directory {
319 BlueFS *fs;
320 public:
321 explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
322
323 // Fsync directory. Can be called concurrently from multiple threads.
324 rocksdb::Status Fsync() override {
325 // it is sufficient to flush the log.
326 fs->sync_metadata(false);
327 return rocksdb::Status::OK();
328 }
329 };
330
331 // Identifies a locked file.
332 class BlueRocksFileLock : public rocksdb::FileLock {
333 public:
334 BlueFS *fs;
335 BlueFS::FileLock *lock;
336 BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
337 ~BlueRocksFileLock() override {
338 }
339 };
340
341
342 // --------------------
343 // --- BlueRocksEnv ---
344 // --------------------
345
346 BlueRocksEnv::BlueRocksEnv(BlueFS *f)
347 : EnvWrapper(Env::Default()), // forward most of it to POSIX
348 fs(f)
349 {
350
351 }
352
353 rocksdb::Status BlueRocksEnv::NewSequentialFile(
354 const std::string& fname,
355 std::unique_ptr<rocksdb::SequentialFile>* result,
356 const rocksdb::EnvOptions& options)
357 {
358 if (fname[0] == '/')
359 return target()->NewSequentialFile(fname, result, options);
360 auto [dir, file] = split(fname);
361 BlueFS::FileReader *h;
362 int r = fs->open_for_read(dir, file, &h, false);
363 if (r < 0)
364 return err_to_status(r);
365 result->reset(new BlueRocksSequentialFile(fs, h));
366 return rocksdb::Status::OK();
367 }
368
369 rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
370 const std::string& fname,
371 std::unique_ptr<rocksdb::RandomAccessFile>* result,
372 const rocksdb::EnvOptions& options)
373 {
374 auto [dir, file] = split(fname);
375 BlueFS::FileReader *h;
376 int r = fs->open_for_read(dir, file, &h, true);
377 if (r < 0)
378 return err_to_status(r);
379 result->reset(new BlueRocksRandomAccessFile(fs, h));
380 return rocksdb::Status::OK();
381 }
382
383 rocksdb::Status BlueRocksEnv::NewWritableFile(
384 const std::string& fname,
385 std::unique_ptr<rocksdb::WritableFile>* result,
386 const rocksdb::EnvOptions& options)
387 {
388 auto [dir, file] = split(fname);
389 BlueFS::FileWriter *h;
390 int r = fs->open_for_write(dir, file, &h, false);
391 if (r < 0)
392 return err_to_status(r);
393 result->reset(new BlueRocksWritableFile(fs, h));
394 return rocksdb::Status::OK();
395 }
396
397 rocksdb::Status BlueRocksEnv::ReuseWritableFile(
398 const std::string& new_fname,
399 const std::string& old_fname,
400 std::unique_ptr<rocksdb::WritableFile>* result,
401 const rocksdb::EnvOptions& options)
402 {
403 auto [old_dir, old_file] = split(old_fname);
404 auto [new_dir, new_file] = split(new_fname);
405
406 int r = fs->rename(old_dir, old_file, new_dir, new_file);
407 if (r < 0)
408 return err_to_status(r);
409
410 BlueFS::FileWriter *h;
411 r = fs->open_for_write(new_dir, new_file, &h, true);
412 if (r < 0)
413 return err_to_status(r);
414 result->reset(new BlueRocksWritableFile(fs, h));
415 return rocksdb::Status::OK();
416 }
417
418 rocksdb::Status BlueRocksEnv::NewDirectory(
419 const std::string& name,
420 std::unique_ptr<rocksdb::Directory>* result)
421 {
422 if (!fs->dir_exists(name))
423 return rocksdb::Status::NotFound(name, strerror(ENOENT));
424 result->reset(new BlueRocksDirectory(fs));
425 return rocksdb::Status::OK();
426 }
427
428 rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
429 {
430 if (fname[0] == '/')
431 return target()->FileExists(fname);
432 auto [dir, file] = split(fname);
433 if (fs->stat(dir, file, NULL, NULL) == 0)
434 return rocksdb::Status::OK();
435 return err_to_status(-ENOENT);
436 }
437
438 rocksdb::Status BlueRocksEnv::GetChildren(
439 const std::string& dir,
440 std::vector<std::string>* result)
441 {
442 result->clear();
443 int r = fs->readdir(dir, result);
444 if (r < 0)
445 return rocksdb::Status::NotFound(dir, strerror(ENOENT));// return err_to_status(r);
446 return rocksdb::Status::OK();
447 }
448
449 rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
450 {
451 auto [dir, file] = split(fname);
452 int r = fs->unlink(dir, file);
453 if (r < 0)
454 return err_to_status(r);
455 return rocksdb::Status::OK();
456 }
457
458 rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
459 {
460 int r = fs->mkdir(dirname);
461 if (r < 0)
462 return err_to_status(r);
463 return rocksdb::Status::OK();
464 }
465
466 rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
467 {
468 int r = fs->mkdir(dirname);
469 if (r < 0 && r != -EEXIST)
470 return err_to_status(r);
471 return rocksdb::Status::OK();
472 }
473
474 rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
475 {
476 int r = fs->rmdir(dirname);
477 if (r < 0)
478 return err_to_status(r);
479 return rocksdb::Status::OK();
480 }
481
482 rocksdb::Status BlueRocksEnv::GetFileSize(
483 const std::string& fname,
484 uint64_t* file_size)
485 {
486 auto [dir, file] = split(fname);
487 int r = fs->stat(dir, file, file_size, NULL);
488 if (r < 0)
489 return err_to_status(r);
490 return rocksdb::Status::OK();
491 }
492
493 rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
494 uint64_t* file_mtime)
495 {
496 auto [dir, file] = split(fname);
497 utime_t mtime;
498 int r = fs->stat(dir, file, NULL, &mtime);
499 if (r < 0)
500 return err_to_status(r);
501 *file_mtime = mtime.sec();
502 return rocksdb::Status::OK();
503 }
504
505 rocksdb::Status BlueRocksEnv::RenameFile(
506 const std::string& src,
507 const std::string& target)
508 {
509 auto [old_dir, old_file] = split(src);
510 auto [new_dir, new_file] = split(target);
511
512 int r = fs->rename(old_dir, old_file, new_dir, new_file);
513 if (r < 0)
514 return err_to_status(r);
515 return rocksdb::Status::OK();
516 }
517
518 rocksdb::Status BlueRocksEnv::LinkFile(
519 const std::string& src,
520 const std::string& target)
521 {
522 ceph_abort();
523 }
524
525 rocksdb::Status BlueRocksEnv::AreFilesSame(
526 const std::string& first,
527 const std::string& second, bool* res)
528 {
529 for (auto& path : {first, second}) {
530 if (fs->dir_exists(path)) {
531 continue;
532 }
533 auto [dir, file] = split(path);
534 int r = fs->stat(dir, file, nullptr, nullptr);
535 if (!r) {
536 continue;
537 } else if (r == -ENOENT) {
538 return rocksdb::Status::NotFound("AreFilesSame", path);
539 } else {
540 return err_to_status(r);
541 }
542 }
543 *res = (first == second);
544 return rocksdb::Status::OK();
545 }
546
547 rocksdb::Status BlueRocksEnv::LockFile(
548 const std::string& fname,
549 rocksdb::FileLock** lock)
550 {
551 auto [dir, file] = split(fname);
552 BlueFS::FileLock *l = NULL;
553 int r = fs->lock_file(dir, file, &l);
554 if (r < 0)
555 return err_to_status(r);
556 *lock = new BlueRocksFileLock(fs, l);
557 return rocksdb::Status::OK();
558 }
559
560 rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
561 {
562 BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
563 int r = fs->unlock_file(l->lock);
564 if (r < 0)
565 return err_to_status(r);
566 delete lock;
567 lock = nullptr;
568 return rocksdb::Status::OK();
569 }
570
571 rocksdb::Status BlueRocksEnv::GetAbsolutePath(
572 const std::string& db_path,
573 std::string* output_path)
574 {
575 // this is a lie...
576 *output_path = "/" + db_path;
577 return rocksdb::Status::OK();
578 }
579
580 rocksdb::Status BlueRocksEnv::NewLogger(
581 const std::string& fname,
582 std::shared_ptr<rocksdb::Logger>* result)
583 {
584 // ignore the filename :)
585 result->reset(create_rocksdb_ceph_logger());
586 return rocksdb::Status::OK();
587 }
588
589 rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
590 {
591 static int foo = 0;
592 *path = "temp_" + stringify(++foo);
593 return rocksdb::Status::OK();
594 }