]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueRocksEnv.cc
import 15.2.5
[ceph.git] / ceph / src / os / bluestore / BlueRocksEnv.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "BlueRocksEnv.h"
5 #include "BlueFS.h"
6 #include "include/stringify.h"
7 #include "kv/RocksDBStore.h"
8 #include "string.h"
9
10 rocksdb::Status err_to_status(int r)
11 {
12 switch (r) {
13 case 0:
14 return rocksdb::Status::OK();
15 case -ENOENT:
16 return rocksdb::Status::NotFound(rocksdb::Status::kNone);
17 case -EINVAL:
18 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
19 case -EIO:
20 case -EEXIST:
21 return rocksdb::Status::IOError(rocksdb::Status::kNone);
22 case -ENOLCK:
23 return rocksdb::Status::IOError(strerror(r));
24 default:
25 // FIXME :(
26 ceph_abort_msg("unrecognized error code");
27 return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
28 }
29 }
30
31 // A file abstraction for reading sequentially through a file
32 class BlueRocksSequentialFile : public rocksdb::SequentialFile {
33 BlueFS *fs;
34 BlueFS::FileReader *h;
35 public:
36 BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
37 ~BlueRocksSequentialFile() override {
38 delete h;
39 }
40
41 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
42 // written by this routine. Sets "*result" to the data that was
43 // read (including if fewer than "n" bytes were successfully read).
44 // May set "*result" to point at data in "scratch[0..n-1]", so
45 // "scratch[0..n-1]" must be live when "*result" is used.
46 // If an error was encountered, returns a non-OK status.
47 //
48 // REQUIRES: External synchronization
49 rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
50 int r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch);
51 ceph_assert(r >= 0);
52 *result = rocksdb::Slice(scratch, r);
53 return rocksdb::Status::OK();
54 }
55
56 // Skip "n" bytes from the file. This is guaranteed to be no
57 // slower that reading the same data, but may be faster.
58 //
59 // If end of file is reached, skipping will stop at the end of the
60 // file, and Skip will return OK.
61 //
62 // REQUIRES: External synchronization
63 rocksdb::Status Skip(uint64_t n) override {
64 h->buf.skip(n);
65 return rocksdb::Status::OK();
66 }
67
68 // Remove any kind of caching of data from the offset to offset+length
69 // of this file. If the length is 0, then it refers to the end of file.
70 // If the system is not caching the file contents, then this is a noop.
71 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
72 fs->invalidate_cache(h->file, offset, length);
73 return rocksdb::Status::OK();
74 }
75 };
76
77 // A file abstraction for randomly reading the contents of a file.
78 class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
79 BlueFS *fs;
80 BlueFS::FileReader *h;
81 public:
82 BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
83 ~BlueRocksRandomAccessFile() override {
84 delete h;
85 }
86
87 // Read up to "n" bytes from the file starting at "offset".
88 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
89 // to the data that was read (including if fewer than "n" bytes were
90 // successfully read). May set "*result" to point at data in
91 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
92 // "*result" is used. If an error was encountered, returns a non-OK
93 // status.
94 //
95 // Safe for concurrent use by multiple threads.
96 rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
97 char* scratch) const override {
98 int r = fs->read_random(h, offset, n, scratch);
99 ceph_assert(r >= 0);
100 *result = rocksdb::Slice(scratch, r);
101 return rocksdb::Status::OK();
102 }
103
104 // Tries to get an unique ID for this file that will be the same each time
105 // the file is opened (and will stay the same while the file is open).
106 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
107 // ID can be created this function returns the length of the ID and places it
108 // in "id"; otherwise, this function returns 0, in which case "id"
109 // may not have been modified.
110 //
111 // This function guarantees, for IDs from a given environment, two unique ids
112 // cannot be made equal to eachother by adding arbitrary bytes to one of
113 // them. That is, no unique ID is the prefix of another.
114 //
115 // This function guarantees that the returned ID will not be interpretable as
116 // a single varint.
117 //
118 // Note: these IDs are only valid for the duration of the process.
119 size_t GetUniqueId(char* id, size_t max_size) const override {
120 return snprintf(id, max_size, "%016llx",
121 (unsigned long long)h->file->fnode.ino);
122 };
123
124 // Readahead the file starting from offset by n bytes for caching.
125 rocksdb::Status Prefetch(uint64_t offset, size_t n) override {
126 fs->read(h, &h->buf, offset, n, nullptr, nullptr);
127 return rocksdb::Status::OK();
128 }
129
130 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
131
132 void Hint(AccessPattern pattern) override {
133 if (pattern == RANDOM)
134 h->buf.max_prefetch = 4096;
135 else if (pattern == SEQUENTIAL)
136 h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
137 }
138
139 // Remove any kind of caching of data from the offset to offset+length
140 // of this file. If the length is 0, then it refers to the end of file.
141 // If the system is not caching the file contents, then this is a noop.
142 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
143 fs->invalidate_cache(h->file, offset, length);
144 return rocksdb::Status::OK();
145 }
146 };
147
148
149 // A file abstraction for sequential writing. The implementation
150 // must provide buffering since callers may append small fragments
151 // at a time to the file.
152 class BlueRocksWritableFile : public rocksdb::WritableFile {
153 BlueFS *fs;
154 BlueFS::FileWriter *h;
155 public:
156 BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
157 ~BlueRocksWritableFile() override {
158 fs->close_writer(h);
159 }
160
161 // Indicates if the class makes use of unbuffered I/O
162 /*bool UseOSBuffer() const {
163 return true;
164 }*/
165
166 // This is needed when you want to allocate
167 // AlignedBuffer for use with file I/O classes
168 // Used for unbuffered file I/O when UseOSBuffer() returns false
169 /*size_t GetRequiredBufferAlignment() const {
170 return c_DefaultPageSize;
171 }*/
172
173 rocksdb::Status Append(const rocksdb::Slice& data) override {
174 h->append(data.data(), data.size());
175 return rocksdb::Status::OK();
176 }
177
178 // Positioned write for unbuffered access default forward
179 // to simple append as most of the tests are buffered by default
180 rocksdb::Status PositionedAppend(
181 const rocksdb::Slice& /* data */,
182 uint64_t /* offset */) override {
183 return rocksdb::Status::NotSupported();
184 }
185
186 // Truncate is necessary to trim the file to the correct size
187 // before closing. It is not always possible to keep track of the file
188 // size due to whole pages writes. The behavior is undefined if called
189 // with other writes to follow.
190 rocksdb::Status Truncate(uint64_t size) override {
191 // we mirror the posix env, which does nothing here; instead, it
192 // truncates to the final size on close. whatever!
193 return rocksdb::Status::OK();
194 //int r = fs->truncate(h, size);
195 // return err_to_status(r);
196 }
197
198 rocksdb::Status Close() override {
199 fs->flush(h, true);
200
201 // mimic posix env, here. shrug.
202 size_t block_size;
203 size_t last_allocated_block;
204 GetPreallocationStatus(&block_size, &last_allocated_block);
205 if (last_allocated_block > 0) {
206 int r = fs->truncate(h, h->pos);
207 if (r < 0)
208 return err_to_status(r);
209 }
210
211 return rocksdb::Status::OK();
212 }
213
214 rocksdb::Status Flush() override {
215 fs->flush(h);
216 return rocksdb::Status::OK();
217 }
218
219 rocksdb::Status Sync() override { // sync data
220 fs->fsync(h);
221 return rocksdb::Status::OK();
222 }
223
224 // true if Sync() and Fsync() are safe to call concurrently with Append()
225 // and Flush().
226 bool IsSyncThreadSafe() const override {
227 return true;
228 }
229
230 // Indicates the upper layers if the current WritableFile implementation
231 // uses direct IO.
232 bool UseDirectIO() const {
233 return false;
234 }
235
236 void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override {
237 h->write_hint = (const int)hint;
238 }
239
240 /*
241 * Get the size of valid data in the file.
242 */
243 uint64_t GetFileSize() override {
244 return h->file->fnode.size + h->buffer.length();;
245 }
246
247 // For documentation, refer to RandomAccessFile::GetUniqueId()
248 size_t GetUniqueId(char* id, size_t max_size) const override {
249 return snprintf(id, max_size, "%016llx",
250 (unsigned long long)h->file->fnode.ino);
251 }
252
253 // Remove any kind of caching of data from the offset to offset+length
254 // of this file. If the length is 0, then it refers to the end of file.
255 // If the system is not caching the file contents, then this is a noop.
256 // This call has no effect on dirty pages in the cache.
257 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
258 fs->invalidate_cache(h->file, offset, length);
259 return rocksdb::Status::OK();
260 }
261
262 using rocksdb::WritableFile::RangeSync;
263 // Sync a file range with disk.
264 // offset is the starting byte of the file range to be synchronized.
265 // nbytes specifies the length of the range to be synchronized.
266 // This asks the OS to initiate flushing the cached data to disk,
267 // without waiting for completion.
268 // Default implementation does nothing.
269 rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
270 // round down to page boundaries
271 int partial = offset & 4095;
272 offset -= partial;
273 nbytes += partial;
274 nbytes &= ~4095;
275 if (nbytes)
276 fs->flush_range(h, offset, nbytes);
277 return rocksdb::Status::OK();
278 }
279
280 protected:
281 using rocksdb::WritableFile::Allocate;
282 /*
283 * Pre-allocate space for a file.
284 */
285 rocksdb::Status Allocate(off_t offset, off_t len) {
286 int r = fs->preallocate(h->file, offset, len);
287 return err_to_status(r);
288 }
289 };
290
291
292 // Directory object represents collection of files and implements
293 // filesystem operations that can be executed on directories.
294 class BlueRocksDirectory : public rocksdb::Directory {
295 BlueFS *fs;
296 public:
297 explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
298
299 // Fsync directory. Can be called concurrently from multiple threads.
300 rocksdb::Status Fsync() override {
301 // it is sufficient to flush the log.
302 fs->sync_metadata(false);
303 return rocksdb::Status::OK();
304 }
305 };
306
307 // Identifies a locked file.
308 class BlueRocksFileLock : public rocksdb::FileLock {
309 public:
310 BlueFS *fs;
311 BlueFS::FileLock *lock;
312 BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
313 ~BlueRocksFileLock() override {
314 }
315 };
316
317
318 // --------------------
319 // --- BlueRocksEnv ---
320 // --------------------
321
322 BlueRocksEnv::BlueRocksEnv(BlueFS *f)
323 : EnvWrapper(Env::Default()), // forward most of it to POSIX
324 fs(f)
325 {
326
327 }
328
329 rocksdb::Status BlueRocksEnv::NewSequentialFile(
330 const std::string& fname,
331 std::unique_ptr<rocksdb::SequentialFile>* result,
332 const rocksdb::EnvOptions& options)
333 {
334 if (fname[0] == '/')
335 return target()->NewSequentialFile(fname, result, options);
336 std::string dir, file;
337 split(fname, &dir, &file);
338 BlueFS::FileReader *h;
339 int r = fs->open_for_read(dir, file, &h, false);
340 if (r < 0)
341 return err_to_status(r);
342 result->reset(new BlueRocksSequentialFile(fs, h));
343 return rocksdb::Status::OK();
344 }
345
346 rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
347 const std::string& fname,
348 std::unique_ptr<rocksdb::RandomAccessFile>* result,
349 const rocksdb::EnvOptions& options)
350 {
351 std::string dir, file;
352 split(fname, &dir, &file);
353 BlueFS::FileReader *h;
354 int r = fs->open_for_read(dir, file, &h, true);
355 if (r < 0)
356 return err_to_status(r);
357 result->reset(new BlueRocksRandomAccessFile(fs, h));
358 return rocksdb::Status::OK();
359 }
360
361 rocksdb::Status BlueRocksEnv::NewWritableFile(
362 const std::string& fname,
363 std::unique_ptr<rocksdb::WritableFile>* result,
364 const rocksdb::EnvOptions& options)
365 {
366 std::string dir, file;
367 split(fname, &dir, &file);
368 BlueFS::FileWriter *h;
369 int r = fs->open_for_write(dir, file, &h, false);
370 if (r < 0)
371 return err_to_status(r);
372 result->reset(new BlueRocksWritableFile(fs, h));
373 return rocksdb::Status::OK();
374 }
375
376 rocksdb::Status BlueRocksEnv::ReuseWritableFile(
377 const std::string& new_fname,
378 const std::string& old_fname,
379 std::unique_ptr<rocksdb::WritableFile>* result,
380 const rocksdb::EnvOptions& options)
381 {
382 std::string old_dir, old_file;
383 split(old_fname, &old_dir, &old_file);
384 std::string new_dir, new_file;
385 split(new_fname, &new_dir, &new_file);
386
387 int r = fs->rename(old_dir, old_file, new_dir, new_file);
388 if (r < 0)
389 return err_to_status(r);
390
391 BlueFS::FileWriter *h;
392 r = fs->open_for_write(new_dir, new_file, &h, true);
393 if (r < 0)
394 return err_to_status(r);
395 result->reset(new BlueRocksWritableFile(fs, h));
396 return rocksdb::Status::OK();
397 }
398
399 rocksdb::Status BlueRocksEnv::NewDirectory(
400 const std::string& name,
401 std::unique_ptr<rocksdb::Directory>* result)
402 {
403 if (!fs->dir_exists(name))
404 return rocksdb::Status::NotFound(name, strerror(ENOENT));
405 result->reset(new BlueRocksDirectory(fs));
406 return rocksdb::Status::OK();
407 }
408
409 rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
410 {
411 if (fname[0] == '/')
412 return target()->FileExists(fname);
413 std::string dir, file;
414 split(fname, &dir, &file);
415 if (fs->stat(dir, file, NULL, NULL) == 0)
416 return rocksdb::Status::OK();
417 return err_to_status(-ENOENT);
418 }
419
420 rocksdb::Status BlueRocksEnv::GetChildren(
421 const std::string& dir,
422 std::vector<std::string>* result)
423 {
424 result->clear();
425 int r = fs->readdir(dir, result);
426 if (r < 0)
427 return rocksdb::Status::NotFound(dir, strerror(ENOENT));// return err_to_status(r);
428 return rocksdb::Status::OK();
429 }
430
431 rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
432 {
433 std::string dir, file;
434 split(fname, &dir, &file);
435 int r = fs->unlink(dir, file);
436 if (r < 0)
437 return err_to_status(r);
438 return rocksdb::Status::OK();
439 }
440
441 rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
442 {
443 int r = fs->mkdir(dirname);
444 if (r < 0)
445 return err_to_status(r);
446 return rocksdb::Status::OK();
447 }
448
449 rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
450 {
451 int r = fs->mkdir(dirname);
452 if (r < 0 && r != -EEXIST)
453 return err_to_status(r);
454 return rocksdb::Status::OK();
455 }
456
457 rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
458 {
459 int r = fs->rmdir(dirname);
460 if (r < 0)
461 return err_to_status(r);
462 return rocksdb::Status::OK();
463 }
464
465 rocksdb::Status BlueRocksEnv::GetFileSize(
466 const std::string& fname,
467 uint64_t* file_size)
468 {
469 std::string dir, file;
470 split(fname, &dir, &file);
471 int r = fs->stat(dir, file, file_size, NULL);
472 if (r < 0)
473 return err_to_status(r);
474 return rocksdb::Status::OK();
475 }
476
477 rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
478 uint64_t* file_mtime)
479 {
480 std::string dir, file;
481 split(fname, &dir, &file);
482 utime_t mtime;
483 int r = fs->stat(dir, file, NULL, &mtime);
484 if (r < 0)
485 return err_to_status(r);
486 *file_mtime = mtime.sec();
487 return rocksdb::Status::OK();
488 }
489
490 rocksdb::Status BlueRocksEnv::RenameFile(
491 const std::string& src,
492 const std::string& target)
493 {
494 std::string old_dir, old_file;
495 split(src, &old_dir, &old_file);
496 std::string new_dir, new_file;
497 split(target, &new_dir, &new_file);
498
499 int r = fs->rename(old_dir, old_file, new_dir, new_file);
500 if (r < 0)
501 return err_to_status(r);
502 return rocksdb::Status::OK();
503 }
504
505 rocksdb::Status BlueRocksEnv::LinkFile(
506 const std::string& src,
507 const std::string& target)
508 {
509 ceph_abort();
510 }
511
512 rocksdb::Status BlueRocksEnv::AreFilesSame(
513 const std::string& first,
514 const std::string& second, bool* res)
515 {
516 for (auto& path : {first, second}) {
517 if (fs->dir_exists(path)) {
518 continue;
519 }
520 std::string dir, file;
521 split(path, &dir, &file);
522 int r = fs->stat(dir, file, nullptr, nullptr);
523 if (!r) {
524 continue;
525 } else if (r == -ENOENT) {
526 return rocksdb::Status::NotFound("AreFilesSame", path);
527 } else {
528 return err_to_status(r);
529 }
530 }
531 *res = (first == second);
532 return rocksdb::Status::OK();
533 }
534
535 rocksdb::Status BlueRocksEnv::LockFile(
536 const std::string& fname,
537 rocksdb::FileLock** lock)
538 {
539 std::string dir, file;
540 split(fname, &dir, &file);
541 BlueFS::FileLock *l = NULL;
542 int r = fs->lock_file(dir, file, &l);
543 if (r < 0)
544 return err_to_status(r);
545 *lock = new BlueRocksFileLock(fs, l);
546 return rocksdb::Status::OK();
547 }
548
549 rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
550 {
551 BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
552 int r = fs->unlock_file(l->lock);
553 if (r < 0)
554 return err_to_status(r);
555 delete lock;
556 lock = nullptr;
557 return rocksdb::Status::OK();
558 }
559
560 rocksdb::Status BlueRocksEnv::GetAbsolutePath(
561 const std::string& db_path,
562 std::string* output_path)
563 {
564 // this is a lie...
565 *output_path = "/" + db_path;
566 return rocksdb::Status::OK();
567 }
568
569 rocksdb::Status BlueRocksEnv::NewLogger(
570 const std::string& fname,
571 std::shared_ptr<rocksdb::Logger>* result)
572 {
573 // ignore the filename :)
574 result->reset(create_rocksdb_ceph_logger());
575 return rocksdb::Status::OK();
576 }
577
578 rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
579 {
580 static int foo = 0;
581 *path = "temp_" + stringify(++foo);
582 return rocksdb::Status::OK();
583 }