]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueRocksEnv.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / os / bluestore / BlueRocksEnv.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "BlueRocksEnv.h"
5#include "BlueFS.h"
6#include "include/stringify.h"
7#include "kv/RocksDBStore.h"
8
9rocksdb::Status err_to_status(int r)
10{
11 switch (r) {
12 case 0:
13 return rocksdb::Status::OK();
14 case -ENOENT:
15 return rocksdb::Status::NotFound(rocksdb::Status::kNone);
16 case -EINVAL:
17 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
18 case -EIO:
19 return rocksdb::Status::IOError(rocksdb::Status::kNone);
20 default:
21 // FIXME :(
22 assert(0 == "unrecognized error code");
23 return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
24 }
25}
26
27// A file abstraction for reading sequentially through a file
28class BlueRocksSequentialFile : public rocksdb::SequentialFile {
29 BlueFS *fs;
30 BlueFS::FileReader *h;
31 public:
32 BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
33 ~BlueRocksSequentialFile() override {
34 delete h;
35 }
36
37 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
38 // written by this routine. Sets "*result" to the data that was
39 // read (including if fewer than "n" bytes were successfully read).
40 // May set "*result" to point at data in "scratch[0..n-1]", so
41 // "scratch[0..n-1]" must be live when "*result" is used.
42 // If an error was encountered, returns a non-OK status.
43 //
44 // REQUIRES: External synchronization
45 rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
46 int r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch);
47 assert(r >= 0);
48 *result = rocksdb::Slice(scratch, r);
49 return rocksdb::Status::OK();
50 }
51
52 // Skip "n" bytes from the file. This is guaranteed to be no
53 // slower that reading the same data, but may be faster.
54 //
55 // If end of file is reached, skipping will stop at the end of the
56 // file, and Skip will return OK.
57 //
58 // REQUIRES: External synchronization
59 rocksdb::Status Skip(uint64_t n) override {
60 h->buf.skip(n);
61 return rocksdb::Status::OK();
62 }
63
64 // Remove any kind of caching of data from the offset to offset+length
65 // of this file. If the length is 0, then it refers to the end of file.
66 // If the system is not caching the file contents, then this is a noop.
67 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
68 fs->invalidate_cache(h->file, offset, length);
69 return rocksdb::Status::OK();
70 }
71};
72
73// A file abstraction for randomly reading the contents of a file.
74class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
75 BlueFS *fs;
76 BlueFS::FileReader *h;
77 public:
78 BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
79 ~BlueRocksRandomAccessFile() override {
80 delete h;
81 }
82
83 // Read up to "n" bytes from the file starting at "offset".
84 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
85 // to the data that was read (including if fewer than "n" bytes were
86 // successfully read). May set "*result" to point at data in
87 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
88 // "*result" is used. If an error was encountered, returns a non-OK
89 // status.
90 //
91 // Safe for concurrent use by multiple threads.
92 rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
93 char* scratch) const override {
94 int r = fs->read_random(h, offset, n, scratch);
95 assert(r >= 0);
96 *result = rocksdb::Slice(scratch, r);
97 return rocksdb::Status::OK();
98 }
99
100 // Used by the file_reader_writer to decide if the ReadAhead wrapper
101 // should simply forward the call and do not enact buffering or locking.
102 bool ShouldForwardRawRequest() const override {
103 return false;
104 }
105
106 // For cases when read-ahead is implemented in the platform dependent
107 // layer
108 void EnableReadAhead() override {}
109
110 // Tries to get an unique ID for this file that will be the same each time
111 // the file is opened (and will stay the same while the file is open).
112 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
113 // ID can be created this function returns the length of the ID and places it
114 // in "id"; otherwise, this function returns 0, in which case "id"
115 // may not have been modified.
116 //
117 // This function guarantees, for IDs from a given environment, two unique ids
118 // cannot be made equal to eachother by adding arbitrary bytes to one of
119 // them. That is, no unique ID is the prefix of another.
120 //
121 // This function guarantees that the returned ID will not be interpretable as
122 // a single varint.
123 //
124 // Note: these IDs are only valid for the duration of the process.
125 size_t GetUniqueId(char* id, size_t max_size) const override {
126 return snprintf(id, max_size, "%016llx",
127 (unsigned long long)h->file->fnode.ino);
128 };
129
130 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
131
132 void Hint(AccessPattern pattern) override {
133 if (pattern == RANDOM)
134 h->buf.max_prefetch = 4096;
135 else if (pattern == SEQUENTIAL)
136 h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
137 }
138
139 // Remove any kind of caching of data from the offset to offset+length
140 // of this file. If the length is 0, then it refers to the end of file.
141 // If the system is not caching the file contents, then this is a noop.
142 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
143 fs->invalidate_cache(h->file, offset, length);
144 return rocksdb::Status::OK();
145 }
146};
147
148
149// A file abstraction for sequential writing. The implementation
150// must provide buffering since callers may append small fragments
151// at a time to the file.
152class BlueRocksWritableFile : public rocksdb::WritableFile {
153 BlueFS *fs;
154 BlueFS::FileWriter *h;
155 public:
156 BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
157 ~BlueRocksWritableFile() override {
158 fs->close_writer(h);
159 }
160
161 // Indicates if the class makes use of unbuffered I/O
162 /*bool UseOSBuffer() const {
163 return true;
164 }*/
165
166 // This is needed when you want to allocate
167 // AlignedBuffer for use with file I/O classes
168 // Used for unbuffered file I/O when UseOSBuffer() returns false
169 /*size_t GetRequiredBufferAlignment() const {
170 return c_DefaultPageSize;
171 }*/
172
173 rocksdb::Status Append(const rocksdb::Slice& data) override {
174 h->append(data.data(), data.size());
175 return rocksdb::Status::OK();
176 }
177
178 // Positioned write for unbuffered access default forward
179 // to simple append as most of the tests are buffered by default
180 rocksdb::Status PositionedAppend(
181 const rocksdb::Slice& /* data */,
182 uint64_t /* offset */) override {
183 return rocksdb::Status::NotSupported();
184 }
185
186 // Truncate is necessary to trim the file to the correct size
187 // before closing. It is not always possible to keep track of the file
188 // size due to whole pages writes. The behavior is undefined if called
189 // with other writes to follow.
190 rocksdb::Status Truncate(uint64_t size) override {
191 // we mirror the posix env, which does nothing here; instead, it
192 // truncates to the final size on close. whatever!
193 return rocksdb::Status::OK();
194 //int r = fs->truncate(h, size);
195 // return err_to_status(r);
196 }
197
198 rocksdb::Status Close() override {
199 Flush();
200
201 // mimic posix env, here. shrug.
202 size_t block_size;
203 size_t last_allocated_block;
204 GetPreallocationStatus(&block_size, &last_allocated_block);
205 if (last_allocated_block > 0) {
206 int r = fs->truncate(h, h->pos);
207 if (r < 0)
208 return err_to_status(r);
209 }
210
211 return rocksdb::Status::OK();
212 }
213
214 rocksdb::Status Flush() override {
215 fs->flush(h);
216 return rocksdb::Status::OK();
217 }
218
219 rocksdb::Status Sync() override { // sync data
220 fs->fsync(h);
221 return rocksdb::Status::OK();
222 }
223
224 // true if Sync() and Fsync() are safe to call concurrently with Append()
225 // and Flush().
226 bool IsSyncThreadSafe() const override {
227 return true;
228 }
229
230 // Indicates the upper layers if the current WritableFile implementation
231 // uses direct IO.
232 bool UseDirectIO() const {
233 return false;
234 }
235
236 /*
237 * Get the size of valid data in the file.
238 */
239 uint64_t GetFileSize() override {
240 return h->file->fnode.size + h->buffer.length();;
241 }
242
243 // For documentation, refer to RandomAccessFile::GetUniqueId()
244 size_t GetUniqueId(char* id, size_t max_size) const override {
245 return snprintf(id, max_size, "%016llx",
246 (unsigned long long)h->file->fnode.ino);
247 }
248
249 // Remove any kind of caching of data from the offset to offset+length
250 // of this file. If the length is 0, then it refers to the end of file.
251 // If the system is not caching the file contents, then this is a noop.
252 // This call has no effect on dirty pages in the cache.
253 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
254 fs->invalidate_cache(h->file, offset, length);
255 return rocksdb::Status::OK();
256 }
257
258 using rocksdb::WritableFile::RangeSync;
259 // Sync a file range with disk.
260 // offset is the starting byte of the file range to be synchronized.
261 // nbytes specifies the length of the range to be synchronized.
262 // This asks the OS to initiate flushing the cached data to disk,
263 // without waiting for completion.
264 // Default implementation does nothing.
265 rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
266 // round down to page boundaries
267 int partial = offset & 4095;
268 offset -= partial;
269 nbytes += partial;
270 nbytes &= ~4095;
271 if (nbytes)
272 fs->flush_range(h, offset, nbytes);
273 return rocksdb::Status::OK();
274 }
275
276 protected:
277 using rocksdb::WritableFile::Allocate;
278 /*
279 * Pre-allocate space for a file.
280 */
281 rocksdb::Status Allocate(off_t offset, off_t len) {
282 int r = fs->preallocate(h->file, offset, len);
283 return err_to_status(r);
284 }
285};
286
287
288// Directory object represents collection of files and implements
289// filesystem operations that can be executed on directories.
290class BlueRocksDirectory : public rocksdb::Directory {
291 BlueFS *fs;
292 public:
293 explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
294
295 // Fsync directory. Can be called concurrently from multiple threads.
296 rocksdb::Status Fsync() override {
297 // it is sufficient to flush the log.
298 fs->sync_metadata();
299 return rocksdb::Status::OK();
300 }
301};
302
303// Identifies a locked file.
304class BlueRocksFileLock : public rocksdb::FileLock {
305 public:
306 BlueFS *fs;
307 BlueFS::FileLock *lock;
308 BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
309 ~BlueRocksFileLock() override {
310 }
311};
312
313
314// --------------------
315// --- BlueRocksEnv ---
316// --------------------
317
318BlueRocksEnv::BlueRocksEnv(BlueFS *f)
319 : EnvWrapper(Env::Default()), // forward most of it to POSIX
320 fs(f)
321{
322
323}
324
325rocksdb::Status BlueRocksEnv::NewSequentialFile(
326 const std::string& fname,
327 std::unique_ptr<rocksdb::SequentialFile>* result,
328 const rocksdb::EnvOptions& options)
329{
330 if (fname[0] == '/')
331 return target()->NewSequentialFile(fname, result, options);
332 std::string dir, file;
333 split(fname, &dir, &file);
334 BlueFS::FileReader *h;
335 int r = fs->open_for_read(dir, file, &h, false);
336 if (r < 0)
337 return err_to_status(r);
338 result->reset(new BlueRocksSequentialFile(fs, h));
339 return rocksdb::Status::OK();
340}
341
342rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
343 const std::string& fname,
344 std::unique_ptr<rocksdb::RandomAccessFile>* result,
345 const rocksdb::EnvOptions& options)
346{
347 std::string dir, file;
348 split(fname, &dir, &file);
349 BlueFS::FileReader *h;
350 int r = fs->open_for_read(dir, file, &h, true);
351 if (r < 0)
352 return err_to_status(r);
353 result->reset(new BlueRocksRandomAccessFile(fs, h));
354 return rocksdb::Status::OK();
355}
356
357rocksdb::Status BlueRocksEnv::NewWritableFile(
358 const std::string& fname,
359 std::unique_ptr<rocksdb::WritableFile>* result,
360 const rocksdb::EnvOptions& options)
361{
362 std::string dir, file;
363 split(fname, &dir, &file);
364 BlueFS::FileWriter *h;
365 int r = fs->open_for_write(dir, file, &h, false);
366 if (r < 0)
367 return err_to_status(r);
368 result->reset(new BlueRocksWritableFile(fs, h));
369 return rocksdb::Status::OK();
370}
371
372rocksdb::Status BlueRocksEnv::ReuseWritableFile(
373 const std::string& new_fname,
374 const std::string& old_fname,
375 std::unique_ptr<rocksdb::WritableFile>* result,
376 const rocksdb::EnvOptions& options)
377{
378 std::string old_dir, old_file;
379 split(old_fname, &old_dir, &old_file);
380 std::string new_dir, new_file;
381 split(new_fname, &new_dir, &new_file);
382
383 int r = fs->rename(old_dir, old_file, new_dir, new_file);
384 if (r < 0)
385 return err_to_status(r);
386
387 BlueFS::FileWriter *h;
388 r = fs->open_for_write(new_dir, new_file, &h, true);
389 if (r < 0)
390 return err_to_status(r);
391 result->reset(new BlueRocksWritableFile(fs, h));
392 return rocksdb::Status::OK();
393}
394
395rocksdb::Status BlueRocksEnv::NewDirectory(
396 const std::string& name,
397 std::unique_ptr<rocksdb::Directory>* result)
398{
399 if (!fs->dir_exists(name))
400 return rocksdb::Status::IOError(name, strerror(ENOENT));
401 result->reset(new BlueRocksDirectory(fs));
402 return rocksdb::Status::OK();
403}
404
405rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
406{
407 if (fname[0] == '/')
408 return target()->FileExists(fname);
409 std::string dir, file;
410 split(fname, &dir, &file);
411 if (fs->stat(dir, file, NULL, NULL) == 0)
412 return rocksdb::Status::OK();
413 return err_to_status(-ENOENT);
414}
415
416rocksdb::Status BlueRocksEnv::GetChildren(
417 const std::string& dir,
418 std::vector<std::string>* result)
419{
420 int r = fs->readdir(dir, result);
421 if (r < 0)
422 return rocksdb::Status::IOError(dir, strerror(ENOENT));// return err_to_status(r);
423 return rocksdb::Status::OK();
424}
425
426rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
427{
428 std::string dir, file;
429 split(fname, &dir, &file);
430 int r = fs->unlink(dir, file);
431 if (r < 0)
432 return err_to_status(r);
433 return rocksdb::Status::OK();
434}
435
436rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
437{
438 int r = fs->mkdir(dirname);
439 if (r < 0)
440 return err_to_status(r);
441 return rocksdb::Status::OK();
442}
443
444rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
445{
446 int r = fs->mkdir(dirname);
447 if (r < 0 && r != -EEXIST)
448 return err_to_status(r);
449 return rocksdb::Status::OK();
450}
451
452rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
453{
454 int r = fs->rmdir(dirname);
455 if (r < 0)
456 return err_to_status(r);
457 return rocksdb::Status::OK();
458}
459
460rocksdb::Status BlueRocksEnv::GetFileSize(
461 const std::string& fname,
462 uint64_t* file_size)
463{
464 std::string dir, file;
465 split(fname, &dir, &file);
466 int r = fs->stat(dir, file, file_size, NULL);
467 if (r < 0)
468 return err_to_status(r);
469 return rocksdb::Status::OK();
470}
471
472rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
473 uint64_t* file_mtime)
474{
475 std::string dir, file;
476 split(fname, &dir, &file);
477 utime_t mtime;
478 int r = fs->stat(dir, file, NULL, &mtime);
479 if (r < 0)
480 return err_to_status(r);
481 *file_mtime = mtime.sec();
482 return rocksdb::Status::OK();
483}
484
485rocksdb::Status BlueRocksEnv::RenameFile(
486 const std::string& src,
487 const std::string& target)
488{
489 std::string old_dir, old_file;
490 split(src, &old_dir, &old_file);
491 std::string new_dir, new_file;
492 split(target, &new_dir, &new_file);
493
494 int r = fs->rename(old_dir, old_file, new_dir, new_file);
495 if (r < 0)
496 return err_to_status(r);
497 return rocksdb::Status::OK();
498}
499
500rocksdb::Status BlueRocksEnv::LinkFile(
501 const std::string& src,
502 const std::string& target)
503{
504 ceph_abort();
505}
506
507rocksdb::Status BlueRocksEnv::LockFile(
508 const std::string& fname,
509 rocksdb::FileLock** lock)
510{
511 std::string dir, file;
512 split(fname, &dir, &file);
513 BlueFS::FileLock *l = NULL;
514 int r = fs->lock_file(dir, file, &l);
515 if (r < 0)
516 return err_to_status(r);
517 *lock = new BlueRocksFileLock(fs, l);
518 return rocksdb::Status::OK();
519}
520
521rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
522{
523 BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
524 int r = fs->unlock_file(l->lock);
525 if (r < 0)
526 return err_to_status(r);
527 delete lock;
528 return rocksdb::Status::OK();
529}
530
531rocksdb::Status BlueRocksEnv::GetAbsolutePath(
532 const std::string& db_path,
533 std::string* output_path)
534{
535 // this is a lie...
536 *output_path = "/" + db_path;
537 return rocksdb::Status::OK();
538}
539
540rocksdb::Status BlueRocksEnv::NewLogger(
541 const std::string& fname,
542 std::shared_ptr<rocksdb::Logger>* result)
543{
544 // ignore the filename :)
545 result->reset(create_rocksdb_ceph_logger());
546 return rocksdb::Status::OK();
547}
548
549rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
550{
551 static int foo = 0;
552 *path = "temp_" + stringify(++foo);
553 return rocksdb::Status::OK();
554}