]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueRocksEnv.cc
update sources to v12.1.3
[ceph.git] / ceph / src / os / bluestore / BlueRocksEnv.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "BlueRocksEnv.h"
5#include "BlueFS.h"
6#include "include/stringify.h"
7#include "kv/RocksDBStore.h"
8
9rocksdb::Status err_to_status(int r)
10{
11 switch (r) {
12 case 0:
13 return rocksdb::Status::OK();
14 case -ENOENT:
15 return rocksdb::Status::NotFound(rocksdb::Status::kNone);
16 case -EINVAL:
17 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
18 case -EIO:
19 return rocksdb::Status::IOError(rocksdb::Status::kNone);
20 default:
21 // FIXME :(
22 assert(0 == "unrecognized error code");
23 return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
24 }
25}
26
27// A file abstraction for reading sequentially through a file
28class BlueRocksSequentialFile : public rocksdb::SequentialFile {
29 BlueFS *fs;
30 BlueFS::FileReader *h;
31 public:
32 BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
33 ~BlueRocksSequentialFile() override {
34 delete h;
35 }
36
37 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
38 // written by this routine. Sets "*result" to the data that was
39 // read (including if fewer than "n" bytes were successfully read).
40 // May set "*result" to point at data in "scratch[0..n-1]", so
41 // "scratch[0..n-1]" must be live when "*result" is used.
42 // If an error was encountered, returns a non-OK status.
43 //
44 // REQUIRES: External synchronization
45 rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
46 int r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch);
47 assert(r >= 0);
48 *result = rocksdb::Slice(scratch, r);
49 return rocksdb::Status::OK();
50 }
51
52 // Skip "n" bytes from the file. This is guaranteed to be no
53 // slower that reading the same data, but may be faster.
54 //
55 // If end of file is reached, skipping will stop at the end of the
56 // file, and Skip will return OK.
57 //
58 // REQUIRES: External synchronization
59 rocksdb::Status Skip(uint64_t n) override {
60 h->buf.skip(n);
61 return rocksdb::Status::OK();
62 }
63
64 // Remove any kind of caching of data from the offset to offset+length
65 // of this file. If the length is 0, then it refers to the end of file.
66 // If the system is not caching the file contents, then this is a noop.
67 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
68 fs->invalidate_cache(h->file, offset, length);
69 return rocksdb::Status::OK();
70 }
71};
72
73// A file abstraction for randomly reading the contents of a file.
74class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
75 BlueFS *fs;
76 BlueFS::FileReader *h;
77 public:
78 BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
79 ~BlueRocksRandomAccessFile() override {
80 delete h;
81 }
82
83 // Read up to "n" bytes from the file starting at "offset".
84 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
85 // to the data that was read (including if fewer than "n" bytes were
86 // successfully read). May set "*result" to point at data in
87 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
88 // "*result" is used. If an error was encountered, returns a non-OK
89 // status.
90 //
91 // Safe for concurrent use by multiple threads.
92 rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
93 char* scratch) const override {
94 int r = fs->read_random(h, offset, n, scratch);
95 assert(r >= 0);
96 *result = rocksdb::Slice(scratch, r);
97 return rocksdb::Status::OK();
98 }
99
7c673cae
FG
100 // Tries to get an unique ID for this file that will be the same each time
101 // the file is opened (and will stay the same while the file is open).
102 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
103 // ID can be created this function returns the length of the ID and places it
104 // in "id"; otherwise, this function returns 0, in which case "id"
105 // may not have been modified.
106 //
107 // This function guarantees, for IDs from a given environment, two unique ids
108 // cannot be made equal to eachother by adding arbitrary bytes to one of
109 // them. That is, no unique ID is the prefix of another.
110 //
111 // This function guarantees that the returned ID will not be interpretable as
112 // a single varint.
113 //
114 // Note: these IDs are only valid for the duration of the process.
115 size_t GetUniqueId(char* id, size_t max_size) const override {
116 return snprintf(id, max_size, "%016llx",
117 (unsigned long long)h->file->fnode.ino);
118 };
119
120 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
121
122 void Hint(AccessPattern pattern) override {
123 if (pattern == RANDOM)
124 h->buf.max_prefetch = 4096;
125 else if (pattern == SEQUENTIAL)
126 h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
127 }
128
129 // Remove any kind of caching of data from the offset to offset+length
130 // of this file. If the length is 0, then it refers to the end of file.
131 // If the system is not caching the file contents, then this is a noop.
132 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
133 fs->invalidate_cache(h->file, offset, length);
134 return rocksdb::Status::OK();
135 }
136};
137
138
139// A file abstraction for sequential writing. The implementation
140// must provide buffering since callers may append small fragments
141// at a time to the file.
142class BlueRocksWritableFile : public rocksdb::WritableFile {
143 BlueFS *fs;
144 BlueFS::FileWriter *h;
145 public:
146 BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
147 ~BlueRocksWritableFile() override {
148 fs->close_writer(h);
149 }
150
151 // Indicates if the class makes use of unbuffered I/O
152 /*bool UseOSBuffer() const {
153 return true;
154 }*/
155
156 // This is needed when you want to allocate
157 // AlignedBuffer for use with file I/O classes
158 // Used for unbuffered file I/O when UseOSBuffer() returns false
159 /*size_t GetRequiredBufferAlignment() const {
160 return c_DefaultPageSize;
161 }*/
162
163 rocksdb::Status Append(const rocksdb::Slice& data) override {
164 h->append(data.data(), data.size());
165 return rocksdb::Status::OK();
166 }
167
168 // Positioned write for unbuffered access default forward
169 // to simple append as most of the tests are buffered by default
170 rocksdb::Status PositionedAppend(
171 const rocksdb::Slice& /* data */,
172 uint64_t /* offset */) override {
173 return rocksdb::Status::NotSupported();
174 }
175
176 // Truncate is necessary to trim the file to the correct size
177 // before closing. It is not always possible to keep track of the file
178 // size due to whole pages writes. The behavior is undefined if called
179 // with other writes to follow.
180 rocksdb::Status Truncate(uint64_t size) override {
181 // we mirror the posix env, which does nothing here; instead, it
182 // truncates to the final size on close. whatever!
183 return rocksdb::Status::OK();
184 //int r = fs->truncate(h, size);
185 // return err_to_status(r);
186 }
187
188 rocksdb::Status Close() override {
189 Flush();
190
191 // mimic posix env, here. shrug.
192 size_t block_size;
193 size_t last_allocated_block;
194 GetPreallocationStatus(&block_size, &last_allocated_block);
195 if (last_allocated_block > 0) {
196 int r = fs->truncate(h, h->pos);
197 if (r < 0)
198 return err_to_status(r);
199 }
200
201 return rocksdb::Status::OK();
202 }
203
204 rocksdb::Status Flush() override {
205 fs->flush(h);
206 return rocksdb::Status::OK();
207 }
208
209 rocksdb::Status Sync() override { // sync data
210 fs->fsync(h);
211 return rocksdb::Status::OK();
212 }
213
214 // true if Sync() and Fsync() are safe to call concurrently with Append()
215 // and Flush().
216 bool IsSyncThreadSafe() const override {
217 return true;
218 }
219
220 // Indicates the upper layers if the current WritableFile implementation
221 // uses direct IO.
222 bool UseDirectIO() const {
223 return false;
224 }
225
226 /*
227 * Get the size of valid data in the file.
228 */
229 uint64_t GetFileSize() override {
230 return h->file->fnode.size + h->buffer.length();;
231 }
232
233 // For documentation, refer to RandomAccessFile::GetUniqueId()
234 size_t GetUniqueId(char* id, size_t max_size) const override {
235 return snprintf(id, max_size, "%016llx",
236 (unsigned long long)h->file->fnode.ino);
237 }
238
239 // Remove any kind of caching of data from the offset to offset+length
240 // of this file. If the length is 0, then it refers to the end of file.
241 // If the system is not caching the file contents, then this is a noop.
242 // This call has no effect on dirty pages in the cache.
243 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
244 fs->invalidate_cache(h->file, offset, length);
245 return rocksdb::Status::OK();
246 }
247
248 using rocksdb::WritableFile::RangeSync;
249 // Sync a file range with disk.
250 // offset is the starting byte of the file range to be synchronized.
251 // nbytes specifies the length of the range to be synchronized.
252 // This asks the OS to initiate flushing the cached data to disk,
253 // without waiting for completion.
254 // Default implementation does nothing.
255 rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
256 // round down to page boundaries
257 int partial = offset & 4095;
258 offset -= partial;
259 nbytes += partial;
260 nbytes &= ~4095;
261 if (nbytes)
262 fs->flush_range(h, offset, nbytes);
263 return rocksdb::Status::OK();
264 }
265
266 protected:
267 using rocksdb::WritableFile::Allocate;
268 /*
269 * Pre-allocate space for a file.
270 */
271 rocksdb::Status Allocate(off_t offset, off_t len) {
272 int r = fs->preallocate(h->file, offset, len);
273 return err_to_status(r);
274 }
275};
276
277
278// Directory object represents collection of files and implements
279// filesystem operations that can be executed on directories.
280class BlueRocksDirectory : public rocksdb::Directory {
281 BlueFS *fs;
282 public:
283 explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
284
285 // Fsync directory. Can be called concurrently from multiple threads.
286 rocksdb::Status Fsync() override {
287 // it is sufficient to flush the log.
288 fs->sync_metadata();
289 return rocksdb::Status::OK();
290 }
291};
292
293// Identifies a locked file.
294class BlueRocksFileLock : public rocksdb::FileLock {
295 public:
296 BlueFS *fs;
297 BlueFS::FileLock *lock;
298 BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
299 ~BlueRocksFileLock() override {
300 }
301};
302
303
304// --------------------
305// --- BlueRocksEnv ---
306// --------------------
307
308BlueRocksEnv::BlueRocksEnv(BlueFS *f)
309 : EnvWrapper(Env::Default()), // forward most of it to POSIX
310 fs(f)
311{
312
313}
314
315rocksdb::Status BlueRocksEnv::NewSequentialFile(
316 const std::string& fname,
317 std::unique_ptr<rocksdb::SequentialFile>* result,
318 const rocksdb::EnvOptions& options)
319{
320 if (fname[0] == '/')
321 return target()->NewSequentialFile(fname, result, options);
322 std::string dir, file;
323 split(fname, &dir, &file);
324 BlueFS::FileReader *h;
325 int r = fs->open_for_read(dir, file, &h, false);
326 if (r < 0)
327 return err_to_status(r);
328 result->reset(new BlueRocksSequentialFile(fs, h));
329 return rocksdb::Status::OK();
330}
331
332rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
333 const std::string& fname,
334 std::unique_ptr<rocksdb::RandomAccessFile>* result,
335 const rocksdb::EnvOptions& options)
336{
337 std::string dir, file;
338 split(fname, &dir, &file);
339 BlueFS::FileReader *h;
340 int r = fs->open_for_read(dir, file, &h, true);
341 if (r < 0)
342 return err_to_status(r);
343 result->reset(new BlueRocksRandomAccessFile(fs, h));
344 return rocksdb::Status::OK();
345}
346
347rocksdb::Status BlueRocksEnv::NewWritableFile(
348 const std::string& fname,
349 std::unique_ptr<rocksdb::WritableFile>* result,
350 const rocksdb::EnvOptions& options)
351{
352 std::string dir, file;
353 split(fname, &dir, &file);
354 BlueFS::FileWriter *h;
355 int r = fs->open_for_write(dir, file, &h, false);
356 if (r < 0)
357 return err_to_status(r);
358 result->reset(new BlueRocksWritableFile(fs, h));
359 return rocksdb::Status::OK();
360}
361
362rocksdb::Status BlueRocksEnv::ReuseWritableFile(
363 const std::string& new_fname,
364 const std::string& old_fname,
365 std::unique_ptr<rocksdb::WritableFile>* result,
366 const rocksdb::EnvOptions& options)
367{
368 std::string old_dir, old_file;
369 split(old_fname, &old_dir, &old_file);
370 std::string new_dir, new_file;
371 split(new_fname, &new_dir, &new_file);
372
373 int r = fs->rename(old_dir, old_file, new_dir, new_file);
374 if (r < 0)
375 return err_to_status(r);
376
377 BlueFS::FileWriter *h;
378 r = fs->open_for_write(new_dir, new_file, &h, true);
379 if (r < 0)
380 return err_to_status(r);
381 result->reset(new BlueRocksWritableFile(fs, h));
382 return rocksdb::Status::OK();
383}
384
385rocksdb::Status BlueRocksEnv::NewDirectory(
386 const std::string& name,
387 std::unique_ptr<rocksdb::Directory>* result)
388{
389 if (!fs->dir_exists(name))
390 return rocksdb::Status::IOError(name, strerror(ENOENT));
391 result->reset(new BlueRocksDirectory(fs));
392 return rocksdb::Status::OK();
393}
394
395rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
396{
397 if (fname[0] == '/')
398 return target()->FileExists(fname);
399 std::string dir, file;
400 split(fname, &dir, &file);
401 if (fs->stat(dir, file, NULL, NULL) == 0)
402 return rocksdb::Status::OK();
403 return err_to_status(-ENOENT);
404}
405
406rocksdb::Status BlueRocksEnv::GetChildren(
407 const std::string& dir,
408 std::vector<std::string>* result)
409{
d2e6a577 410 result->clear();
7c673cae
FG
411 int r = fs->readdir(dir, result);
412 if (r < 0)
413 return rocksdb::Status::IOError(dir, strerror(ENOENT));// return err_to_status(r);
414 return rocksdb::Status::OK();
415}
416
417rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
418{
419 std::string dir, file;
420 split(fname, &dir, &file);
421 int r = fs->unlink(dir, file);
422 if (r < 0)
423 return err_to_status(r);
424 return rocksdb::Status::OK();
425}
426
427rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
428{
429 int r = fs->mkdir(dirname);
430 if (r < 0)
431 return err_to_status(r);
432 return rocksdb::Status::OK();
433}
434
435rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
436{
437 int r = fs->mkdir(dirname);
438 if (r < 0 && r != -EEXIST)
439 return err_to_status(r);
440 return rocksdb::Status::OK();
441}
442
443rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
444{
445 int r = fs->rmdir(dirname);
446 if (r < 0)
447 return err_to_status(r);
448 return rocksdb::Status::OK();
449}
450
451rocksdb::Status BlueRocksEnv::GetFileSize(
452 const std::string& fname,
453 uint64_t* file_size)
454{
455 std::string dir, file;
456 split(fname, &dir, &file);
457 int r = fs->stat(dir, file, file_size, NULL);
458 if (r < 0)
459 return err_to_status(r);
460 return rocksdb::Status::OK();
461}
462
463rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
464 uint64_t* file_mtime)
465{
466 std::string dir, file;
467 split(fname, &dir, &file);
468 utime_t mtime;
469 int r = fs->stat(dir, file, NULL, &mtime);
470 if (r < 0)
471 return err_to_status(r);
472 *file_mtime = mtime.sec();
473 return rocksdb::Status::OK();
474}
475
476rocksdb::Status BlueRocksEnv::RenameFile(
477 const std::string& src,
478 const std::string& target)
479{
480 std::string old_dir, old_file;
481 split(src, &old_dir, &old_file);
482 std::string new_dir, new_file;
483 split(target, &new_dir, &new_file);
484
485 int r = fs->rename(old_dir, old_file, new_dir, new_file);
486 if (r < 0)
487 return err_to_status(r);
488 return rocksdb::Status::OK();
489}
490
491rocksdb::Status BlueRocksEnv::LinkFile(
492 const std::string& src,
493 const std::string& target)
494{
495 ceph_abort();
496}
497
498rocksdb::Status BlueRocksEnv::LockFile(
499 const std::string& fname,
500 rocksdb::FileLock** lock)
501{
502 std::string dir, file;
503 split(fname, &dir, &file);
504 BlueFS::FileLock *l = NULL;
505 int r = fs->lock_file(dir, file, &l);
506 if (r < 0)
507 return err_to_status(r);
508 *lock = new BlueRocksFileLock(fs, l);
509 return rocksdb::Status::OK();
510}
511
512rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
513{
514 BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
515 int r = fs->unlock_file(l->lock);
516 if (r < 0)
517 return err_to_status(r);
518 delete lock;
519 return rocksdb::Status::OK();
520}
521
522rocksdb::Status BlueRocksEnv::GetAbsolutePath(
523 const std::string& db_path,
524 std::string* output_path)
525{
526 // this is a lie...
527 *output_path = "/" + db_path;
528 return rocksdb::Status::OK();
529}
530
531rocksdb::Status BlueRocksEnv::NewLogger(
532 const std::string& fname,
533 std::shared_ptr<rocksdb::Logger>* result)
534{
535 // ignore the filename :)
536 result->reset(create_rocksdb_ceph_logger());
537 return rocksdb::Status::OK();
538}
539
540rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
541{
542 static int foo = 0;
543 *path = "temp_" + stringify(++foo);
544 return rocksdb::Status::OK();
545}