]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueRocksEnv.cc
1b1e2e903572f918ee0df75ba6ae5a3314e13181
[ceph.git] / ceph / src / os / bluestore / BlueRocksEnv.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "BlueRocksEnv.h"
5 #include "BlueFS.h"
6 #include "include/stringify.h"
7 #include "kv/RocksDBStore.h"
8
9 rocksdb::Status err_to_status(int r)
10 {
11 switch (r) {
12 case 0:
13 return rocksdb::Status::OK();
14 case -ENOENT:
15 return rocksdb::Status::NotFound(rocksdb::Status::kNone);
16 case -EINVAL:
17 return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
18 case -EIO:
19 return rocksdb::Status::IOError(rocksdb::Status::kNone);
20 default:
21 // FIXME :(
22 assert(0 == "unrecognized error code");
23 return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
24 }
25 }
26
27 // A file abstraction for reading sequentially through a file
28 class BlueRocksSequentialFile : public rocksdb::SequentialFile {
29 BlueFS *fs;
30 BlueFS::FileReader *h;
31 public:
32 BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
33 ~BlueRocksSequentialFile() override {
34 delete h;
35 }
36
37 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
38 // written by this routine. Sets "*result" to the data that was
39 // read (including if fewer than "n" bytes were successfully read).
40 // May set "*result" to point at data in "scratch[0..n-1]", so
41 // "scratch[0..n-1]" must be live when "*result" is used.
42 // If an error was encountered, returns a non-OK status.
43 //
44 // REQUIRES: External synchronization
45 rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
46 int r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch);
47 assert(r >= 0);
48 *result = rocksdb::Slice(scratch, r);
49 return rocksdb::Status::OK();
50 }
51
52 // Skip "n" bytes from the file. This is guaranteed to be no
53 // slower that reading the same data, but may be faster.
54 //
55 // If end of file is reached, skipping will stop at the end of the
56 // file, and Skip will return OK.
57 //
58 // REQUIRES: External synchronization
59 rocksdb::Status Skip(uint64_t n) override {
60 h->buf.skip(n);
61 return rocksdb::Status::OK();
62 }
63
64 // Remove any kind of caching of data from the offset to offset+length
65 // of this file. If the length is 0, then it refers to the end of file.
66 // If the system is not caching the file contents, then this is a noop.
67 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
68 fs->invalidate_cache(h->file, offset, length);
69 return rocksdb::Status::OK();
70 }
71 };
72
73 // A file abstraction for randomly reading the contents of a file.
74 class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
75 BlueFS *fs;
76 BlueFS::FileReader *h;
77 public:
78 BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
79 ~BlueRocksRandomAccessFile() override {
80 delete h;
81 }
82
83 // Read up to "n" bytes from the file starting at "offset".
84 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
85 // to the data that was read (including if fewer than "n" bytes were
86 // successfully read). May set "*result" to point at data in
87 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
88 // "*result" is used. If an error was encountered, returns a non-OK
89 // status.
90 //
91 // Safe for concurrent use by multiple threads.
92 rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
93 char* scratch) const override {
94 int r = fs->read_random(h, offset, n, scratch);
95 assert(r >= 0);
96 *result = rocksdb::Slice(scratch, r);
97 return rocksdb::Status::OK();
98 }
99
100 // Tries to get an unique ID for this file that will be the same each time
101 // the file is opened (and will stay the same while the file is open).
102 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
103 // ID can be created this function returns the length of the ID and places it
104 // in "id"; otherwise, this function returns 0, in which case "id"
105 // may not have been modified.
106 //
107 // This function guarantees, for IDs from a given environment, two unique ids
108 // cannot be made equal to eachother by adding arbitrary bytes to one of
109 // them. That is, no unique ID is the prefix of another.
110 //
111 // This function guarantees that the returned ID will not be interpretable as
112 // a single varint.
113 //
114 // Note: these IDs are only valid for the duration of the process.
115 size_t GetUniqueId(char* id, size_t max_size) const override {
116 return snprintf(id, max_size, "%016llx",
117 (unsigned long long)h->file->fnode.ino);
118 };
119
120 //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
121
122 void Hint(AccessPattern pattern) override {
123 if (pattern == RANDOM)
124 h->buf.max_prefetch = 4096;
125 else if (pattern == SEQUENTIAL)
126 h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
127 }
128
129 // Remove any kind of caching of data from the offset to offset+length
130 // of this file. If the length is 0, then it refers to the end of file.
131 // If the system is not caching the file contents, then this is a noop.
132 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
133 fs->invalidate_cache(h->file, offset, length);
134 return rocksdb::Status::OK();
135 }
136 };
137
138
139 // A file abstraction for sequential writing. The implementation
140 // must provide buffering since callers may append small fragments
141 // at a time to the file.
142 class BlueRocksWritableFile : public rocksdb::WritableFile {
143 BlueFS *fs;
144 BlueFS::FileWriter *h;
145 public:
146 BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
147 ~BlueRocksWritableFile() override {
148 fs->close_writer(h);
149 }
150
151 // Indicates if the class makes use of unbuffered I/O
152 /*bool UseOSBuffer() const {
153 return true;
154 }*/
155
156 // This is needed when you want to allocate
157 // AlignedBuffer for use with file I/O classes
158 // Used for unbuffered file I/O when UseOSBuffer() returns false
159 /*size_t GetRequiredBufferAlignment() const {
160 return c_DefaultPageSize;
161 }*/
162
163 rocksdb::Status Append(const rocksdb::Slice& data) override {
164 h->append(data.data(), data.size());
165 return rocksdb::Status::OK();
166 }
167
168 // Positioned write for unbuffered access default forward
169 // to simple append as most of the tests are buffered by default
170 rocksdb::Status PositionedAppend(
171 const rocksdb::Slice& /* data */,
172 uint64_t /* offset */) override {
173 return rocksdb::Status::NotSupported();
174 }
175
176 // Truncate is necessary to trim the file to the correct size
177 // before closing. It is not always possible to keep track of the file
178 // size due to whole pages writes. The behavior is undefined if called
179 // with other writes to follow.
180 rocksdb::Status Truncate(uint64_t size) override {
181 // we mirror the posix env, which does nothing here; instead, it
182 // truncates to the final size on close. whatever!
183 return rocksdb::Status::OK();
184 //int r = fs->truncate(h, size);
185 // return err_to_status(r);
186 }
187
188 rocksdb::Status Close() override {
189 Flush();
190
191 // mimic posix env, here. shrug.
192 size_t block_size;
193 size_t last_allocated_block;
194 GetPreallocationStatus(&block_size, &last_allocated_block);
195 if (last_allocated_block > 0) {
196 int r = fs->truncate(h, h->pos);
197 if (r < 0)
198 return err_to_status(r);
199 }
200
201 return rocksdb::Status::OK();
202 }
203
204 rocksdb::Status Flush() override {
205 fs->flush(h);
206 return rocksdb::Status::OK();
207 }
208
209 rocksdb::Status Sync() override { // sync data
210 fs->fsync(h);
211 return rocksdb::Status::OK();
212 }
213
214 // true if Sync() and Fsync() are safe to call concurrently with Append()
215 // and Flush().
216 bool IsSyncThreadSafe() const override {
217 return true;
218 }
219
220 // Indicates the upper layers if the current WritableFile implementation
221 // uses direct IO.
222 bool UseDirectIO() const {
223 return false;
224 }
225
226 /*
227 * Get the size of valid data in the file.
228 */
229 uint64_t GetFileSize() override {
230 return h->file->fnode.size + h->buffer.length();;
231 }
232
233 // For documentation, refer to RandomAccessFile::GetUniqueId()
234 size_t GetUniqueId(char* id, size_t max_size) const override {
235 return snprintf(id, max_size, "%016llx",
236 (unsigned long long)h->file->fnode.ino);
237 }
238
239 // Remove any kind of caching of data from the offset to offset+length
240 // of this file. If the length is 0, then it refers to the end of file.
241 // If the system is not caching the file contents, then this is a noop.
242 // This call has no effect on dirty pages in the cache.
243 rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
244 fs->invalidate_cache(h->file, offset, length);
245 return rocksdb::Status::OK();
246 }
247
248 using rocksdb::WritableFile::RangeSync;
249 // Sync a file range with disk.
250 // offset is the starting byte of the file range to be synchronized.
251 // nbytes specifies the length of the range to be synchronized.
252 // This asks the OS to initiate flushing the cached data to disk,
253 // without waiting for completion.
254 // Default implementation does nothing.
255 rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
256 // round down to page boundaries
257 int partial = offset & 4095;
258 offset -= partial;
259 nbytes += partial;
260 nbytes &= ~4095;
261 if (nbytes)
262 fs->flush_range(h, offset, nbytes);
263 return rocksdb::Status::OK();
264 }
265
266 protected:
267 using rocksdb::WritableFile::Allocate;
268 /*
269 * Pre-allocate space for a file.
270 */
271 rocksdb::Status Allocate(off_t offset, off_t len) {
272 int r = fs->preallocate(h->file, offset, len);
273 return err_to_status(r);
274 }
275 };
276
277
278 // Directory object represents collection of files and implements
279 // filesystem operations that can be executed on directories.
280 class BlueRocksDirectory : public rocksdb::Directory {
281 BlueFS *fs;
282 public:
283 explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
284
285 // Fsync directory. Can be called concurrently from multiple threads.
286 rocksdb::Status Fsync() override {
287 // it is sufficient to flush the log.
288 fs->sync_metadata();
289 return rocksdb::Status::OK();
290 }
291 };
292
293 // Identifies a locked file.
294 class BlueRocksFileLock : public rocksdb::FileLock {
295 public:
296 BlueFS *fs;
297 BlueFS::FileLock *lock;
298 BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
299 ~BlueRocksFileLock() override {
300 }
301 };
302
303
304 // --------------------
305 // --- BlueRocksEnv ---
306 // --------------------
307
308 BlueRocksEnv::BlueRocksEnv(BlueFS *f)
309 : EnvWrapper(Env::Default()), // forward most of it to POSIX
310 fs(f)
311 {
312
313 }
314
315 rocksdb::Status BlueRocksEnv::NewSequentialFile(
316 const std::string& fname,
317 std::unique_ptr<rocksdb::SequentialFile>* result,
318 const rocksdb::EnvOptions& options)
319 {
320 if (fname[0] == '/')
321 return target()->NewSequentialFile(fname, result, options);
322 std::string dir, file;
323 split(fname, &dir, &file);
324 BlueFS::FileReader *h;
325 int r = fs->open_for_read(dir, file, &h, false);
326 if (r < 0)
327 return err_to_status(r);
328 result->reset(new BlueRocksSequentialFile(fs, h));
329 return rocksdb::Status::OK();
330 }
331
332 rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
333 const std::string& fname,
334 std::unique_ptr<rocksdb::RandomAccessFile>* result,
335 const rocksdb::EnvOptions& options)
336 {
337 std::string dir, file;
338 split(fname, &dir, &file);
339 BlueFS::FileReader *h;
340 int r = fs->open_for_read(dir, file, &h, true);
341 if (r < 0)
342 return err_to_status(r);
343 result->reset(new BlueRocksRandomAccessFile(fs, h));
344 return rocksdb::Status::OK();
345 }
346
347 rocksdb::Status BlueRocksEnv::NewWritableFile(
348 const std::string& fname,
349 std::unique_ptr<rocksdb::WritableFile>* result,
350 const rocksdb::EnvOptions& options)
351 {
352 std::string dir, file;
353 split(fname, &dir, &file);
354 BlueFS::FileWriter *h;
355 int r = fs->open_for_write(dir, file, &h, false);
356 if (r < 0)
357 return err_to_status(r);
358 result->reset(new BlueRocksWritableFile(fs, h));
359 return rocksdb::Status::OK();
360 }
361
362 rocksdb::Status BlueRocksEnv::ReuseWritableFile(
363 const std::string& new_fname,
364 const std::string& old_fname,
365 std::unique_ptr<rocksdb::WritableFile>* result,
366 const rocksdb::EnvOptions& options)
367 {
368 std::string old_dir, old_file;
369 split(old_fname, &old_dir, &old_file);
370 std::string new_dir, new_file;
371 split(new_fname, &new_dir, &new_file);
372
373 int r = fs->rename(old_dir, old_file, new_dir, new_file);
374 if (r < 0)
375 return err_to_status(r);
376
377 BlueFS::FileWriter *h;
378 r = fs->open_for_write(new_dir, new_file, &h, true);
379 if (r < 0)
380 return err_to_status(r);
381 result->reset(new BlueRocksWritableFile(fs, h));
382 return rocksdb::Status::OK();
383 }
384
385 rocksdb::Status BlueRocksEnv::NewDirectory(
386 const std::string& name,
387 std::unique_ptr<rocksdb::Directory>* result)
388 {
389 if (!fs->dir_exists(name))
390 return rocksdb::Status::IOError(name, strerror(ENOENT));
391 result->reset(new BlueRocksDirectory(fs));
392 return rocksdb::Status::OK();
393 }
394
395 rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
396 {
397 if (fname[0] == '/')
398 return target()->FileExists(fname);
399 std::string dir, file;
400 split(fname, &dir, &file);
401 if (fs->stat(dir, file, NULL, NULL) == 0)
402 return rocksdb::Status::OK();
403 return err_to_status(-ENOENT);
404 }
405
406 rocksdb::Status BlueRocksEnv::GetChildren(
407 const std::string& dir,
408 std::vector<std::string>* result)
409 {
410 int r = fs->readdir(dir, result);
411 if (r < 0)
412 return rocksdb::Status::IOError(dir, strerror(ENOENT));// return err_to_status(r);
413 return rocksdb::Status::OK();
414 }
415
416 rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
417 {
418 std::string dir, file;
419 split(fname, &dir, &file);
420 int r = fs->unlink(dir, file);
421 if (r < 0)
422 return err_to_status(r);
423 return rocksdb::Status::OK();
424 }
425
426 rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
427 {
428 int r = fs->mkdir(dirname);
429 if (r < 0)
430 return err_to_status(r);
431 return rocksdb::Status::OK();
432 }
433
434 rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
435 {
436 int r = fs->mkdir(dirname);
437 if (r < 0 && r != -EEXIST)
438 return err_to_status(r);
439 return rocksdb::Status::OK();
440 }
441
442 rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
443 {
444 int r = fs->rmdir(dirname);
445 if (r < 0)
446 return err_to_status(r);
447 return rocksdb::Status::OK();
448 }
449
450 rocksdb::Status BlueRocksEnv::GetFileSize(
451 const std::string& fname,
452 uint64_t* file_size)
453 {
454 std::string dir, file;
455 split(fname, &dir, &file);
456 int r = fs->stat(dir, file, file_size, NULL);
457 if (r < 0)
458 return err_to_status(r);
459 return rocksdb::Status::OK();
460 }
461
462 rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
463 uint64_t* file_mtime)
464 {
465 std::string dir, file;
466 split(fname, &dir, &file);
467 utime_t mtime;
468 int r = fs->stat(dir, file, NULL, &mtime);
469 if (r < 0)
470 return err_to_status(r);
471 *file_mtime = mtime.sec();
472 return rocksdb::Status::OK();
473 }
474
475 rocksdb::Status BlueRocksEnv::RenameFile(
476 const std::string& src,
477 const std::string& target)
478 {
479 std::string old_dir, old_file;
480 split(src, &old_dir, &old_file);
481 std::string new_dir, new_file;
482 split(target, &new_dir, &new_file);
483
484 int r = fs->rename(old_dir, old_file, new_dir, new_file);
485 if (r < 0)
486 return err_to_status(r);
487 return rocksdb::Status::OK();
488 }
489
490 rocksdb::Status BlueRocksEnv::LinkFile(
491 const std::string& src,
492 const std::string& target)
493 {
494 ceph_abort();
495 }
496
497 rocksdb::Status BlueRocksEnv::LockFile(
498 const std::string& fname,
499 rocksdb::FileLock** lock)
500 {
501 std::string dir, file;
502 split(fname, &dir, &file);
503 BlueFS::FileLock *l = NULL;
504 int r = fs->lock_file(dir, file, &l);
505 if (r < 0)
506 return err_to_status(r);
507 *lock = new BlueRocksFileLock(fs, l);
508 return rocksdb::Status::OK();
509 }
510
511 rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
512 {
513 BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
514 int r = fs->unlock_file(l->lock);
515 if (r < 0)
516 return err_to_status(r);
517 delete lock;
518 return rocksdb::Status::OK();
519 }
520
521 rocksdb::Status BlueRocksEnv::GetAbsolutePath(
522 const std::string& db_path,
523 std::string* output_path)
524 {
525 // this is a lie...
526 *output_path = "/" + db_path;
527 return rocksdb::Status::OK();
528 }
529
530 rocksdb::Status BlueRocksEnv::NewLogger(
531 const std::string& fname,
532 std::shared_ptr<rocksdb::Logger>* result)
533 {
534 // ignore the filename :)
535 result->reset(create_rocksdb_ceph_logger());
536 return rocksdb::Status::OK();
537 }
538
539 rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
540 {
541 static int foo = 0;
542 *path = "temp_" + stringify(++foo);
543 return rocksdb::Status::OK();
544 }