]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "BlueRocksEnv.h" | |
5 | #include "BlueFS.h" | |
6 | #include "include/stringify.h" | |
7 | #include "kv/RocksDBStore.h" | |
8 | ||
9 | rocksdb::Status err_to_status(int r) | |
10 | { | |
11 | switch (r) { | |
12 | case 0: | |
13 | return rocksdb::Status::OK(); | |
14 | case -ENOENT: | |
15 | return rocksdb::Status::NotFound(rocksdb::Status::kNone); | |
16 | case -EINVAL: | |
17 | return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone); | |
18 | case -EIO: | |
19 | return rocksdb::Status::IOError(rocksdb::Status::kNone); | |
20 | default: | |
21 | // FIXME :( | |
22 | assert(0 == "unrecognized error code"); | |
23 | return rocksdb::Status::NotSupported(rocksdb::Status::kNone); | |
24 | } | |
25 | } | |
26 | ||
27 | // A file abstraction for reading sequentially through a file | |
28 | class BlueRocksSequentialFile : public rocksdb::SequentialFile { | |
29 | BlueFS *fs; | |
30 | BlueFS::FileReader *h; | |
31 | public: | |
32 | BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {} | |
33 | ~BlueRocksSequentialFile() override { | |
34 | delete h; | |
35 | } | |
36 | ||
37 | // Read up to "n" bytes from the file. "scratch[0..n-1]" may be | |
38 | // written by this routine. Sets "*result" to the data that was | |
39 | // read (including if fewer than "n" bytes were successfully read). | |
40 | // May set "*result" to point at data in "scratch[0..n-1]", so | |
41 | // "scratch[0..n-1]" must be live when "*result" is used. | |
42 | // If an error was encountered, returns a non-OK status. | |
43 | // | |
44 | // REQUIRES: External synchronization | |
45 | rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override { | |
46 | int r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch); | |
47 | assert(r >= 0); | |
48 | *result = rocksdb::Slice(scratch, r); | |
49 | return rocksdb::Status::OK(); | |
50 | } | |
51 | ||
52 | // Skip "n" bytes from the file. This is guaranteed to be no | |
53 | // slower that reading the same data, but may be faster. | |
54 | // | |
55 | // If end of file is reached, skipping will stop at the end of the | |
56 | // file, and Skip will return OK. | |
57 | // | |
58 | // REQUIRES: External synchronization | |
59 | rocksdb::Status Skip(uint64_t n) override { | |
60 | h->buf.skip(n); | |
61 | return rocksdb::Status::OK(); | |
62 | } | |
63 | ||
64 | // Remove any kind of caching of data from the offset to offset+length | |
65 | // of this file. If the length is 0, then it refers to the end of file. | |
66 | // If the system is not caching the file contents, then this is a noop. | |
67 | rocksdb::Status InvalidateCache(size_t offset, size_t length) override { | |
68 | fs->invalidate_cache(h->file, offset, length); | |
69 | return rocksdb::Status::OK(); | |
70 | } | |
71 | }; | |
72 | ||
73 | // A file abstraction for randomly reading the contents of a file. | |
74 | class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile { | |
75 | BlueFS *fs; | |
76 | BlueFS::FileReader *h; | |
77 | public: | |
78 | BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {} | |
79 | ~BlueRocksRandomAccessFile() override { | |
80 | delete h; | |
81 | } | |
82 | ||
83 | // Read up to "n" bytes from the file starting at "offset". | |
84 | // "scratch[0..n-1]" may be written by this routine. Sets "*result" | |
85 | // to the data that was read (including if fewer than "n" bytes were | |
86 | // successfully read). May set "*result" to point at data in | |
87 | // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when | |
88 | // "*result" is used. If an error was encountered, returns a non-OK | |
89 | // status. | |
90 | // | |
91 | // Safe for concurrent use by multiple threads. | |
92 | rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result, | |
93 | char* scratch) const override { | |
94 | int r = fs->read_random(h, offset, n, scratch); | |
95 | assert(r >= 0); | |
96 | *result = rocksdb::Slice(scratch, r); | |
97 | return rocksdb::Status::OK(); | |
98 | } | |
99 | ||
7c673cae FG |
100 | // Tries to get an unique ID for this file that will be the same each time |
101 | // the file is opened (and will stay the same while the file is open). | |
102 | // Furthermore, it tries to make this ID at most "max_size" bytes. If such an | |
103 | // ID can be created this function returns the length of the ID and places it | |
104 | // in "id"; otherwise, this function returns 0, in which case "id" | |
105 | // may not have been modified. | |
106 | // | |
107 | // This function guarantees, for IDs from a given environment, two unique ids | |
108 | // cannot be made equal to eachother by adding arbitrary bytes to one of | |
109 | // them. That is, no unique ID is the prefix of another. | |
110 | // | |
111 | // This function guarantees that the returned ID will not be interpretable as | |
112 | // a single varint. | |
113 | // | |
114 | // Note: these IDs are only valid for the duration of the process. | |
115 | size_t GetUniqueId(char* id, size_t max_size) const override { | |
116 | return snprintf(id, max_size, "%016llx", | |
117 | (unsigned long long)h->file->fnode.ino); | |
118 | }; | |
119 | ||
120 | //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; | |
121 | ||
122 | void Hint(AccessPattern pattern) override { | |
123 | if (pattern == RANDOM) | |
124 | h->buf.max_prefetch = 4096; | |
125 | else if (pattern == SEQUENTIAL) | |
126 | h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch; | |
127 | } | |
128 | ||
129 | // Remove any kind of caching of data from the offset to offset+length | |
130 | // of this file. If the length is 0, then it refers to the end of file. | |
131 | // If the system is not caching the file contents, then this is a noop. | |
132 | rocksdb::Status InvalidateCache(size_t offset, size_t length) override { | |
133 | fs->invalidate_cache(h->file, offset, length); | |
134 | return rocksdb::Status::OK(); | |
135 | } | |
136 | }; | |
137 | ||
138 | ||
139 | // A file abstraction for sequential writing. The implementation | |
140 | // must provide buffering since callers may append small fragments | |
141 | // at a time to the file. | |
142 | class BlueRocksWritableFile : public rocksdb::WritableFile { | |
143 | BlueFS *fs; | |
144 | BlueFS::FileWriter *h; | |
145 | public: | |
146 | BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {} | |
147 | ~BlueRocksWritableFile() override { | |
148 | fs->close_writer(h); | |
149 | } | |
150 | ||
151 | // Indicates if the class makes use of unbuffered I/O | |
152 | /*bool UseOSBuffer() const { | |
153 | return true; | |
154 | }*/ | |
155 | ||
156 | // This is needed when you want to allocate | |
157 | // AlignedBuffer for use with file I/O classes | |
158 | // Used for unbuffered file I/O when UseOSBuffer() returns false | |
159 | /*size_t GetRequiredBufferAlignment() const { | |
160 | return c_DefaultPageSize; | |
161 | }*/ | |
162 | ||
163 | rocksdb::Status Append(const rocksdb::Slice& data) override { | |
164 | h->append(data.data(), data.size()); | |
165 | return rocksdb::Status::OK(); | |
166 | } | |
167 | ||
168 | // Positioned write for unbuffered access default forward | |
169 | // to simple append as most of the tests are buffered by default | |
170 | rocksdb::Status PositionedAppend( | |
171 | const rocksdb::Slice& /* data */, | |
172 | uint64_t /* offset */) override { | |
173 | return rocksdb::Status::NotSupported(); | |
174 | } | |
175 | ||
176 | // Truncate is necessary to trim the file to the correct size | |
177 | // before closing. It is not always possible to keep track of the file | |
178 | // size due to whole pages writes. The behavior is undefined if called | |
179 | // with other writes to follow. | |
180 | rocksdb::Status Truncate(uint64_t size) override { | |
181 | // we mirror the posix env, which does nothing here; instead, it | |
182 | // truncates to the final size on close. whatever! | |
183 | return rocksdb::Status::OK(); | |
184 | //int r = fs->truncate(h, size); | |
185 | // return err_to_status(r); | |
186 | } | |
187 | ||
188 | rocksdb::Status Close() override { | |
189 | Flush(); | |
190 | ||
191 | // mimic posix env, here. shrug. | |
192 | size_t block_size; | |
193 | size_t last_allocated_block; | |
194 | GetPreallocationStatus(&block_size, &last_allocated_block); | |
195 | if (last_allocated_block > 0) { | |
196 | int r = fs->truncate(h, h->pos); | |
197 | if (r < 0) | |
198 | return err_to_status(r); | |
199 | } | |
200 | ||
201 | return rocksdb::Status::OK(); | |
202 | } | |
203 | ||
204 | rocksdb::Status Flush() override { | |
205 | fs->flush(h); | |
206 | return rocksdb::Status::OK(); | |
207 | } | |
208 | ||
209 | rocksdb::Status Sync() override { // sync data | |
210 | fs->fsync(h); | |
211 | return rocksdb::Status::OK(); | |
212 | } | |
213 | ||
214 | // true if Sync() and Fsync() are safe to call concurrently with Append() | |
215 | // and Flush(). | |
216 | bool IsSyncThreadSafe() const override { | |
217 | return true; | |
218 | } | |
219 | ||
220 | // Indicates the upper layers if the current WritableFile implementation | |
221 | // uses direct IO. | |
222 | bool UseDirectIO() const { | |
223 | return false; | |
224 | } | |
225 | ||
226 | /* | |
227 | * Get the size of valid data in the file. | |
228 | */ | |
229 | uint64_t GetFileSize() override { | |
230 | return h->file->fnode.size + h->buffer.length();; | |
231 | } | |
232 | ||
233 | // For documentation, refer to RandomAccessFile::GetUniqueId() | |
234 | size_t GetUniqueId(char* id, size_t max_size) const override { | |
235 | return snprintf(id, max_size, "%016llx", | |
236 | (unsigned long long)h->file->fnode.ino); | |
237 | } | |
238 | ||
239 | // Remove any kind of caching of data from the offset to offset+length | |
240 | // of this file. If the length is 0, then it refers to the end of file. | |
241 | // If the system is not caching the file contents, then this is a noop. | |
242 | // This call has no effect on dirty pages in the cache. | |
243 | rocksdb::Status InvalidateCache(size_t offset, size_t length) override { | |
244 | fs->invalidate_cache(h->file, offset, length); | |
245 | return rocksdb::Status::OK(); | |
246 | } | |
247 | ||
248 | using rocksdb::WritableFile::RangeSync; | |
249 | // Sync a file range with disk. | |
250 | // offset is the starting byte of the file range to be synchronized. | |
251 | // nbytes specifies the length of the range to be synchronized. | |
252 | // This asks the OS to initiate flushing the cached data to disk, | |
253 | // without waiting for completion. | |
254 | // Default implementation does nothing. | |
255 | rocksdb::Status RangeSync(off_t offset, off_t nbytes) { | |
256 | // round down to page boundaries | |
257 | int partial = offset & 4095; | |
258 | offset -= partial; | |
259 | nbytes += partial; | |
260 | nbytes &= ~4095; | |
261 | if (nbytes) | |
262 | fs->flush_range(h, offset, nbytes); | |
263 | return rocksdb::Status::OK(); | |
264 | } | |
265 | ||
266 | protected: | |
267 | using rocksdb::WritableFile::Allocate; | |
268 | /* | |
269 | * Pre-allocate space for a file. | |
270 | */ | |
271 | rocksdb::Status Allocate(off_t offset, off_t len) { | |
272 | int r = fs->preallocate(h->file, offset, len); | |
273 | return err_to_status(r); | |
274 | } | |
275 | }; | |
276 | ||
277 | ||
278 | // Directory object represents collection of files and implements | |
279 | // filesystem operations that can be executed on directories. | |
280 | class BlueRocksDirectory : public rocksdb::Directory { | |
281 | BlueFS *fs; | |
282 | public: | |
283 | explicit BlueRocksDirectory(BlueFS *f) : fs(f) {} | |
284 | ||
285 | // Fsync directory. Can be called concurrently from multiple threads. | |
286 | rocksdb::Status Fsync() override { | |
287 | // it is sufficient to flush the log. | |
288 | fs->sync_metadata(); | |
289 | return rocksdb::Status::OK(); | |
290 | } | |
291 | }; | |
292 | ||
293 | // Identifies a locked file. | |
294 | class BlueRocksFileLock : public rocksdb::FileLock { | |
295 | public: | |
296 | BlueFS *fs; | |
297 | BlueFS::FileLock *lock; | |
298 | BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { } | |
299 | ~BlueRocksFileLock() override { | |
300 | } | |
301 | }; | |
302 | ||
303 | ||
304 | // -------------------- | |
305 | // --- BlueRocksEnv --- | |
306 | // -------------------- | |
307 | ||
308 | BlueRocksEnv::BlueRocksEnv(BlueFS *f) | |
309 | : EnvWrapper(Env::Default()), // forward most of it to POSIX | |
310 | fs(f) | |
311 | { | |
312 | ||
313 | } | |
314 | ||
315 | rocksdb::Status BlueRocksEnv::NewSequentialFile( | |
316 | const std::string& fname, | |
317 | std::unique_ptr<rocksdb::SequentialFile>* result, | |
318 | const rocksdb::EnvOptions& options) | |
319 | { | |
320 | if (fname[0] == '/') | |
321 | return target()->NewSequentialFile(fname, result, options); | |
322 | std::string dir, file; | |
323 | split(fname, &dir, &file); | |
324 | BlueFS::FileReader *h; | |
325 | int r = fs->open_for_read(dir, file, &h, false); | |
326 | if (r < 0) | |
327 | return err_to_status(r); | |
328 | result->reset(new BlueRocksSequentialFile(fs, h)); | |
329 | return rocksdb::Status::OK(); | |
330 | } | |
331 | ||
332 | rocksdb::Status BlueRocksEnv::NewRandomAccessFile( | |
333 | const std::string& fname, | |
334 | std::unique_ptr<rocksdb::RandomAccessFile>* result, | |
335 | const rocksdb::EnvOptions& options) | |
336 | { | |
337 | std::string dir, file; | |
338 | split(fname, &dir, &file); | |
339 | BlueFS::FileReader *h; | |
340 | int r = fs->open_for_read(dir, file, &h, true); | |
341 | if (r < 0) | |
342 | return err_to_status(r); | |
343 | result->reset(new BlueRocksRandomAccessFile(fs, h)); | |
344 | return rocksdb::Status::OK(); | |
345 | } | |
346 | ||
347 | rocksdb::Status BlueRocksEnv::NewWritableFile( | |
348 | const std::string& fname, | |
349 | std::unique_ptr<rocksdb::WritableFile>* result, | |
350 | const rocksdb::EnvOptions& options) | |
351 | { | |
352 | std::string dir, file; | |
353 | split(fname, &dir, &file); | |
354 | BlueFS::FileWriter *h; | |
355 | int r = fs->open_for_write(dir, file, &h, false); | |
356 | if (r < 0) | |
357 | return err_to_status(r); | |
358 | result->reset(new BlueRocksWritableFile(fs, h)); | |
359 | return rocksdb::Status::OK(); | |
360 | } | |
361 | ||
362 | rocksdb::Status BlueRocksEnv::ReuseWritableFile( | |
363 | const std::string& new_fname, | |
364 | const std::string& old_fname, | |
365 | std::unique_ptr<rocksdb::WritableFile>* result, | |
366 | const rocksdb::EnvOptions& options) | |
367 | { | |
368 | std::string old_dir, old_file; | |
369 | split(old_fname, &old_dir, &old_file); | |
370 | std::string new_dir, new_file; | |
371 | split(new_fname, &new_dir, &new_file); | |
372 | ||
373 | int r = fs->rename(old_dir, old_file, new_dir, new_file); | |
374 | if (r < 0) | |
375 | return err_to_status(r); | |
376 | ||
377 | BlueFS::FileWriter *h; | |
378 | r = fs->open_for_write(new_dir, new_file, &h, true); | |
379 | if (r < 0) | |
380 | return err_to_status(r); | |
381 | result->reset(new BlueRocksWritableFile(fs, h)); | |
382 | return rocksdb::Status::OK(); | |
383 | } | |
384 | ||
385 | rocksdb::Status BlueRocksEnv::NewDirectory( | |
386 | const std::string& name, | |
387 | std::unique_ptr<rocksdb::Directory>* result) | |
388 | { | |
389 | if (!fs->dir_exists(name)) | |
390 | return rocksdb::Status::IOError(name, strerror(ENOENT)); | |
391 | result->reset(new BlueRocksDirectory(fs)); | |
392 | return rocksdb::Status::OK(); | |
393 | } | |
394 | ||
395 | rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname) | |
396 | { | |
397 | if (fname[0] == '/') | |
398 | return target()->FileExists(fname); | |
399 | std::string dir, file; | |
400 | split(fname, &dir, &file); | |
401 | if (fs->stat(dir, file, NULL, NULL) == 0) | |
402 | return rocksdb::Status::OK(); | |
403 | return err_to_status(-ENOENT); | |
404 | } | |
405 | ||
406 | rocksdb::Status BlueRocksEnv::GetChildren( | |
407 | const std::string& dir, | |
408 | std::vector<std::string>* result) | |
409 | { | |
d2e6a577 | 410 | result->clear(); |
7c673cae FG |
411 | int r = fs->readdir(dir, result); |
412 | if (r < 0) | |
413 | return rocksdb::Status::IOError(dir, strerror(ENOENT));// return err_to_status(r); | |
414 | return rocksdb::Status::OK(); | |
415 | } | |
416 | ||
417 | rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname) | |
418 | { | |
419 | std::string dir, file; | |
420 | split(fname, &dir, &file); | |
421 | int r = fs->unlink(dir, file); | |
422 | if (r < 0) | |
423 | return err_to_status(r); | |
424 | return rocksdb::Status::OK(); | |
425 | } | |
426 | ||
427 | rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname) | |
428 | { | |
429 | int r = fs->mkdir(dirname); | |
430 | if (r < 0) | |
431 | return err_to_status(r); | |
432 | return rocksdb::Status::OK(); | |
433 | } | |
434 | ||
435 | rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname) | |
436 | { | |
437 | int r = fs->mkdir(dirname); | |
438 | if (r < 0 && r != -EEXIST) | |
439 | return err_to_status(r); | |
440 | return rocksdb::Status::OK(); | |
441 | } | |
442 | ||
443 | rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname) | |
444 | { | |
445 | int r = fs->rmdir(dirname); | |
446 | if (r < 0) | |
447 | return err_to_status(r); | |
448 | return rocksdb::Status::OK(); | |
449 | } | |
450 | ||
451 | rocksdb::Status BlueRocksEnv::GetFileSize( | |
452 | const std::string& fname, | |
453 | uint64_t* file_size) | |
454 | { | |
455 | std::string dir, file; | |
456 | split(fname, &dir, &file); | |
457 | int r = fs->stat(dir, file, file_size, NULL); | |
458 | if (r < 0) | |
459 | return err_to_status(r); | |
460 | return rocksdb::Status::OK(); | |
461 | } | |
462 | ||
463 | rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname, | |
464 | uint64_t* file_mtime) | |
465 | { | |
466 | std::string dir, file; | |
467 | split(fname, &dir, &file); | |
468 | utime_t mtime; | |
469 | int r = fs->stat(dir, file, NULL, &mtime); | |
470 | if (r < 0) | |
471 | return err_to_status(r); | |
472 | *file_mtime = mtime.sec(); | |
473 | return rocksdb::Status::OK(); | |
474 | } | |
475 | ||
476 | rocksdb::Status BlueRocksEnv::RenameFile( | |
477 | const std::string& src, | |
478 | const std::string& target) | |
479 | { | |
480 | std::string old_dir, old_file; | |
481 | split(src, &old_dir, &old_file); | |
482 | std::string new_dir, new_file; | |
483 | split(target, &new_dir, &new_file); | |
484 | ||
485 | int r = fs->rename(old_dir, old_file, new_dir, new_file); | |
486 | if (r < 0) | |
487 | return err_to_status(r); | |
488 | return rocksdb::Status::OK(); | |
489 | } | |
490 | ||
491 | rocksdb::Status BlueRocksEnv::LinkFile( | |
492 | const std::string& src, | |
493 | const std::string& target) | |
494 | { | |
495 | ceph_abort(); | |
496 | } | |
497 | ||
498 | rocksdb::Status BlueRocksEnv::LockFile( | |
499 | const std::string& fname, | |
500 | rocksdb::FileLock** lock) | |
501 | { | |
502 | std::string dir, file; | |
503 | split(fname, &dir, &file); | |
504 | BlueFS::FileLock *l = NULL; | |
505 | int r = fs->lock_file(dir, file, &l); | |
506 | if (r < 0) | |
507 | return err_to_status(r); | |
508 | *lock = new BlueRocksFileLock(fs, l); | |
509 | return rocksdb::Status::OK(); | |
510 | } | |
511 | ||
512 | rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock) | |
513 | { | |
514 | BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock); | |
515 | int r = fs->unlock_file(l->lock); | |
516 | if (r < 0) | |
517 | return err_to_status(r); | |
518 | delete lock; | |
519 | return rocksdb::Status::OK(); | |
520 | } | |
521 | ||
522 | rocksdb::Status BlueRocksEnv::GetAbsolutePath( | |
523 | const std::string& db_path, | |
524 | std::string* output_path) | |
525 | { | |
526 | // this is a lie... | |
527 | *output_path = "/" + db_path; | |
528 | return rocksdb::Status::OK(); | |
529 | } | |
530 | ||
531 | rocksdb::Status BlueRocksEnv::NewLogger( | |
532 | const std::string& fname, | |
533 | std::shared_ptr<rocksdb::Logger>* result) | |
534 | { | |
535 | // ignore the filename :) | |
536 | result->reset(create_rocksdb_ceph_logger()); | |
537 | return rocksdb::Status::OK(); | |
538 | } | |
539 | ||
540 | rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path) | |
541 | { | |
542 | static int foo = 0; | |
543 | *path = "temp_" + stringify(++foo); | |
544 | return rocksdb::Status::OK(); | |
545 | } |