]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "BlueRocksEnv.h" | |
5 | #include "BlueFS.h" | |
6 | #include "include/stringify.h" | |
7 | #include "kv/RocksDBStore.h" | |
8 | ||
9 | rocksdb::Status err_to_status(int r) | |
10 | { | |
11 | switch (r) { | |
12 | case 0: | |
13 | return rocksdb::Status::OK(); | |
14 | case -ENOENT: | |
15 | return rocksdb::Status::NotFound(rocksdb::Status::kNone); | |
16 | case -EINVAL: | |
17 | return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone); | |
18 | case -EIO: | |
19 | return rocksdb::Status::IOError(rocksdb::Status::kNone); | |
20 | default: | |
21 | // FIXME :( | |
22 | assert(0 == "unrecognized error code"); | |
23 | return rocksdb::Status::NotSupported(rocksdb::Status::kNone); | |
24 | } | |
25 | } | |
26 | ||
27 | // A file abstraction for reading sequentially through a file | |
28 | class BlueRocksSequentialFile : public rocksdb::SequentialFile { | |
29 | BlueFS *fs; | |
30 | BlueFS::FileReader *h; | |
31 | public: | |
32 | BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {} | |
33 | ~BlueRocksSequentialFile() override { | |
34 | delete h; | |
35 | } | |
36 | ||
37 | // Read up to "n" bytes from the file. "scratch[0..n-1]" may be | |
38 | // written by this routine. Sets "*result" to the data that was | |
39 | // read (including if fewer than "n" bytes were successfully read). | |
40 | // May set "*result" to point at data in "scratch[0..n-1]", so | |
41 | // "scratch[0..n-1]" must be live when "*result" is used. | |
42 | // If an error was encountered, returns a non-OK status. | |
43 | // | |
44 | // REQUIRES: External synchronization | |
45 | rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override { | |
46 | int r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch); | |
47 | assert(r >= 0); | |
48 | *result = rocksdb::Slice(scratch, r); | |
49 | return rocksdb::Status::OK(); | |
50 | } | |
51 | ||
52 | // Skip "n" bytes from the file. This is guaranteed to be no | |
53 | // slower that reading the same data, but may be faster. | |
54 | // | |
55 | // If end of file is reached, skipping will stop at the end of the | |
56 | // file, and Skip will return OK. | |
57 | // | |
58 | // REQUIRES: External synchronization | |
59 | rocksdb::Status Skip(uint64_t n) override { | |
60 | h->buf.skip(n); | |
61 | return rocksdb::Status::OK(); | |
62 | } | |
63 | ||
64 | // Remove any kind of caching of data from the offset to offset+length | |
65 | // of this file. If the length is 0, then it refers to the end of file. | |
66 | // If the system is not caching the file contents, then this is a noop. | |
67 | rocksdb::Status InvalidateCache(size_t offset, size_t length) override { | |
68 | fs->invalidate_cache(h->file, offset, length); | |
69 | return rocksdb::Status::OK(); | |
70 | } | |
71 | }; | |
72 | ||
73 | // A file abstraction for randomly reading the contents of a file. | |
74 | class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile { | |
75 | BlueFS *fs; | |
76 | BlueFS::FileReader *h; | |
77 | public: | |
78 | BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {} | |
79 | ~BlueRocksRandomAccessFile() override { | |
80 | delete h; | |
81 | } | |
82 | ||
83 | // Read up to "n" bytes from the file starting at "offset". | |
84 | // "scratch[0..n-1]" may be written by this routine. Sets "*result" | |
85 | // to the data that was read (including if fewer than "n" bytes were | |
86 | // successfully read). May set "*result" to point at data in | |
87 | // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when | |
88 | // "*result" is used. If an error was encountered, returns a non-OK | |
89 | // status. | |
90 | // | |
91 | // Safe for concurrent use by multiple threads. | |
92 | rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result, | |
93 | char* scratch) const override { | |
94 | int r = fs->read_random(h, offset, n, scratch); | |
95 | assert(r >= 0); | |
96 | *result = rocksdb::Slice(scratch, r); | |
97 | return rocksdb::Status::OK(); | |
98 | } | |
99 | ||
100 | // Used by the file_reader_writer to decide if the ReadAhead wrapper | |
101 | // should simply forward the call and do not enact buffering or locking. | |
102 | bool ShouldForwardRawRequest() const override { | |
103 | return false; | |
104 | } | |
105 | ||
106 | // For cases when read-ahead is implemented in the platform dependent | |
107 | // layer | |
108 | void EnableReadAhead() override {} | |
109 | ||
110 | // Tries to get an unique ID for this file that will be the same each time | |
111 | // the file is opened (and will stay the same while the file is open). | |
112 | // Furthermore, it tries to make this ID at most "max_size" bytes. If such an | |
113 | // ID can be created this function returns the length of the ID and places it | |
114 | // in "id"; otherwise, this function returns 0, in which case "id" | |
115 | // may not have been modified. | |
116 | // | |
117 | // This function guarantees, for IDs from a given environment, two unique ids | |
118 | // cannot be made equal to eachother by adding arbitrary bytes to one of | |
119 | // them. That is, no unique ID is the prefix of another. | |
120 | // | |
121 | // This function guarantees that the returned ID will not be interpretable as | |
122 | // a single varint. | |
123 | // | |
124 | // Note: these IDs are only valid for the duration of the process. | |
125 | size_t GetUniqueId(char* id, size_t max_size) const override { | |
126 | return snprintf(id, max_size, "%016llx", | |
127 | (unsigned long long)h->file->fnode.ino); | |
128 | }; | |
129 | ||
130 | //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; | |
131 | ||
132 | void Hint(AccessPattern pattern) override { | |
133 | if (pattern == RANDOM) | |
134 | h->buf.max_prefetch = 4096; | |
135 | else if (pattern == SEQUENTIAL) | |
136 | h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch; | |
137 | } | |
138 | ||
139 | // Remove any kind of caching of data from the offset to offset+length | |
140 | // of this file. If the length is 0, then it refers to the end of file. | |
141 | // If the system is not caching the file contents, then this is a noop. | |
142 | rocksdb::Status InvalidateCache(size_t offset, size_t length) override { | |
143 | fs->invalidate_cache(h->file, offset, length); | |
144 | return rocksdb::Status::OK(); | |
145 | } | |
146 | }; | |
147 | ||
148 | ||
149 | // A file abstraction for sequential writing. The implementation | |
150 | // must provide buffering since callers may append small fragments | |
151 | // at a time to the file. | |
152 | class BlueRocksWritableFile : public rocksdb::WritableFile { | |
153 | BlueFS *fs; | |
154 | BlueFS::FileWriter *h; | |
155 | public: | |
156 | BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {} | |
157 | ~BlueRocksWritableFile() override { | |
158 | fs->close_writer(h); | |
159 | } | |
160 | ||
161 | // Indicates if the class makes use of unbuffered I/O | |
162 | /*bool UseOSBuffer() const { | |
163 | return true; | |
164 | }*/ | |
165 | ||
166 | // This is needed when you want to allocate | |
167 | // AlignedBuffer for use with file I/O classes | |
168 | // Used for unbuffered file I/O when UseOSBuffer() returns false | |
169 | /*size_t GetRequiredBufferAlignment() const { | |
170 | return c_DefaultPageSize; | |
171 | }*/ | |
172 | ||
173 | rocksdb::Status Append(const rocksdb::Slice& data) override { | |
174 | h->append(data.data(), data.size()); | |
175 | return rocksdb::Status::OK(); | |
176 | } | |
177 | ||
178 | // Positioned write for unbuffered access default forward | |
179 | // to simple append as most of the tests are buffered by default | |
180 | rocksdb::Status PositionedAppend( | |
181 | const rocksdb::Slice& /* data */, | |
182 | uint64_t /* offset */) override { | |
183 | return rocksdb::Status::NotSupported(); | |
184 | } | |
185 | ||
186 | // Truncate is necessary to trim the file to the correct size | |
187 | // before closing. It is not always possible to keep track of the file | |
188 | // size due to whole pages writes. The behavior is undefined if called | |
189 | // with other writes to follow. | |
190 | rocksdb::Status Truncate(uint64_t size) override { | |
191 | // we mirror the posix env, which does nothing here; instead, it | |
192 | // truncates to the final size on close. whatever! | |
193 | return rocksdb::Status::OK(); | |
194 | //int r = fs->truncate(h, size); | |
195 | // return err_to_status(r); | |
196 | } | |
197 | ||
198 | rocksdb::Status Close() override { | |
199 | Flush(); | |
200 | ||
201 | // mimic posix env, here. shrug. | |
202 | size_t block_size; | |
203 | size_t last_allocated_block; | |
204 | GetPreallocationStatus(&block_size, &last_allocated_block); | |
205 | if (last_allocated_block > 0) { | |
206 | int r = fs->truncate(h, h->pos); | |
207 | if (r < 0) | |
208 | return err_to_status(r); | |
209 | } | |
210 | ||
211 | return rocksdb::Status::OK(); | |
212 | } | |
213 | ||
214 | rocksdb::Status Flush() override { | |
215 | fs->flush(h); | |
216 | return rocksdb::Status::OK(); | |
217 | } | |
218 | ||
219 | rocksdb::Status Sync() override { // sync data | |
220 | fs->fsync(h); | |
221 | return rocksdb::Status::OK(); | |
222 | } | |
223 | ||
224 | // true if Sync() and Fsync() are safe to call concurrently with Append() | |
225 | // and Flush(). | |
226 | bool IsSyncThreadSafe() const override { | |
227 | return true; | |
228 | } | |
229 | ||
230 | // Indicates the upper layers if the current WritableFile implementation | |
231 | // uses direct IO. | |
232 | bool UseDirectIO() const { | |
233 | return false; | |
234 | } | |
235 | ||
236 | /* | |
237 | * Get the size of valid data in the file. | |
238 | */ | |
239 | uint64_t GetFileSize() override { | |
240 | return h->file->fnode.size + h->buffer.length();; | |
241 | } | |
242 | ||
243 | // For documentation, refer to RandomAccessFile::GetUniqueId() | |
244 | size_t GetUniqueId(char* id, size_t max_size) const override { | |
245 | return snprintf(id, max_size, "%016llx", | |
246 | (unsigned long long)h->file->fnode.ino); | |
247 | } | |
248 | ||
249 | // Remove any kind of caching of data from the offset to offset+length | |
250 | // of this file. If the length is 0, then it refers to the end of file. | |
251 | // If the system is not caching the file contents, then this is a noop. | |
252 | // This call has no effect on dirty pages in the cache. | |
253 | rocksdb::Status InvalidateCache(size_t offset, size_t length) override { | |
254 | fs->invalidate_cache(h->file, offset, length); | |
255 | return rocksdb::Status::OK(); | |
256 | } | |
257 | ||
258 | using rocksdb::WritableFile::RangeSync; | |
259 | // Sync a file range with disk. | |
260 | // offset is the starting byte of the file range to be synchronized. | |
261 | // nbytes specifies the length of the range to be synchronized. | |
262 | // This asks the OS to initiate flushing the cached data to disk, | |
263 | // without waiting for completion. | |
264 | // Default implementation does nothing. | |
265 | rocksdb::Status RangeSync(off_t offset, off_t nbytes) { | |
266 | // round down to page boundaries | |
267 | int partial = offset & 4095; | |
268 | offset -= partial; | |
269 | nbytes += partial; | |
270 | nbytes &= ~4095; | |
271 | if (nbytes) | |
272 | fs->flush_range(h, offset, nbytes); | |
273 | return rocksdb::Status::OK(); | |
274 | } | |
275 | ||
276 | protected: | |
277 | using rocksdb::WritableFile::Allocate; | |
278 | /* | |
279 | * Pre-allocate space for a file. | |
280 | */ | |
281 | rocksdb::Status Allocate(off_t offset, off_t len) { | |
282 | int r = fs->preallocate(h->file, offset, len); | |
283 | return err_to_status(r); | |
284 | } | |
285 | }; | |
286 | ||
287 | ||
288 | // Directory object represents collection of files and implements | |
289 | // filesystem operations that can be executed on directories. | |
290 | class BlueRocksDirectory : public rocksdb::Directory { | |
291 | BlueFS *fs; | |
292 | public: | |
293 | explicit BlueRocksDirectory(BlueFS *f) : fs(f) {} | |
294 | ||
295 | // Fsync directory. Can be called concurrently from multiple threads. | |
296 | rocksdb::Status Fsync() override { | |
297 | // it is sufficient to flush the log. | |
298 | fs->sync_metadata(); | |
299 | return rocksdb::Status::OK(); | |
300 | } | |
301 | }; | |
302 | ||
303 | // Identifies a locked file. | |
304 | class BlueRocksFileLock : public rocksdb::FileLock { | |
305 | public: | |
306 | BlueFS *fs; | |
307 | BlueFS::FileLock *lock; | |
308 | BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { } | |
309 | ~BlueRocksFileLock() override { | |
310 | } | |
311 | }; | |
312 | ||
313 | ||
314 | // -------------------- | |
315 | // --- BlueRocksEnv --- | |
316 | // -------------------- | |
317 | ||
318 | BlueRocksEnv::BlueRocksEnv(BlueFS *f) | |
319 | : EnvWrapper(Env::Default()), // forward most of it to POSIX | |
320 | fs(f) | |
321 | { | |
322 | ||
323 | } | |
324 | ||
325 | rocksdb::Status BlueRocksEnv::NewSequentialFile( | |
326 | const std::string& fname, | |
327 | std::unique_ptr<rocksdb::SequentialFile>* result, | |
328 | const rocksdb::EnvOptions& options) | |
329 | { | |
330 | if (fname[0] == '/') | |
331 | return target()->NewSequentialFile(fname, result, options); | |
332 | std::string dir, file; | |
333 | split(fname, &dir, &file); | |
334 | BlueFS::FileReader *h; | |
335 | int r = fs->open_for_read(dir, file, &h, false); | |
336 | if (r < 0) | |
337 | return err_to_status(r); | |
338 | result->reset(new BlueRocksSequentialFile(fs, h)); | |
339 | return rocksdb::Status::OK(); | |
340 | } | |
341 | ||
342 | rocksdb::Status BlueRocksEnv::NewRandomAccessFile( | |
343 | const std::string& fname, | |
344 | std::unique_ptr<rocksdb::RandomAccessFile>* result, | |
345 | const rocksdb::EnvOptions& options) | |
346 | { | |
347 | std::string dir, file; | |
348 | split(fname, &dir, &file); | |
349 | BlueFS::FileReader *h; | |
350 | int r = fs->open_for_read(dir, file, &h, true); | |
351 | if (r < 0) | |
352 | return err_to_status(r); | |
353 | result->reset(new BlueRocksRandomAccessFile(fs, h)); | |
354 | return rocksdb::Status::OK(); | |
355 | } | |
356 | ||
357 | rocksdb::Status BlueRocksEnv::NewWritableFile( | |
358 | const std::string& fname, | |
359 | std::unique_ptr<rocksdb::WritableFile>* result, | |
360 | const rocksdb::EnvOptions& options) | |
361 | { | |
362 | std::string dir, file; | |
363 | split(fname, &dir, &file); | |
364 | BlueFS::FileWriter *h; | |
365 | int r = fs->open_for_write(dir, file, &h, false); | |
366 | if (r < 0) | |
367 | return err_to_status(r); | |
368 | result->reset(new BlueRocksWritableFile(fs, h)); | |
369 | return rocksdb::Status::OK(); | |
370 | } | |
371 | ||
372 | rocksdb::Status BlueRocksEnv::ReuseWritableFile( | |
373 | const std::string& new_fname, | |
374 | const std::string& old_fname, | |
375 | std::unique_ptr<rocksdb::WritableFile>* result, | |
376 | const rocksdb::EnvOptions& options) | |
377 | { | |
378 | std::string old_dir, old_file; | |
379 | split(old_fname, &old_dir, &old_file); | |
380 | std::string new_dir, new_file; | |
381 | split(new_fname, &new_dir, &new_file); | |
382 | ||
383 | int r = fs->rename(old_dir, old_file, new_dir, new_file); | |
384 | if (r < 0) | |
385 | return err_to_status(r); | |
386 | ||
387 | BlueFS::FileWriter *h; | |
388 | r = fs->open_for_write(new_dir, new_file, &h, true); | |
389 | if (r < 0) | |
390 | return err_to_status(r); | |
391 | result->reset(new BlueRocksWritableFile(fs, h)); | |
392 | return rocksdb::Status::OK(); | |
393 | } | |
394 | ||
395 | rocksdb::Status BlueRocksEnv::NewDirectory( | |
396 | const std::string& name, | |
397 | std::unique_ptr<rocksdb::Directory>* result) | |
398 | { | |
399 | if (!fs->dir_exists(name)) | |
400 | return rocksdb::Status::IOError(name, strerror(ENOENT)); | |
401 | result->reset(new BlueRocksDirectory(fs)); | |
402 | return rocksdb::Status::OK(); | |
403 | } | |
404 | ||
405 | rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname) | |
406 | { | |
407 | if (fname[0] == '/') | |
408 | return target()->FileExists(fname); | |
409 | std::string dir, file; | |
410 | split(fname, &dir, &file); | |
411 | if (fs->stat(dir, file, NULL, NULL) == 0) | |
412 | return rocksdb::Status::OK(); | |
413 | return err_to_status(-ENOENT); | |
414 | } | |
415 | ||
416 | rocksdb::Status BlueRocksEnv::GetChildren( | |
417 | const std::string& dir, | |
418 | std::vector<std::string>* result) | |
419 | { | |
420 | int r = fs->readdir(dir, result); | |
421 | if (r < 0) | |
422 | return rocksdb::Status::IOError(dir, strerror(ENOENT));// return err_to_status(r); | |
423 | return rocksdb::Status::OK(); | |
424 | } | |
425 | ||
426 | rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname) | |
427 | { | |
428 | std::string dir, file; | |
429 | split(fname, &dir, &file); | |
430 | int r = fs->unlink(dir, file); | |
431 | if (r < 0) | |
432 | return err_to_status(r); | |
433 | return rocksdb::Status::OK(); | |
434 | } | |
435 | ||
436 | rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname) | |
437 | { | |
438 | int r = fs->mkdir(dirname); | |
439 | if (r < 0) | |
440 | return err_to_status(r); | |
441 | return rocksdb::Status::OK(); | |
442 | } | |
443 | ||
444 | rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname) | |
445 | { | |
446 | int r = fs->mkdir(dirname); | |
447 | if (r < 0 && r != -EEXIST) | |
448 | return err_to_status(r); | |
449 | return rocksdb::Status::OK(); | |
450 | } | |
451 | ||
452 | rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname) | |
453 | { | |
454 | int r = fs->rmdir(dirname); | |
455 | if (r < 0) | |
456 | return err_to_status(r); | |
457 | return rocksdb::Status::OK(); | |
458 | } | |
459 | ||
460 | rocksdb::Status BlueRocksEnv::GetFileSize( | |
461 | const std::string& fname, | |
462 | uint64_t* file_size) | |
463 | { | |
464 | std::string dir, file; | |
465 | split(fname, &dir, &file); | |
466 | int r = fs->stat(dir, file, file_size, NULL); | |
467 | if (r < 0) | |
468 | return err_to_status(r); | |
469 | return rocksdb::Status::OK(); | |
470 | } | |
471 | ||
472 | rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname, | |
473 | uint64_t* file_mtime) | |
474 | { | |
475 | std::string dir, file; | |
476 | split(fname, &dir, &file); | |
477 | utime_t mtime; | |
478 | int r = fs->stat(dir, file, NULL, &mtime); | |
479 | if (r < 0) | |
480 | return err_to_status(r); | |
481 | *file_mtime = mtime.sec(); | |
482 | return rocksdb::Status::OK(); | |
483 | } | |
484 | ||
485 | rocksdb::Status BlueRocksEnv::RenameFile( | |
486 | const std::string& src, | |
487 | const std::string& target) | |
488 | { | |
489 | std::string old_dir, old_file; | |
490 | split(src, &old_dir, &old_file); | |
491 | std::string new_dir, new_file; | |
492 | split(target, &new_dir, &new_file); | |
493 | ||
494 | int r = fs->rename(old_dir, old_file, new_dir, new_file); | |
495 | if (r < 0) | |
496 | return err_to_status(r); | |
497 | return rocksdb::Status::OK(); | |
498 | } | |
499 | ||
500 | rocksdb::Status BlueRocksEnv::LinkFile( | |
501 | const std::string& src, | |
502 | const std::string& target) | |
503 | { | |
504 | ceph_abort(); | |
505 | } | |
506 | ||
507 | rocksdb::Status BlueRocksEnv::LockFile( | |
508 | const std::string& fname, | |
509 | rocksdb::FileLock** lock) | |
510 | { | |
511 | std::string dir, file; | |
512 | split(fname, &dir, &file); | |
513 | BlueFS::FileLock *l = NULL; | |
514 | int r = fs->lock_file(dir, file, &l); | |
515 | if (r < 0) | |
516 | return err_to_status(r); | |
517 | *lock = new BlueRocksFileLock(fs, l); | |
518 | return rocksdb::Status::OK(); | |
519 | } | |
520 | ||
521 | rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock) | |
522 | { | |
523 | BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock); | |
524 | int r = fs->unlock_file(l->lock); | |
525 | if (r < 0) | |
526 | return err_to_status(r); | |
527 | delete lock; | |
528 | return rocksdb::Status::OK(); | |
529 | } | |
530 | ||
531 | rocksdb::Status BlueRocksEnv::GetAbsolutePath( | |
532 | const std::string& db_path, | |
533 | std::string* output_path) | |
534 | { | |
535 | // this is a lie... | |
536 | *output_path = "/" + db_path; | |
537 | return rocksdb::Status::OK(); | |
538 | } | |
539 | ||
540 | rocksdb::Status BlueRocksEnv::NewLogger( | |
541 | const std::string& fname, | |
542 | std::shared_ptr<rocksdb::Logger>* result) | |
543 | { | |
544 | // ignore the filename :) | |
545 | result->reset(create_rocksdb_ceph_logger()); | |
546 | return rocksdb::Status::OK(); | |
547 | } | |
548 | ||
549 | rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path) | |
550 | { | |
551 | static int foo = 0; | |
552 | *path = "temp_" + stringify(++foo); | |
553 | return rocksdb::Status::OK(); | |
554 | } |