]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/include/rocksdb/env.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / rocksdb / include / rocksdb / env.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
5 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE file. See the AUTHORS file for names of contributors.
8 //
9 // An Env is an interface used by the rocksdb implementation to access
10 // operating system functionality like the filesystem etc. Callers
11 // may wish to provide a custom Env object when opening a database to
12 // get fine-grained control; e.g., to rate limit file system operations.
13 //
14 // All Env implementations are safe for concurrent access from
15 // multiple threads without any external synchronization.
16
17 #ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
18 #define STORAGE_ROCKSDB_INCLUDE_ENV_H_
19
20 #include <stdint.h>
21 #include <cstdarg>
22 #include <functional>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <vector>
27 #include "rocksdb/status.h"
28 #include "rocksdb/thread_status.h"
29
30 #ifdef _WIN32
31 // Windows API macro interference
32 #undef DeleteFile
33 #undef GetCurrentTime
34 #endif
35
36 namespace rocksdb {
37
// Forward declarations of types that this header only refers to through
// pointers, references or smart pointers.
class Directory;
class FileLock;
class Logger;
class RandomAccessFile;
class RandomRWFile;
class RateLimiter;
class SequentialFile;
class Slice;
class ThreadStatusUpdater;
class WritableFile;
struct DBOptions;
struct ImmutableDBOptions;
struct ThreadStatus;

// The interfaces below spell these smart pointers without the std:: prefix.
using std::shared_ptr;
using std::unique_ptr;

// Value (4 KiB) returned by the default GetRequiredBufferAlignment()
// implementations of the file classes declared in this header.
const size_t kDefaultPageSize = 4 * 1024;
56
57 // Options while opening a file to read/write
58 struct EnvOptions {
59
60 // construct with default Options
61 EnvOptions();
62
63 // construct from Options
64 explicit EnvOptions(const DBOptions& options);
65
66 // If true, then use mmap to read data
67 bool use_mmap_reads = false;
68
69 // If true, then use mmap to write data
70 bool use_mmap_writes = true;
71
72 // If true, then use O_DIRECT for reading data
73 bool use_direct_reads = false;
74
75 // If true, then use O_DIRECT for writing data
76 bool use_direct_writes = false;
77
78 // If false, fallocate() calls are bypassed
79 bool allow_fallocate = true;
80
81 // If true, set the FD_CLOEXEC on open fd.
82 bool set_fd_cloexec = true;
83
84 // Allows OS to incrementally sync files to disk while they are being
85 // written, in the background. Issue one request for every bytes_per_sync
86 // written. 0 turns it off.
87 // Default: 0
88 uint64_t bytes_per_sync = 0;
89
90 // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
91 // means that file size won't change as part of preallocation.
92 // If false, preallocation will also change the file size. This option will
93 // improve the performance in workloads where you sync the data on every
94 // write. By default, we set it to true for MANIFEST writes and false for
95 // WAL writes
96 bool fallocate_with_keep_size = true;
97
98 // See DBOPtions doc
99 size_t compaction_readahead_size;
100
101 // See DBOPtions doc
102 size_t random_access_max_buffer_size;
103
104 // See DBOptions doc
105 size_t writable_file_max_buffer_size = 1024 * 1024;
106
107 // If not nullptr, write rate limiting is enabled for flush and compaction
108 RateLimiter* rate_limiter = nullptr;
109 };
110
111 class Env {
112 public:
113 struct FileAttributes {
114 // File name
115 std::string name;
116
117 // Size of file in bytes
118 uint64_t size_bytes;
119 };
120
121 Env() : thread_status_updater_(nullptr) {}
122
123 virtual ~Env();
124
125 // Return a default environment suitable for the current operating
126 // system. Sophisticated users may wish to provide their own Env
127 // implementation instead of relying on this default environment.
128 //
129 // The result of Default() belongs to rocksdb and must never be deleted.
130 static Env* Default();
131
132 // Create a brand new sequentially-readable file with the specified name.
133 // On success, stores a pointer to the new file in *result and returns OK.
134 // On failure stores nullptr in *result and returns non-OK. If the file does
135 // not exist, returns a non-OK status.
136 //
137 // The returned file will only be accessed by one thread at a time.
138 virtual Status NewSequentialFile(const std::string& fname,
139 unique_ptr<SequentialFile>* result,
140 const EnvOptions& options)
141 = 0;
142
143 // Create a brand new random access read-only file with the
144 // specified name. On success, stores a pointer to the new file in
145 // *result and returns OK. On failure stores nullptr in *result and
146 // returns non-OK. If the file does not exist, returns a non-OK
147 // status.
148 //
149 // The returned file may be concurrently accessed by multiple threads.
150 virtual Status NewRandomAccessFile(const std::string& fname,
151 unique_ptr<RandomAccessFile>* result,
152 const EnvOptions& options)
153 = 0;
154
155 // Create an object that writes to a new file with the specified
156 // name. Deletes any existing file with the same name and creates a
157 // new file. On success, stores a pointer to the new file in
158 // *result and returns OK. On failure stores nullptr in *result and
159 // returns non-OK.
160 //
161 // The returned file will only be accessed by one thread at a time.
162 virtual Status NewWritableFile(const std::string& fname,
163 unique_ptr<WritableFile>* result,
164 const EnvOptions& options) = 0;
165
166 // Create an object that writes to a new file with the specified
167 // name. Deletes any existing file with the same name and creates a
168 // new file. On success, stores a pointer to the new file in
169 // *result and returns OK. On failure stores nullptr in *result and
170 // returns non-OK.
171 //
172 // The returned file will only be accessed by one thread at a time.
173 virtual Status ReopenWritableFile(const std::string& fname,
174 unique_ptr<WritableFile>* result,
175 const EnvOptions& options) {
176 Status s;
177 return s;
178 }
179
180 // Reuse an existing file by renaming it and opening it as writable.
181 virtual Status ReuseWritableFile(const std::string& fname,
182 const std::string& old_fname,
183 unique_ptr<WritableFile>* result,
184 const EnvOptions& options);
185
186 // Open `fname` for random read and write, if file dont exist the file
187 // will be created. On success, stores a pointer to the new file in
188 // *result and returns OK. On failure returns non-OK.
189 //
190 // The returned file will only be accessed by one thread at a time.
191 virtual Status NewRandomRWFile(const std::string& fname,
192 unique_ptr<RandomRWFile>* result,
193 const EnvOptions& options) {
194 return Status::NotSupported("RandomRWFile is not implemented in this Env");
195 }
196
197 // Create an object that represents a directory. Will fail if directory
198 // doesn't exist. If the directory exists, it will open the directory
199 // and create a new Directory object.
200 //
201 // On success, stores a pointer to the new Directory in
202 // *result and returns OK. On failure stores nullptr in *result and
203 // returns non-OK.
204 virtual Status NewDirectory(const std::string& name,
205 unique_ptr<Directory>* result) = 0;
206
207 // Returns OK if the named file exists.
208 // NotFound if the named file does not exist,
209 // the calling process does not have permission to determine
210 // whether this file exists, or if the path is invalid.
211 // IOError if an IO Error was encountered
212 virtual Status FileExists(const std::string& fname) = 0;
213
214 // Store in *result the names of the children of the specified directory.
215 // The names are relative to "dir".
216 // Original contents of *results are dropped.
217 // Returns OK if "dir" exists and "*result" contains its children.
218 // NotFound if "dir" does not exist, the calling process does not have
219 // permission to access "dir", or if "dir" is invalid.
220 // IOError if an IO Error was encountered
221 virtual Status GetChildren(const std::string& dir,
222 std::vector<std::string>* result) = 0;
223
224 // Store in *result the attributes of the children of the specified directory.
225 // In case the implementation lists the directory prior to iterating the files
226 // and files are concurrently deleted, the deleted files will be omitted from
227 // result.
228 // The name attributes are relative to "dir".
229 // Original contents of *results are dropped.
230 // Returns OK if "dir" exists and "*result" contains its children.
231 // NotFound if "dir" does not exist, the calling process does not have
232 // permission to access "dir", or if "dir" is invalid.
233 // IOError if an IO Error was encountered
234 virtual Status GetChildrenFileAttributes(const std::string& dir,
235 std::vector<FileAttributes>* result);
236
237 // Delete the named file.
238 virtual Status DeleteFile(const std::string& fname) = 0;
239
240 // Create the specified directory. Returns error if directory exists.
241 virtual Status CreateDir(const std::string& dirname) = 0;
242
243 // Creates directory if missing. Return Ok if it exists, or successful in
244 // Creating.
245 virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
246
247 // Delete the specified directory.
248 virtual Status DeleteDir(const std::string& dirname) = 0;
249
250 // Store the size of fname in *file_size.
251 virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
252
253 // Store the last modification time of fname in *file_mtime.
254 virtual Status GetFileModificationTime(const std::string& fname,
255 uint64_t* file_mtime) = 0;
256 // Rename file src to target.
257 virtual Status RenameFile(const std::string& src,
258 const std::string& target) = 0;
259
260 // Hard Link file src to target.
261 virtual Status LinkFile(const std::string& src, const std::string& target) {
262 return Status::NotSupported("LinkFile is not supported for this Env");
263 }
264
265 // Lock the specified file. Used to prevent concurrent access to
266 // the same db by multiple processes. On failure, stores nullptr in
267 // *lock and returns non-OK.
268 //
269 // On success, stores a pointer to the object that represents the
270 // acquired lock in *lock and returns OK. The caller should call
271 // UnlockFile(*lock) to release the lock. If the process exits,
272 // the lock will be automatically released.
273 //
274 // If somebody else already holds the lock, finishes immediately
275 // with a failure. I.e., this call does not wait for existing locks
276 // to go away.
277 //
278 // May create the named file if it does not already exist.
279 virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
280
281 // Release the lock acquired by a previous successful call to LockFile.
282 // REQUIRES: lock was returned by a successful LockFile() call
283 // REQUIRES: lock has not already been unlocked.
284 virtual Status UnlockFile(FileLock* lock) = 0;
285
286 // Priority for scheduling job in thread pool
287 enum Priority { LOW, HIGH, TOTAL };
288
289 // Priority for requesting bytes in rate limiter scheduler
290 enum IOPriority {
291 IO_LOW = 0,
292 IO_HIGH = 1,
293 IO_TOTAL = 2
294 };
295
296 // Arrange to run "(*function)(arg)" once in a background thread, in
297 // the thread pool specified by pri. By default, jobs go to the 'LOW'
298 // priority thread pool.
299
300 // "function" may run in an unspecified thread. Multiple functions
301 // added to the same Env may run concurrently in different threads.
302 // I.e., the caller may not assume that background work items are
303 // serialized.
304 // When the UnSchedule function is called, the unschedFunction
305 // registered at the time of Schedule is invoked with arg as a parameter.
306 virtual void Schedule(void (*function)(void* arg), void* arg,
307 Priority pri = LOW, void* tag = nullptr,
308 void (*unschedFunction)(void* arg) = 0) = 0;
309
310 // Arrange to remove jobs for given arg from the queue_ if they are not
311 // already scheduled. Caller is expected to have exclusive lock on arg.
312 virtual int UnSchedule(void* arg, Priority pri) { return 0; }
313
314 // Start a new thread, invoking "function(arg)" within the new thread.
315 // When "function(arg)" returns, the thread will be destroyed.
316 virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
317
318 // Wait for all threads started by StartThread to terminate.
319 virtual void WaitForJoin() {}
320
321 // Get thread pool queue length for specific thrad pool.
322 virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const {
323 return 0;
324 }
325
326 // *path is set to a temporary directory that can be used for testing. It may
327 // or many not have just been created. The directory may or may not differ
328 // between runs of the same process, but subsequent calls will return the
329 // same directory.
330 virtual Status GetTestDirectory(std::string* path) = 0;
331
332 // Create and return a log file for storing informational messages.
333 virtual Status NewLogger(const std::string& fname,
334 shared_ptr<Logger>* result) = 0;
335
336 // Returns the number of micro-seconds since some fixed point in time.
337 // It is often used as system time such as in GenericRateLimiter
338 // and other places so a port needs to return system time in order to work.
339 virtual uint64_t NowMicros() = 0;
340
341 // Returns the number of nano-seconds since some fixed point in time. Only
342 // useful for computing deltas of time in one run.
343 // Default implementation simply relies on NowMicros.
344 // In platform-specific implementations, NowNanos() should return time points
345 // that are MONOTONIC.
346 virtual uint64_t NowNanos() {
347 return NowMicros() * 1000;
348 }
349
350 // Sleep/delay the thread for the perscribed number of micro-seconds.
351 virtual void SleepForMicroseconds(int micros) = 0;
352
353 // Get the current host name.
354 virtual Status GetHostName(char* name, uint64_t len) = 0;
355
356 // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
357 virtual Status GetCurrentTime(int64_t* unix_time) = 0;
358
359 // Get full directory name for this db.
360 virtual Status GetAbsolutePath(const std::string& db_path,
361 std::string* output_path) = 0;
362
363 // The number of background worker threads of a specific thread pool
364 // for this environment. 'LOW' is the default pool.
365 // default number: 1
366 virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
367
368 // Enlarge number of background worker threads of a specific thread pool
369 // for this environment if it is smaller than specified. 'LOW' is the default
370 // pool.
371 virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0;
372
373 // Lower IO priority for threads from the specified pool.
374 virtual void LowerThreadPoolIOPriority(Priority pool = LOW) {}
375
376 // Converts seconds-since-Jan-01-1970 to a printable string
377 virtual std::string TimeToString(uint64_t time) = 0;
378
379 // Generates a unique id that can be used to identify a db
380 virtual std::string GenerateUniqueId();
381
382 // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
383 // the EnvOptions in the parameters, but is optimized for writing log files.
384 // Default implementation returns the copy of the same object.
385 virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
386 const DBOptions& db_options) const;
387 // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
388 // of the EnvOptions in the parameters, but is optimized for writing manifest
389 // files. Default implementation returns the copy of the same object.
390 virtual EnvOptions OptimizeForManifestWrite(
391 const EnvOptions& env_options) const;
392
393 // OptimizeForCompactionTableWrite will create a new EnvOptions object that is a copy
394 // of the EnvOptions in the parameters, but is optimized for writing table
395 // files. Default implementation returns the copy of the same object.
396 virtual EnvOptions OptimizeForCompactionTableWrite(
397 const EnvOptions& env_options,
398 const ImmutableDBOptions& db_options) const;
399
400 // OptimizeForCompactionTableWrite will create a new EnvOptions object that is a copy
401 // of the EnvOptions in the parameters, but is optimized for reading table
402 // files. Default implementation returns the copy of the same object.
403 virtual EnvOptions OptimizeForCompactionTableRead(
404 const EnvOptions& env_options,
405 const ImmutableDBOptions& db_options) const;
406
407 // Returns the status of all threads that belong to the current Env.
408 virtual Status GetThreadList(std::vector<ThreadStatus>* thread_list) {
409 return Status::NotSupported("Not supported.");
410 }
411
412 // Returns the pointer to ThreadStatusUpdater. This function will be
413 // used in RocksDB internally to update thread status and supports
414 // GetThreadList().
415 virtual ThreadStatusUpdater* GetThreadStatusUpdater() const {
416 return thread_status_updater_;
417 }
418
419 // Returns the ID of the current thread.
420 virtual uint64_t GetThreadID() const;
421
422 protected:
423 // The pointer to an internal structure that will update the
424 // status of each thread.
425 ThreadStatusUpdater* thread_status_updater_;
426
427 private:
428 // No copying allowed
429 Env(const Env&);
430 void operator=(const Env&);
431 };
432
433 // The factory function to construct a ThreadStatusUpdater. Any Env
434 // that supports GetThreadList() feature should call this function in its
435 // constructor to initialize thread_status_updater_.
436 ThreadStatusUpdater* CreateThreadStatusUpdater();
437
438 // A file abstraction for reading sequentially through a file
439 class SequentialFile {
440 public:
441 SequentialFile() { }
442 virtual ~SequentialFile();
443
444 // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
445 // written by this routine. Sets "*result" to the data that was
446 // read (including if fewer than "n" bytes were successfully read).
447 // May set "*result" to point at data in "scratch[0..n-1]", so
448 // "scratch[0..n-1]" must be live when "*result" is used.
449 // If an error was encountered, returns a non-OK status.
450 //
451 // REQUIRES: External synchronization
452 virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
453
454 // Skip "n" bytes from the file. This is guaranteed to be no
455 // slower that reading the same data, but may be faster.
456 //
457 // If end of file is reached, skipping will stop at the end of the
458 // file, and Skip will return OK.
459 //
460 // REQUIRES: External synchronization
461 virtual Status Skip(uint64_t n) = 0;
462
463 // Indicates the upper layers if the current SequentialFile implementation
464 // uses direct IO.
465 virtual bool use_direct_io() const { return false; }
466
467 // Use the returned alignment value to allocate
468 // aligned buffer for Direct I/O
469 virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
470
471 virtual void Rewind() {}
472
473 // Remove any kind of caching of data from the offset to offset+length
474 // of this file. If the length is 0, then it refers to the end of file.
475 // If the system is not caching the file contents, then this is a noop.
476 virtual Status InvalidateCache(size_t offset, size_t length) {
477 return Status::NotSupported("InvalidateCache not supported.");
478 }
479
480 // Positioned Read for direct I/O
481 // If Direct I/O enabled, offset, n, and scratch should be properly aligned
482 virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result,
483 char* scratch) {
484 return Status::NotSupported();
485 }
486 };
487
488 // A file abstraction for randomly reading the contents of a file.
489 class RandomAccessFile {
490 public:
491
492 RandomAccessFile() { }
493 virtual ~RandomAccessFile();
494
495 // Read up to "n" bytes from the file starting at "offset".
496 // "scratch[0..n-1]" may be written by this routine. Sets "*result"
497 // to the data that was read (including if fewer than "n" bytes were
498 // successfully read). May set "*result" to point at data in
499 // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
500 // "*result" is used. If an error was encountered, returns a non-OK
501 // status.
502 //
503 // Safe for concurrent use by multiple threads.
504 // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
505 virtual Status Read(uint64_t offset, size_t n, Slice* result,
506 char* scratch) const = 0;
507
508 // Readahead the file starting from offset by n bytes for caching.
509 virtual Status Prefetch(uint64_t offset, size_t n) {
510 return Status::OK();
511 }
512
513 // Used by the file_reader_writer to decide if the ReadAhead wrapper
514 // should simply forward the call and do not enact buffering or locking.
515 virtual bool ShouldForwardRawRequest() const {
516 return false;
517 }
518
519 // For cases when read-ahead is implemented in the platform dependent
520 // layer
521 virtual void EnableReadAhead() {}
522
523 // Tries to get an unique ID for this file that will be the same each time
524 // the file is opened (and will stay the same while the file is open).
525 // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
526 // ID can be created this function returns the length of the ID and places it
527 // in "id"; otherwise, this function returns 0, in which case "id"
528 // may not have been modified.
529 //
530 // This function guarantees, for IDs from a given environment, two unique ids
531 // cannot be made equal to eachother by adding arbitrary bytes to one of
532 // them. That is, no unique ID is the prefix of another.
533 //
534 // This function guarantees that the returned ID will not be interpretable as
535 // a single varint.
536 //
537 // Note: these IDs are only valid for the duration of the process.
538 virtual size_t GetUniqueId(char* id, size_t max_size) const {
539 return 0; // Default implementation to prevent issues with backwards
540 // compatibility.
541 };
542
543 enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
544
545 virtual void Hint(AccessPattern pattern) {}
546
547 // Indicates the upper layers if the current RandomAccessFile implementation
548 // uses direct IO.
549 virtual bool use_direct_io() const { return false; }
550
551 // Use the returned alignment value to allocate
552 // aligned buffer for Direct I/O
553 virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
554
555 // Remove any kind of caching of data from the offset to offset+length
556 // of this file. If the length is 0, then it refers to the end of file.
557 // If the system is not caching the file contents, then this is a noop.
558 virtual Status InvalidateCache(size_t offset, size_t length) {
559 return Status::NotSupported("InvalidateCache not supported.");
560 }
561 };
562
563 // A file abstraction for sequential writing. The implementation
564 // must provide buffering since callers may append small fragments
565 // at a time to the file.
566 class WritableFile {
567 public:
568 WritableFile()
569 : last_preallocated_block_(0),
570 preallocation_block_size_(0),
571 io_priority_(Env::IO_TOTAL) {
572 }
573 virtual ~WritableFile();
574
575 // Append data to the end of the file
576 // Note: A WriteabelFile object must support either Append or
577 // PositionedAppend, so the users cannot mix the two.
578 virtual Status Append(const Slice& data) = 0;
579
580 // PositionedAppend data to the specified offset. The new EOF after append
581 // must be larger than the previous EOF. This is to be used when writes are
582 // not backed by OS buffers and hence has to always start from the start of
583 // the sector. The implementation thus needs to also rewrite the last
584 // partial sector.
585 // Note: PositionAppend does not guarantee moving the file offset after the
586 // write. A WritableFile object must support either Append or
587 // PositionedAppend, so the users cannot mix the two.
588 //
589 // PositionedAppend() can only happen on the page/sector boundaries. For that
590 // reason, if the last write was an incomplete sector we still need to rewind
591 // back to the nearest sector/page and rewrite the portion of it with whatever
592 // we need to add. We need to keep where we stop writing.
593 //
594 // PositionedAppend() can only write whole sectors. For that reason we have to
595 // pad with zeros for the last write and trim the file when closing according
596 // to the position we keep in the previous step.
597 //
598 // PositionedAppend() requires aligned buffer to be passed in. The alignment
599 // required is queried via GetRequiredBufferAlignment()
600 virtual Status PositionedAppend(const Slice& /* data */, uint64_t /* offset */) {
601 return Status::NotSupported();
602 }
603
604 // Truncate is necessary to trim the file to the correct size
605 // before closing. It is not always possible to keep track of the file
606 // size due to whole pages writes. The behavior is undefined if called
607 // with other writes to follow.
608 virtual Status Truncate(uint64_t size) {
609 return Status::OK();
610 }
611 virtual Status Close() = 0;
612 virtual Status Flush() = 0;
613 virtual Status Sync() = 0; // sync data
614
615 /*
616 * Sync data and/or metadata as well.
617 * By default, sync only data.
618 * Override this method for environments where we need to sync
619 * metadata as well.
620 */
621 virtual Status Fsync() {
622 return Sync();
623 }
624
625 // true if Sync() and Fsync() are safe to call concurrently with Append()
626 // and Flush().
627 virtual bool IsSyncThreadSafe() const {
628 return false;
629 }
630
631 // Indicates the upper layers if the current WritableFile implementation
632 // uses direct IO.
633 virtual bool use_direct_io() const { return false; }
634
635 // Use the returned alignment value to allocate
636 // aligned buffer for Direct I/O
637 virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
638 /*
639 * Change the priority in rate limiter if rate limiting is enabled.
640 * If rate limiting is not enabled, this call has no effect.
641 */
642 virtual void SetIOPriority(Env::IOPriority pri) {
643 io_priority_ = pri;
644 }
645
646 virtual Env::IOPriority GetIOPriority() { return io_priority_; }
647
648 /*
649 * Get the size of valid data in the file.
650 */
651 virtual uint64_t GetFileSize() {
652 return 0;
653 }
654
655 /*
656 * Get and set the default pre-allocation block size for writes to
657 * this file. If non-zero, then Allocate will be used to extend the
658 * underlying storage of a file (generally via fallocate) if the Env
659 * instance supports it.
660 */
661 virtual void SetPreallocationBlockSize(size_t size) {
662 preallocation_block_size_ = size;
663 }
664
665 virtual void GetPreallocationStatus(size_t* block_size,
666 size_t* last_allocated_block) {
667 *last_allocated_block = last_preallocated_block_;
668 *block_size = preallocation_block_size_;
669 }
670
671 // For documentation, refer to RandomAccessFile::GetUniqueId()
672 virtual size_t GetUniqueId(char* id, size_t max_size) const {
673 return 0; // Default implementation to prevent issues with backwards
674 }
675
676 // Remove any kind of caching of data from the offset to offset+length
677 // of this file. If the length is 0, then it refers to the end of file.
678 // If the system is not caching the file contents, then this is a noop.
679 // This call has no effect on dirty pages in the cache.
680 virtual Status InvalidateCache(size_t offset, size_t length) {
681 return Status::NotSupported("InvalidateCache not supported.");
682 }
683
684 // Sync a file range with disk.
685 // offset is the starting byte of the file range to be synchronized.
686 // nbytes specifies the length of the range to be synchronized.
687 // This asks the OS to initiate flushing the cached data to disk,
688 // without waiting for completion.
689 // Default implementation does nothing.
690 virtual Status RangeSync(uint64_t offset, uint64_t nbytes) { return Status::OK(); }
691
692 // PrepareWrite performs any necessary preparation for a write
693 // before the write actually occurs. This allows for pre-allocation
694 // of space on devices where it can result in less file
695 // fragmentation and/or less waste from over-zealous filesystem
696 // pre-allocation.
697 virtual void PrepareWrite(size_t offset, size_t len) {
698 if (preallocation_block_size_ == 0) {
699 return;
700 }
701 // If this write would cross one or more preallocation blocks,
702 // determine what the last preallocation block necesessary to
703 // cover this write would be and Allocate to that point.
704 const auto block_size = preallocation_block_size_;
705 size_t new_last_preallocated_block =
706 (offset + len + block_size - 1) / block_size;
707 if (new_last_preallocated_block > last_preallocated_block_) {
708 size_t num_spanned_blocks =
709 new_last_preallocated_block - last_preallocated_block_;
710 Allocate(block_size * last_preallocated_block_,
711 block_size * num_spanned_blocks);
712 last_preallocated_block_ = new_last_preallocated_block;
713 }
714 }
715
716 protected:
717 /*
718 * Pre-allocate space for a file.
719 */
720 virtual Status Allocate(uint64_t offset, uint64_t len) {
721 return Status::OK();
722 }
723
724 size_t preallocation_block_size() { return preallocation_block_size_; }
725
726 private:
727 size_t last_preallocated_block_;
728 size_t preallocation_block_size_;
729 // No copying allowed
730 WritableFile(const WritableFile&);
731 void operator=(const WritableFile&);
732
733 protected:
734 friend class WritableFileWrapper;
735 friend class WritableFileMirror;
736
737 Env::IOPriority io_priority_;
738 };
739
740 // A file abstraction for random reading and writing.
741 class RandomRWFile {
742 public:
743 RandomRWFile() {}
744 virtual ~RandomRWFile() {}
745
746 // Indicates if the class makes use of direct I/O
747 // If false you must pass aligned buffer to Write()
748 virtual bool use_direct_io() const { return false; }
749
750 // Use the returned alignment value to allocate
751 // aligned buffer for Direct I/O
752 virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
753
754 // Used by the file_reader_writer to decide if the ReadAhead wrapper
755 // should simply forward the call and do not enact read_ahead buffering or locking.
756 // The implementation below takes care of reading ahead
757 virtual bool ShouldForwardRawRequest() const {
758 return false;
759 }
760
761 // For cases when read-ahead is implemented in the platform dependent
762 // layer. This is when ShouldForwardRawRequest() returns true.
763 virtual void EnableReadAhead() {}
764
765 // Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
766 // Pass aligned buffer when use_direct_io() returns true.
767 virtual Status Write(uint64_t offset, const Slice& data) = 0;
768
769 // Read up to `n` bytes starting from offset `offset` and store them in
770 // result, provided `scratch` size should be at least `n`.
771 // Returns Status::OK() on success.
772 virtual Status Read(uint64_t offset, size_t n, Slice* result,
773 char* scratch) const = 0;
774
775 virtual Status Flush() = 0;
776
777 virtual Status Sync() = 0;
778
779 virtual Status Fsync() { return Sync(); }
780
781 virtual Status Close() = 0;
782
783 // No copying allowed
784 RandomRWFile(const RandomRWFile&) = delete;
785 RandomRWFile& operator=(const RandomRWFile&) = delete;
786 };
787
788 // Directory object represents collection of files and implements
789 // filesystem operations that can be executed on directories.
790 class Directory {
791 public:
792 virtual ~Directory() {}
793 // Fsync directory. Can be called concurrently from multiple threads.
794 virtual Status Fsync() = 0;
795 };
796
// Log levels understood by Logger. The numeric values are spelled out
// explicitly; NUM_INFO_LOG_LEVELS is not a level itself but the number of
// levels declared before it.
enum InfoLogLevel : unsigned char {
  DEBUG_LEVEL = 0,
  INFO_LEVEL = 1,
  WARN_LEVEL = 2,
  ERROR_LEVEL = 3,
  FATAL_LEVEL = 4,
  HEADER_LEVEL = 5,
  NUM_INFO_LOG_LEVELS = 6,
};
806
807 // An interface for writing log messages.
808 class Logger {
809 public:
810 size_t kDoNotSupportGetLogFileSize = std::numeric_limits<size_t>::max();
811
812 explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
813 : log_level_(log_level) {}
814 virtual ~Logger();
815
816 // Write a header to the log file with the specified format
817 // It is recommended that you log all header information at the start of the
818 // application. But it is not enforced.
819 virtual void LogHeader(const char* format, va_list ap) {
820 // Default implementation does a simple INFO level log write.
821 // Please override as per the logger class requirement.
822 Logv(format, ap);
823 }
824
825 // Write an entry to the log file with the specified format.
826 virtual void Logv(const char* format, va_list ap) = 0;
827
828 // Write an entry to the log file with the specified log level
829 // and format. Any log with level under the internal log level
830 // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
831 // printed.
832 virtual void Logv(const InfoLogLevel log_level, const char* format, va_list ap);
833
834 virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; }
835 // Flush to the OS buffers
836 virtual void Flush() {}
837 virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
838 virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
839 log_level_ = log_level;
840 }
841
842 private:
843 // No copying allowed
844 Logger(const Logger&);
845 void operator=(const Logger&);
846 InfoLogLevel log_level_;
847 };
848
849
// Identifies a locked file.
class FileLock {
 public:
  FileLock() {}
  virtual ~FileLock();

  // No copying allowed
  FileLock(const FileLock&) = delete;
  FileLock& operator=(const FileLock&) = delete;
};
860
861 extern void LogFlush(const shared_ptr<Logger>& info_log);
862
863 extern void Log(const InfoLogLevel log_level,
864 const shared_ptr<Logger>& info_log, const char* format, ...);
865
866 // a set of log functions with different log levels.
867 extern void Header(const shared_ptr<Logger>& info_log, const char* format, ...);
868 extern void Debug(const shared_ptr<Logger>& info_log, const char* format, ...);
869 extern void Info(const shared_ptr<Logger>& info_log, const char* format, ...);
870 extern void Warn(const shared_ptr<Logger>& info_log, const char* format, ...);
871 extern void Error(const shared_ptr<Logger>& info_log, const char* format, ...);
872 extern void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...);
873
874 // Log the specified data to *info_log if info_log is non-nullptr.
875 // The default info log level is InfoLogLevel::INFO_LEVEL.
876 extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
877 # if defined(__GNUC__) || defined(__clang__)
878 __attribute__((__format__ (__printf__, 2, 3)))
879 # endif
880 ;
881
882 extern void LogFlush(Logger *info_log);
883
884 extern void Log(const InfoLogLevel log_level, Logger* info_log,
885 const char* format, ...);
886
887 // The default info log level is InfoLogLevel::INFO_LEVEL.
888 extern void Log(Logger* info_log, const char* format, ...)
889 # if defined(__GNUC__) || defined(__clang__)
890 __attribute__((__format__ (__printf__, 2, 3)))
891 # endif
892 ;
893
894 // a set of log functions with different log levels.
895 extern void Header(Logger* info_log, const char* format, ...);
896 extern void Debug(Logger* info_log, const char* format, ...);
897 extern void Info(Logger* info_log, const char* format, ...);
898 extern void Warn(Logger* info_log, const char* format, ...);
899 extern void Error(Logger* info_log, const char* format, ...);
900 extern void Fatal(Logger* info_log, const char* format, ...);
901
902 // A utility routine: write "data" to the named file.
903 extern Status WriteStringToFile(Env* env, const Slice& data,
904 const std::string& fname,
905 bool should_sync = false);
906
907 // A utility routine: read contents of named file into *data
908 extern Status ReadFileToString(Env* env, const std::string& fname,
909 std::string* data);
910
911 // An implementation of Env that forwards all calls to another Env.
912 // May be useful to clients who wish to override just part of the
913 // functionality of another Env.
914 class EnvWrapper : public Env {
915 public:
916 // Initialize an EnvWrapper that delegates all calls to *t
917 explicit EnvWrapper(Env* t) : target_(t) { }
918 virtual ~EnvWrapper();
919
920 // Return the target to which this Env forwards all calls
921 Env* target() const { return target_; }
922
923 // The following text is boilerplate that forwards all methods to target()
924 Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
925 const EnvOptions& options) override {
926 return target_->NewSequentialFile(f, r, options);
927 }
928 Status NewRandomAccessFile(const std::string& f,
929 unique_ptr<RandomAccessFile>* r,
930 const EnvOptions& options) override {
931 return target_->NewRandomAccessFile(f, r, options);
932 }
933 Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
934 const EnvOptions& options) override {
935 return target_->NewWritableFile(f, r, options);
936 }
937 Status ReopenWritableFile(const std::string& fname,
938 unique_ptr<WritableFile>* result,
939 const EnvOptions& options) override {
940 return target_->ReopenWritableFile(fname, result, options);
941 }
942 Status ReuseWritableFile(const std::string& fname,
943 const std::string& old_fname,
944 unique_ptr<WritableFile>* r,
945 const EnvOptions& options) override {
946 return target_->ReuseWritableFile(fname, old_fname, r, options);
947 }
948 Status NewRandomRWFile(const std::string& fname,
949 unique_ptr<RandomRWFile>* result,
950 const EnvOptions& options) override {
951 return target_->NewRandomRWFile(fname, result, options);
952 }
953 virtual Status NewDirectory(const std::string& name,
954 unique_ptr<Directory>* result) override {
955 return target_->NewDirectory(name, result);
956 }
957 Status FileExists(const std::string& f) override {
958 return target_->FileExists(f);
959 }
960 Status GetChildren(const std::string& dir,
961 std::vector<std::string>* r) override {
962 return target_->GetChildren(dir, r);
963 }
964 Status GetChildrenFileAttributes(
965 const std::string& dir, std::vector<FileAttributes>* result) override {
966 return target_->GetChildrenFileAttributes(dir, result);
967 }
968 Status DeleteFile(const std::string& f) override {
969 return target_->DeleteFile(f);
970 }
971 Status CreateDir(const std::string& d) override {
972 return target_->CreateDir(d);
973 }
974 Status CreateDirIfMissing(const std::string& d) override {
975 return target_->CreateDirIfMissing(d);
976 }
977 Status DeleteDir(const std::string& d) override {
978 return target_->DeleteDir(d);
979 }
980 Status GetFileSize(const std::string& f, uint64_t* s) override {
981 return target_->GetFileSize(f, s);
982 }
983
984 Status GetFileModificationTime(const std::string& fname,
985 uint64_t* file_mtime) override {
986 return target_->GetFileModificationTime(fname, file_mtime);
987 }
988
989 Status RenameFile(const std::string& s, const std::string& t) override {
990 return target_->RenameFile(s, t);
991 }
992
993 Status LinkFile(const std::string& s, const std::string& t) override {
994 return target_->LinkFile(s, t);
995 }
996
997 Status LockFile(const std::string& f, FileLock** l) override {
998 return target_->LockFile(f, l);
999 }
1000
1001 Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); }
1002
1003 void Schedule(void (*f)(void* arg), void* a, Priority pri,
1004 void* tag = nullptr, void (*u)(void* arg) = 0) override {
1005 return target_->Schedule(f, a, pri, tag, u);
1006 }
1007
1008 int UnSchedule(void* tag, Priority pri) override {
1009 return target_->UnSchedule(tag, pri);
1010 }
1011
1012 void StartThread(void (*f)(void*), void* a) override {
1013 return target_->StartThread(f, a);
1014 }
1015 void WaitForJoin() override { return target_->WaitForJoin(); }
1016 virtual unsigned int GetThreadPoolQueueLen(
1017 Priority pri = LOW) const override {
1018 return target_->GetThreadPoolQueueLen(pri);
1019 }
1020 virtual Status GetTestDirectory(std::string* path) override {
1021 return target_->GetTestDirectory(path);
1022 }
1023 virtual Status NewLogger(const std::string& fname,
1024 shared_ptr<Logger>* result) override {
1025 return target_->NewLogger(fname, result);
1026 }
1027 uint64_t NowMicros() override { return target_->NowMicros(); }
1028
1029 void SleepForMicroseconds(int micros) override {
1030 target_->SleepForMicroseconds(micros);
1031 }
1032 Status GetHostName(char* name, uint64_t len) override {
1033 return target_->GetHostName(name, len);
1034 }
1035 Status GetCurrentTime(int64_t* unix_time) override {
1036 return target_->GetCurrentTime(unix_time);
1037 }
1038 Status GetAbsolutePath(const std::string& db_path,
1039 std::string* output_path) override {
1040 return target_->GetAbsolutePath(db_path, output_path);
1041 }
1042 void SetBackgroundThreads(int num, Priority pri) override {
1043 return target_->SetBackgroundThreads(num, pri);
1044 }
1045
1046 void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
1047 return target_->IncBackgroundThreadsIfNeeded(num, pri);
1048 }
1049
1050 void LowerThreadPoolIOPriority(Priority pool = LOW) override {
1051 target_->LowerThreadPoolIOPriority(pool);
1052 }
1053
1054 std::string TimeToString(uint64_t time) override {
1055 return target_->TimeToString(time);
1056 }
1057
1058 Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
1059 return target_->GetThreadList(thread_list);
1060 }
1061
1062 ThreadStatusUpdater* GetThreadStatusUpdater() const override {
1063 return target_->GetThreadStatusUpdater();
1064 }
1065
1066 uint64_t GetThreadID() const override {
1067 return target_->GetThreadID();
1068 }
1069
1070 private:
1071 Env* target_;
1072 };
1073
1074 // An implementation of WritableFile that forwards all calls to another
1075 // WritableFile. May be useful to clients who wish to override just part of the
1076 // functionality of another WritableFile.
1077 // It's declared as friend of WritableFile to allow forwarding calls to
1078 // protected virtual methods.
1079 class WritableFileWrapper : public WritableFile {
1080 public:
1081 explicit WritableFileWrapper(WritableFile* t) : target_(t) { }
1082
1083 Status Append(const Slice& data) override { return target_->Append(data); }
1084 Status PositionedAppend(const Slice& data, uint64_t offset) override {
1085 return target_->PositionedAppend(data, offset);
1086 }
1087 Status Truncate(uint64_t size) override { return target_->Truncate(size); }
1088 Status Close() override { return target_->Close(); }
1089 Status Flush() override { return target_->Flush(); }
1090 Status Sync() override { return target_->Sync(); }
1091 Status Fsync() override { return target_->Fsync(); }
1092 bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
1093 void SetIOPriority(Env::IOPriority pri) override {
1094 target_->SetIOPriority(pri);
1095 }
1096 Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); }
1097 uint64_t GetFileSize() override { return target_->GetFileSize(); }
1098 void GetPreallocationStatus(size_t* block_size,
1099 size_t* last_allocated_block) override {
1100 target_->GetPreallocationStatus(block_size, last_allocated_block);
1101 }
1102 size_t GetUniqueId(char* id, size_t max_size) const override {
1103 return target_->GetUniqueId(id, max_size);
1104 }
1105 Status InvalidateCache(size_t offset, size_t length) override {
1106 return target_->InvalidateCache(offset, length);
1107 }
1108
1109 virtual void SetPreallocationBlockSize(size_t size) override {
1110 target_->SetPreallocationBlockSize(size);
1111 }
1112 virtual void PrepareWrite(size_t offset, size_t len) override {
1113 target_->PrepareWrite(offset, len);
1114 }
1115
1116 protected:
1117 Status Allocate(uint64_t offset, uint64_t len) override {
1118 return target_->Allocate(offset, len);
1119 }
1120 Status RangeSync(uint64_t offset, uint64_t nbytes) override {
1121 return target_->RangeSync(offset, nbytes);
1122 }
1123
1124 private:
1125 WritableFile* target_;
1126 };
1127
1128 // Returns a new environment that stores its data in memory and delegates
1129 // all non-file-storage tasks to base_env. The caller must delete the result
1130 // when it is no longer needed.
1131 // *base_env must remain live while the result is in use.
1132 Env* NewMemEnv(Env* base_env);
1133
1134 // Returns a new environment that is used for HDFS environment.
1135 // This is a factory method for HdfsEnv declared in hdfs/env_hdfs.h
1136 Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname);
1137
1138 // Returns a new environment that measures function call times for filesystem
1139 // operations, reporting results to variables in PerfContext.
1140 // This is a factory method for TimedEnv defined in utilities/env_timed.cc.
1141 Env* NewTimedEnv(Env* base_env);
1142
1143 } // namespace rocksdb
1144
1145 #endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_